diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml new file mode 100644 index 000000000..bae5188fa --- /dev/null +++ b/.github/workflows/integration-tests.yml @@ -0,0 +1,80 @@ +name: Integration tests + +on: + pull_request: + push: + branches: [main] + +jobs: + ollama: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + python-version: "3.10" + + - name: Install Ollama + run: | + curl -fsSL https://ollama.com/install.sh | sh + + - name: Pull Ollama model + run: | + ollama pull llama3.2:3b-instruct-fp16 + + - name: Start Ollama in background + run: | + nohup ollama run llama3.2:3b-instruct-fp16 > ollama.log 2>&1 & + + - name: Set Up Environment and Install Dependencies + run: | + uv sync --extra dev --extra test + uv pip install ollama faiss-cpu + uv pip install -e . + + - name: Wait for Ollama to start + run: | + echo "Waiting for Ollama..." + for i in {1..30}; do + if curl -s http://localhost:11434 | grep -q "Ollama is running"; then + echo "Ollama is running!" + exit 0 + fi + sleep 1 + done + echo "Ollama failed to start" + ollama ps + cat ollama.log + exit 1 + + - name: Start Llama Stack server in background + env: + INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + run: | + source .venv/bin/activate + # TODO: use "llama stack run" + nohup uv run python -m llama_stack.distribution.server.server --yaml-config ./llama_stack/templates/ollama/run.yaml > server.log 2>&1 & + + - name: Wait for Llama Stack server to be ready + run: | + echo "Waiting for Llama Stack server..." + for i in {1..30}; do + if curl -s http://localhost:8321/v1/health | grep -q "OK"; then + echo " Llama Stack server is up!" + exit 0 + fi + sleep 1 + done + echo " Llama Stack server failed to start" + cat server.log + exit 1 + + - name: Run Inference Integration Tests + env: + INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + run: | + uv run pytest -v tests/integration/inference --stack-config=ollama --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2 diff --git a/.github/workflows/providers-build.yml b/.github/workflows/providers-build.yml new file mode 100644 index 000000000..be4298a98 --- /dev/null +++ b/.github/workflows/providers-build.yml @@ -0,0 +1,76 @@ +name: Test Llama Stack Build + +on: + push: + branches: + - main + paths: + - 'llama_stack/cli/stack/build.py' + - 'llama_stack/cli/stack/_build.py' + - 'llama_stack/distribution/build.*' + - 'llama_stack/distribution/*.sh' + - '.github/workflows/providers-build.yml' + pull_request: + paths: + - 'llama_stack/cli/stack/build.py' + - 'llama_stack/cli/stack/_build.py' + - 'llama_stack/distribution/build.*' + - 'llama_stack/distribution/*.sh' + - '.github/workflows/providers-build.yml' + +jobs: + generate-matrix: + runs-on: ubuntu-latest + outputs: + templates: ${{ steps.set-matrix.outputs.templates }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Generate Template List + id: set-matrix + run: | + templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]') + echo "templates=$templates" >> "$GITHUB_OUTPUT" + + build: + needs: generate-matrix + runs-on: ubuntu-latest + strategy: + matrix: + template: ${{ fromJson(needs.generate-matrix.outputs.templates) }} + image-type: [venv, container] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses:
actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + python-version: "3.10" + + - name: Install LlamaStack + run: | + uv venv + source .venv/bin/activate + uv pip install -e . + + - name: Print build dependencies + run: | + uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only + + - name: Run Llama Stack Build + run: | + uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test + + - name: Print dependencies in the image + if: matrix.image-type == 'venv' + run: | + source test/bin/activate + uv pip list diff --git a/.github/workflows/stale_bot.yml b/.github/workflows/stale_bot.yml new file mode 100644 index 000000000..2039fcbb4 --- /dev/null +++ b/.github/workflows/stale_bot.yml @@ -0,0 +1,45 @@ +name: Close stale issues and PRs + +on: + schedule: + - cron: '0 0 * * *' # every day at midnight + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + stale: + permissions: + issues: write + pull-requests: write + runs-on: ubuntu-latest + steps: + - name: Stale Action + uses: actions/stale@v9 + with: + stale-issue-label: 'stale' + stale-issue-message: > + This issue has been automatically marked as stale because it has not had activity within 60 days. + It will be automatically closed if no further activity occurs within 30 days. + close-issue-message: > + This issue has been automatically closed due to inactivity. + Please feel free to reopen if you feel it is still relevant! + days-before-issue-stale: 60 + days-before-issue-close: 30 + stale-pr-label: 'stale' + stale-pr-message: > + This pull request has been automatically marked as stale because it has not had activity within 60 days. + It will be automatically closed if no further activity occurs within 30 days. + close-pr-message: > + This pull request has been automatically closed due to inactivity. + Please feel free to reopen if you intend to continue working on it! + days-before-pr-stale: 60 + days-before-pr-close: 30 + operations-per-run: 300 diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 517b5c39a..c7a30e9b8 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -33,7 +33,7 @@ jobs: - name: Run unit tests run: | - uv run --python ${{ matrix.python }} --with-editable . 
--with-editable ".[dev]" --with-editable ".[unit]" pytest --cov=llama_stack -s -v tests/unit/ --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }} + PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --cov=llama_stack --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }} - name: Upload test results if: always() diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 926ae21cc..072fa21e2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,6 +8,7 @@ repos: rev: v5.0.0 # Latest stable version hooks: - id: check-merge-conflict + args: ['--assume-in-merge'] - id: trailing-whitespace exclude: '\.py$' # Exclude Python files as Ruff already handles them - id: check-added-large-files @@ -82,6 +83,17 @@ repos: require_serial: true files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$ +- repo: local + hooks: + - id: openapi-codegen + name: API Spec Codegen + additional_dependencies: + - uv==0.6.2 + entry: sh -c 'uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh > /dev/null 2>&1' + language: python + pass_filenames: false + require_serial: true + ci: autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 71e610064..1f188f259 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -108,6 +108,22 @@ uv run pre-commit run --all-files > [!CAUTION] > Before pushing your changes, make sure that the pre-commit hooks have passed successfully. +## Running unit tests + +You can run the unit tests by running: + +```bash +source .venv/bin/activate +./scripts/unit-tests.sh +``` + +If you'd like to run the tests against a non-default version of Python (the default is currently 3.10), pass the `PYTHON_VERSION` environment variable as follows: + +```bash +source .venv/bin/activate +PYTHON_VERSION=3.13 ./scripts/unit-tests.sh +``` + +## Adding a new dependency to the project + +To add a new dependency to the project, you can use the `uv` command.
For example, to add `foo` to the project, you can run: diff --git a/README.md b/README.md index 6e1fd088e..aade9c15f 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,10 @@ Here is a list of the various API providers and available distributions that can | PG Vector | Single Node | | | ✅ | | | | PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | | vLLM | Hosted and Single Node | | ✅ | | | | +| OpenAI | Hosted | | ✅ | | | | +| Anthropic | Hosted | | ✅ | | | | +| Gemini | Hosted | | ✅ | | | | + ### Distributions diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 63f94654a..33b497a33 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -30,6 +30,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn" ], "cerebras": [ @@ -62,6 +63,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -97,6 +99,7 @@ "sqlite-vec", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -132,6 +135,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -168,6 +172,7 @@ "sqlite-vec", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -203,6 +208,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -236,6 +242,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn" ], "hf-endpoint": [ @@ -270,6 +277,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn" ], "hf-serverless": [ @@ -304,6 +312,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -344,6 +353,7 @@ "torchvision", "tqdm", "transformers", + "tree_sitter", "uvicorn", "zmq" ], @@ -385,6 +395,7 @@ "torchvision", "tqdm", "transformers", + "tree_sitter", "uvicorn", "zmq" ], @@ -414,6 +425,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn" ], "ollama": [ @@ -448,6 +460,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn" ], "open-benchmark": [ @@ -482,8 +495,44 @@ "together", "tqdm", "transformers", + "tree_sitter", "uvicorn" ], + "passthrough": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "mcp", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pymongo", + "pypdf", + "redis", + "requests", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "tree_sitter", + "uvicorn", + "sentence-transformers --no-deps", + "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + ], "remote-vllm": [ "aiosqlite", "autoevals", @@ -514,6 +563,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -579,6 +629,7 @@ "sentencepiece", "tqdm", 
"transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -614,6 +665,7 @@ "together", "tqdm", "transformers", + "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -648,6 +700,7 @@ "sentencepiece", "tqdm", "transformers", + "tree_sitter", "uvicorn", "vllm", "sentence-transformers --no-deps", diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index 22fa781ac..b5e4097d9 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -2151,6 +2151,48 @@ } } }, + "/v1/providers/{provider_id}": { + "get": { + "responses": { + "200": { + "description": "OK", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ProviderInfo" + } + } + } + }, + "400": { + "$ref": "#/components/responses/BadRequest400" + }, + "429": { + "$ref": "#/components/responses/TooManyRequests429" + }, + "500": { + "$ref": "#/components/responses/InternalServerError500" + }, + "default": { + "$ref": "#/components/responses/DefaultError" + } + }, + "tags": [ + "Providers" + ], + "description": "", + "parameters": [ + { + "name": "provider_id", + "in": "path", + "required": true, + "schema": { + "type": "string" + } + } + ] + } + }, "/v1/tool-runtime/invoke": { "post": { "responses": { @@ -2642,7 +2684,7 @@ } } }, - "/v1/inspect/providers": { + "/v1/providers": { "get": { "responses": { "200": { @@ -7912,6 +7954,53 @@ ], "title": "InsertChunksRequest" }, + "ProviderInfo": { + "type": "object", + "properties": { + "api": { + "type": "string" + }, + "provider_id": { + "type": "string" + }, + "provider_type": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + } + ] + } + } + }, + "additionalProperties": false, + "required": [ + "api", + "provider_id", + "provider_type", + "config" + ], + "title": "ProviderInfo" + }, "InvokeToolRequest": { "type": "object", "properties": { @@ -8124,27 +8213,6 @@ ], "title": "ListModelsResponse" }, - "ProviderInfo": { - "type": "object", - "properties": { - "api": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, - "provider_type": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "api", - "provider_id", - "provider_type" - ], - "title": "ProviderInfo" - }, "ListProvidersResponse": { "type": "object", "properties": { @@ -10145,6 +10213,10 @@ { "name": "PostTraining (Coming Soon)" }, + { + "name": "Providers", + "x-displayName": "Providers API for inspecting, listing, and modifying providers and their configurations." 
+ }, { "name": "Safety" }, @@ -10191,6 +10263,7 @@ "Inspect", "Models", "PostTraining (Coming Soon)", + "Providers", "Safety", "Scoring", "ScoringFunctions", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index 1f01351e9..bf2343ede 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -1444,6 +1444,34 @@ paths: schema: $ref: '#/components/schemas/InsertChunksRequest' required: true + /v1/providers/{provider_id}: + get: + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ProviderInfo' + '400': + $ref: '#/components/responses/BadRequest400' + '429': + $ref: >- + #/components/responses/TooManyRequests429 + '500': + $ref: >- + #/components/responses/InternalServerError500 + default: + $ref: '#/components/responses/DefaultError' + tags: + - Providers + description: '' + parameters: + - name: provider_id + in: path + required: true + schema: + type: string /v1/tool-runtime/invoke: post: responses: @@ -1782,7 +1810,7 @@ paths: schema: $ref: '#/components/schemas/RegisterModelRequest' required: true - /v1/inspect/providers: + /v1/providers: get: responses: '200': @@ -5409,6 +5437,32 @@ components: - vector_db_id - chunks title: InsertChunksRequest + ProviderInfo: + type: object + properties: + api: + type: string + provider_id: + type: string + provider_type: + type: string + config: + type: object + additionalProperties: + oneOf: + - type: 'null' + - type: boolean + - type: number + - type: string + - type: array + - type: object + additionalProperties: false + required: + - api + - provider_id + - provider_type + - config + title: ProviderInfo InvokeToolRequest: type: object properties: @@ -5544,21 +5598,6 @@ components: required: - data title: ListModelsResponse - ProviderInfo: - type: object - properties: - api: - type: string - provider_id: - type: string - provider_type: - type: string - additionalProperties: false - required: - - api - - provider_id - - provider_type - title: ProviderInfo ListProvidersResponse: type: object properties: @@ -6832,6 +6871,9 @@ tags: - name: Inspect - name: Models - name: PostTraining (Coming Soon) + - name: Providers + x-displayName: >- + Providers API for inspecting, listing, and modifying providers and their configurations. - name: Safety - name: Scoring - name: ScoringFunctions @@ -6856,6 +6898,7 @@ x-tagGroups: - Inspect - Models - PostTraining (Coming Soon) + - Providers - Safety - Scoring - ScoringFunctions diff --git a/docs/source/distributions/self_hosted_distro/passthrough.md b/docs/source/distributions/self_hosted_distro/passthrough.md new file mode 100644 index 000000000..558d7ca08 --- /dev/null +++ b/docs/source/distributions/self_hosted_distro/passthrough.md @@ -0,0 +1,42 @@ +--- +orphan: true +--- + +# Passthrough Distribution + +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + +The `llamastack/distribution-passthrough` distribution consists of the following provider configurations. 
+ +| API | Provider(s) | +|-----|-------------| +| agents | `inline::meta-reference` | +| datasetio | `remote::huggingface`, `inline::localfs` | +| eval | `inline::meta-reference` | +| inference | `remote::passthrough`, `inline::sentence-transformers` | +| safety | `inline::llama-guard` | +| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | +| telemetry | `inline::meta-reference` | +| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | +| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | + + +### Environment Variables + +The following environment variables can be configured: + +- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`) +- `PASSTHROUGH_API_KEY`: Passthrough API Key (default: ``) +- `PASSTHROUGH_URL`: Passthrough URL (default: ``) + +### Models + +The following models are available by default: + +- `llama3.1-8b-instruct ` +- `llama3.2-11b-vision-instruct ` diff --git a/docs/source/index.md b/docs/source/index.md index 0d0508466..af381f9c9 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -61,6 +61,10 @@ A number of "adapters" are available for some popular Inference and Vector Store | Groq | Hosted | | SambaNova | Hosted | | PyTorch ExecuTorch | On-device iOS, Android | +| OpenAI | Hosted | +| Anthropic | Hosted | +| Gemini | Hosted | + **Vector IO API** | **Provider** | **Environments** | diff --git a/llama_stack/apis/datatypes.py b/llama_stack/apis/datatypes.py index 842a2b63d..f644e5137 100644 --- a/llama_stack/apis/datatypes.py +++ b/llama_stack/apis/datatypes.py @@ -14,6 +14,7 @@ from llama_stack.schema_utils import json_schema_type @json_schema_type class Api(Enum): + providers = "providers" inference = "inference" safety = "safety" agents = "agents" diff --git a/llama_stack/apis/inspect/inspect.py b/llama_stack/apis/inspect/inspect.py index 4a647a2d9..25937bb61 100644 --- a/llama_stack/apis/inspect/inspect.py +++ b/llama_stack/apis/inspect/inspect.py @@ -11,13 +11,6 @@ from pydantic import BaseModel from llama_stack.schema_utils import json_schema_type, webmethod -@json_schema_type -class ProviderInfo(BaseModel): - api: str - provider_id: str - provider_type: str - - @json_schema_type class RouteInfo(BaseModel): route: str @@ -32,14 +25,21 @@ class HealthInfo(BaseModel): @json_schema_type -class VersionInfo(BaseModel): - version: str +class ProviderInfo(BaseModel): + api: str + provider_id: str + provider_type: str class ListProvidersResponse(BaseModel): data: List[ProviderInfo] +@json_schema_type +class VersionInfo(BaseModel): + version: str + + class ListRoutesResponse(BaseModel): data: List[RouteInfo] diff --git a/llama_stack/apis/providers/__init__.py b/llama_stack/apis/providers/__init__.py new file mode 100644 index 000000000..b554a5d23 --- /dev/null +++ b/llama_stack/apis/providers/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .providers import * # noqa: F401 F403 diff --git a/llama_stack/apis/providers/providers.py b/llama_stack/apis/providers/providers.py new file mode 100644 index 000000000..83d03d7c1 --- /dev/null +++ b/llama_stack/apis/providers/providers.py @@ -0,0 +1,36 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import Any, Dict, List, Protocol, runtime_checkable + +from pydantic import BaseModel + +from llama_stack.schema_utils import json_schema_type, webmethod + + +@json_schema_type +class ProviderInfo(BaseModel): + api: str + provider_id: str + provider_type: str + config: Dict[str, Any] + + +class ListProvidersResponse(BaseModel): + data: List[ProviderInfo] + + +@runtime_checkable +class Providers(Protocol): + """ + Providers API for inspecting, listing, and modifying providers and their configurations. + """ + + @webmethod(route="/providers", method="GET") + async def list_providers(self) -> ListProvidersResponse: ... + + @webmethod(route="/providers/{provider_id}", method="GET") + async def inspect_provider(self, provider_id: str) -> ProviderInfo: ... diff --git a/llama_stack/cli/download.py b/llama_stack/cli/download.py index b43d50217..f1b722183 100644 --- a/llama_stack/cli/download.py +++ b/llama_stack/cli/download.py @@ -10,7 +10,7 @@ import json import os import shutil from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone from functools import partial from pathlib import Path from typing import Dict, List, Optional @@ -404,7 +404,7 @@ def _download_from_manifest(manifest_file: str, max_concurrent_downloads: int): d = json.load(f) manifest = Manifest(**d) - if datetime.now() > manifest.expires_on: + if datetime.now(timezone.utc) > manifest.expires_on: raise ValueError(f"Manifest URLs have expired on {manifest.expires_on}") console = Console() diff --git a/llama_stack/cli/model/prompt_format.py b/llama_stack/cli/model/prompt_format.py index 8058db461..e2817b790 100644 --- a/llama_stack/cli/model/prompt_format.py +++ b/llama_stack/cli/model/prompt_format.py @@ -41,8 +41,14 @@ class ModelPromptFormat(Subcommand): "-m", "--model-name", type=str, - default="llama3_1", - help="Model Family (llama3_1, llama3_X, etc.)", + help="Example: Llama3.1-8B or Llama3.2-11B-Vision, etc\n" + "(Run `llama model list` to see a list of valid model names)", + ) + self.parser.add_argument( + "-l", + "--list", + action="store_true", + help="List all available models", ) self.parser.add_argument( "-l", @@ -60,7 +66,6 @@ class ModelPromptFormat(Subcommand): ] model_list = [m.value for m in supported_model_ids] - model_str = "\n".join(model_list) if args.list: headers = ["Model(s)"] @@ -81,10 +86,16 @@ class ModelPromptFormat(Subcommand): try: model_id = CoreModelId(args.model_name) except ValueError: - self.parser.error(f"{args.model_name} is not a valid Model. Choose one from --\n{model_str}") + self.parser.error( + f"{args.model_name} is not a valid Model. Choose one from the list of valid models. " + f"Run `llama model list` to see the valid model names." + ) if model_id not in supported_model_ids: - self.parser.error(f"{model_id} is not a valid Model. Choose one from --\n {model_str}") + self.parser.error( + f"{model_id} is not a valid Model. Choose one from the list of valid models. " + f"Run `llama model list` to see the valid model names." 
+ ) llama_3_1_file = ROOT_DIR / "models" / "llama" / "llama3_1" / "prompt_format.md" llama_3_2_text_file = ROOT_DIR / "models" / "llama" / "llama3_2" / "text_prompt_format.md" diff --git a/llama_stack/distribution/configure.py b/llama_stack/distribution/configure.py index 715bb5db4..2a3bf7053 100644 --- a/llama_stack/distribution/configure.py +++ b/llama_stack/distribution/configure.py @@ -62,7 +62,7 @@ def configure_api_providers(config: StackRunConfig, build_spec: DistributionSpec if config.apis: apis_to_serve = config.apis else: - apis_to_serve = [a.value for a in Api if a not in (Api.telemetry, Api.inspect)] + apis_to_serve = [a.value for a in Api if a not in (Api.telemetry, Api.inspect, Api.providers)] for api_str in apis_to_serve: api = Api(api_str) diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index f62996081..7e1d8c016 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -117,6 +117,14 @@ class Provider(BaseModel): config: Dict[str, Any] +class LoggingConfig(BaseModel): + category_levels: Dict[str, str] = Field( + default_factory=dict, + description=""" + Dictionary of different logging configurations for different portions (ex: core, server) of llama stack""", + ) + + class ServerConfig(BaseModel): port: int = Field( + default=8321, @@ -176,6 +184,8 @@ a default SQLite store will be used.""", benchmarks: List[BenchmarkInput] = Field(default_factory=list) tool_groups: List[ToolGroupInput] = Field(default_factory=list) + logging: Optional[LoggingConfig] = Field(default=None, description="Configuration for Llama Stack Logging") + server: ServerConfig = Field( default_factory=ServerConfig, description="Configuration for the HTTP(S) server", diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py index 308081415..ddb727663 100644 --- a/llama_stack/distribution/distribution.py +++ b/llama_stack/distribution/distribution.py @@ -56,7 +56,7 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]: def providable_apis() -> List[Api]: routing_table_apis = {x.routing_table_api for x in builtin_automatically_routed_apis()} - return [api for api in Api if api not in routing_table_apis and api != Api.inspect] + return [api for api in Api if api not in routing_table_apis and api != Api.inspect and api != Api.providers] def get_provider_registry() -> Dict[Api, Dict[str, ProviderSpec]]: diff --git a/llama_stack/distribution/providers.py b/llama_stack/distribution/providers.py new file mode 100644 index 000000000..fb2476767 --- /dev/null +++ b/llama_stack/distribution/providers.py @@ -0,0 +1,59 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree.
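+# Built-in implementation of the new Providers API: list_providers() reports every provider declared in the run config (api, provider_id, provider_type, and a copy of its config with sensitive fields redacted), and inspect_provider(provider_id) returns the matching entry or raises a ValueError if no such provider is configured.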
+ + +from pydantic import BaseModel + +from llama_stack.apis.providers import ListProvidersResponse, ProviderInfo, Providers + +from .datatypes import StackRunConfig +from .stack import redact_sensitive_fields + + +class ProviderImplConfig(BaseModel): + run_config: StackRunConfig + + +async def get_provider_impl(config, deps): + impl = ProviderImpl(config, deps) + await impl.initialize() + return impl + + +class ProviderImpl(Providers): + def __init__(self, config, deps): + self.config = config + self.deps = deps + + async def initialize(self) -> None: + pass + + async def list_providers(self) -> ListProvidersResponse: + run_config = self.config.run_config + safe_config = StackRunConfig(**redact_sensitive_fields(run_config.model_dump())) + ret = [] + for api, providers in safe_config.providers.items(): + ret.extend( + [ + ProviderInfo( + api=api, + provider_id=p.provider_id, + provider_type=p.provider_type, + config=p.config, + ) + for p in providers + ] + ) + + return ListProvidersResponse(data=ret) + + async def inspect_provider(self, provider_id: str) -> ProviderInfo: + all_providers = await self.list_providers() + for p in all_providers.data: + if p.provider_id == provider_id: + return p + + raise ValueError(f"Provider {provider_id} not found") diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index ab075f399..e9e406699 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -16,6 +16,7 @@ from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models from llama_stack.apis.post_training import PostTraining +from llama_stack.apis.providers import Providers as ProvidersAPI from llama_stack.apis.safety import Safety from llama_stack.apis.scoring import Scoring from llama_stack.apis.scoring_functions import ScoringFunctions @@ -59,6 +60,7 @@ class InvalidProviderError(Exception): def api_protocol_map() -> Dict[Api, Any]: return { + Api.providers: ProvidersAPI, Api.agents: Agents, Api.inference: Inference, Api.inspect: Inspect, @@ -247,6 +249,25 @@ def sort_providers_by_deps( ) ) + sorted_providers.append( + ( + "providers", + ProviderWithSpec( + provider_id="__builtin__", + provider_type="__builtin__", + config={"run_config": run_config.model_dump()}, + spec=InlineProviderSpec( + api=Api.providers, + provider_type="__builtin__", + config_class="llama_stack.distribution.providers.ProviderImplConfig", + module="llama_stack.distribution.providers", + api_dependencies=apis, + deps__=[x.value for x in apis], + ), + ), + ) + ) + logger.debug(f"Resolved {len(sorted_providers)} providers") for api_str, provider in sorted_providers: logger.debug(f" {api_str} => {provider.provider_id}") diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index 7ca009b13..b37b3a007 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -25,7 +25,7 @@ from fastapi.responses import JSONResponse, StreamingResponse from pydantic import BaseModel, ValidationError from typing_extensions import Annotated -from llama_stack.distribution.datatypes import StackRunConfig +from llama_stack.distribution.datatypes import LoggingConfig, StackRunConfig from llama_stack.distribution.distribution import builtin_automatically_routed_apis from llama_stack.distribution.request_headers import ( PROVIDER_DATA_VAR, @@ -306,34 +306,42 @@ def main(): args = parser.parse_args() - if args.env: - for 
env_pair in args.env: - try: - key, value = validate_env_pair(env_pair) - logger.info(f"Setting CLI environment variable {key} => {value}") - os.environ[key] = value - except ValueError as e: - logger.error(f"Error: {str(e)}") - sys.exit(1) - + log_line = "" if args.yaml_config: # if the user provided a config file, use it, even if template was specified config_file = Path(args.yaml_config) if not config_file.exists(): raise ValueError(f"Config file {config_file} does not exist") - logger.info(f"Using config file: {config_file}") + log_line = f"Using config file: {config_file}" elif args.template: config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.template / "run.yaml" if not config_file.exists(): raise ValueError(f"Template {args.template} does not exist") - logger.info(f"Using template {args.template} config file: {config_file}") + log_line = f"Using template {args.template} config file: {config_file}" else: raise ValueError("Either --yaml-config or --template must be provided") + logger_config = None with open(config_file, "r") as fp: - config = replace_env_vars(yaml.safe_load(fp)) + config_contents = yaml.safe_load(fp) + if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")): + logger_config = LoggingConfig(**cfg) + logger = get_logger(name=__name__, category="server", config=logger_config) + if args.env: + for env_pair in args.env: + try: + key, value = validate_env_pair(env_pair) + logger.info(f"Setting CLI environment variable {key} => {value}") + os.environ[key] = value + except ValueError as e: + logger.error(f"Error: {str(e)}") + sys.exit(1) + config = replace_env_vars(config_contents) config = StackRunConfig(**config) + # now that the logger is initialized, print the line about which type of config we are using. 
+ logger.info(log_line) + logger.info("Run configuration:") safe_config = redact_sensitive_fields(config.model_dump()) logger.info(yaml.dump(safe_config, indent=2)) @@ -368,6 +376,7 @@ def main(): apis_to_serve.add(inf.routing_table_api.value) apis_to_serve.add("inspect") + apis_to_serve.add("providers") for api_str in apis_to_serve: api = Api(api_str) diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py index 2b974739a..9c9289a77 100644 --- a/llama_stack/distribution/stack.py +++ b/llama_stack/distribution/stack.py @@ -23,6 +23,7 @@ from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models from llama_stack.apis.post_training import PostTraining +from llama_stack.apis.providers import Providers from llama_stack.apis.safety import Safety from llama_stack.apis.scoring import Scoring from llama_stack.apis.scoring_functions import ScoringFunctions @@ -44,6 +45,7 @@ logger = get_logger(name=__name__, category="core") class LlamaStack( + Providers, VectorDBs, Inference, BatchInference, diff --git a/llama_stack/log.py b/llama_stack/log.py index 572dea234..0ba95d547 100644 --- a/llama_stack/log.py +++ b/llama_stack/log.py @@ -7,13 +7,15 @@ import logging import os from logging.config import dictConfig -from typing import Dict +from typing import Dict, Optional from rich.console import Console from rich.errors import MarkupError from rich.logging import RichHandler from termcolor import cprint +from .distribution.datatypes import LoggingConfig + # Default log level DEFAULT_LOG_LEVEL = logging.INFO @@ -34,6 +36,56 @@ CATEGORIES = [ _category_levels: Dict[str, int] = {category: DEFAULT_LOG_LEVEL for category in CATEGORIES} +def config_to_category_levels(category: str, level: str): + """ + Helper function to be called either by environment parsing or yaml parsing to go from a list of categories and levels to a dictionary ready to be + used by the logger dictConfig. + + Parameters: + category (str): logging category to apply the level to + level (str): logging level to be used in the category + + Returns: + Dict[str, int]: A dictionary mapping categories to their log levels. + """ + + category_levels: Dict[str, int] = {} + level_value = logging._nameToLevel.get(str(level).upper()) + if level_value is None: + logging.warning(f"Unknown log level '{level}' for category '{category}'. Falling back to default 'INFO'.") + return category_levels + + if category == "all": + # Apply the log level to all categories and the root logger + for cat in CATEGORIES: + category_levels[cat] = level_value + # Set the root logger's level to the specified level + category_levels["root"] = level_value + elif category in CATEGORIES: + category_levels[category] = level_value + logging.info(f"Setting '{category}' category to level '{level}'.") + else: + logging.warning(f"Unknown logging category: {category}. No changes made.") + return category_levels + + +def parse_yaml_config(yaml_config: LoggingConfig) -> Dict[str, int]: + """ + Helper function to parse a yaml logging configuration found in the run.yaml + + Parameters: + yaml_config (Logging): the logger config object found in the run.yaml + + Returns: + Dict[str, int]: A dictionary mapping categories to their log levels. 
+ """ + category_levels = {} + for category, level in yaml_config.category_levels.items(): + category_levels.update(config_to_category_levels(category=category, level=level)) + + return category_levels + + def parse_environment_config(env_config: str) -> Dict[str, int]: """ Parse the LLAMA_STACK_LOGGING environment variable and return a dictionary of category log levels. @@ -53,25 +105,7 @@ def parse_environment_config(env_config: str) -> Dict[str, int]: category, level = pair.split("=", 1) category = category.strip().lower() level = level.strip().upper() # Convert to uppercase for logging._nameToLevel - - level_value = logging._nameToLevel.get(level) - if level_value is None: - logging.warning( - f"Unknown log level '{level}' for category '{category}'. Falling back to default 'INFO'." - ) - continue - - if category == "all": - # Apply the log level to all categories and the root logger - for cat in CATEGORIES: - category_levels[cat] = level_value - # Set the root logger's level to the specified level - category_levels["root"] = level_value - elif category in CATEGORIES: - category_levels[category] = level_value - logging.info(f"Setting '{category}' category to level '{level}'.") - else: - logging.warning(f"Unknown logging category: {category}. No changes made.") + category_levels.update(config_to_category_levels(category=category, level=level)) except ValueError: logging.warning(f"Invalid logging configuration: '{pair}'. Expected format: 'category=level'.") @@ -176,7 +210,9 @@ def setup_logging(category_levels: Dict[str, int], log_file: str | None) -> None logger.setLevel(root_level) -def get_logger(name: str, category: str = "uncategorized") -> logging.LoggerAdapter: +def get_logger( + name: str, category: str = "uncategorized", config: Optional[LoggingConfig] | None = None +) -> logging.LoggerAdapter: """ Returns a logger with the specified name and category. If no category is provided, defaults to 'uncategorized'. @@ -184,10 +220,14 @@ def get_logger(name: str, category: str = "uncategorized") -> logging.LoggerAdap Parameters: name (str): The name of the logger (e.g., module or filename). category (str): The category of the logger (default 'uncategorized'). + config (Logging): optional yaml config to override the existing logger configuration Returns: logging.LoggerAdapter: Configured logger with category support. 
""" + if config: + _category_levels.update(parse_yaml_config(config)) + logger = logging.getLogger(name) logger.setLevel(_category_levels.get(category, DEFAULT_LOG_LEVEL)) return logging.LoggerAdapter(logger, {"category": category}) diff --git a/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py b/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py index 02e4814bc..b835d0ec0 100644 --- a/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +++ b/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py @@ -34,7 +34,9 @@ class SystemDefaultGenerator(PromptTemplateGeneratorBase): ) return PromptTemplate( template_str.lstrip("\n"), - {"today": datetime.now().strftime("%d %B %Y")}, + { + "today": datetime.now().strftime("%d %B %Y") # noqa: DTZ005 - we don't care about timezones here since we are displaying the date + }, ) def data_examples(self) -> List[Any]: diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 0ae1996cc..03692bcc7 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -11,7 +11,7 @@ import re import secrets import string import uuid -from datetime import datetime +from datetime import datetime, timezone from typing import AsyncGenerator, List, Optional, Union from urllib.parse import urlparse @@ -239,7 +239,7 @@ class ChatAgent(ShieldRunnerMixin): in_progress_tool_call_step = await self.storage.get_in_progress_tool_call_step( request.session_id, request.turn_id ) - now = datetime.now().astimezone().isoformat() + now = datetime.now(timezone.utc).isoformat() tool_execution_step = ToolExecutionStep( step_id=(in_progress_tool_call_step.step_id if in_progress_tool_call_step else str(uuid.uuid4())), turn_id=request.turn_id, @@ -264,7 +264,7 @@ class ChatAgent(ShieldRunnerMixin): start_time = last_turn.started_at else: messages.extend(request.messages) - start_time = datetime.now().astimezone().isoformat() + start_time = datetime.now(timezone.utc).isoformat() input_messages = request.messages output_message = None @@ -295,7 +295,7 @@ class ChatAgent(ShieldRunnerMixin): input_messages=input_messages, output_message=output_message, started_at=start_time, - completed_at=datetime.now().astimezone().isoformat(), + completed_at=datetime.now(timezone.utc).isoformat(), steps=steps, ) await self.storage.add_turn_to_session(request.session_id, turn) @@ -386,7 +386,7 @@ class ChatAgent(ShieldRunnerMixin): return step_id = str(uuid.uuid4()) - shield_call_start_time = datetime.now().astimezone().isoformat() + shield_call_start_time = datetime.now(timezone.utc).isoformat() try: yield AgentTurnResponseStreamChunk( event=AgentTurnResponseEvent( @@ -410,7 +410,7 @@ class ChatAgent(ShieldRunnerMixin): turn_id=turn_id, violation=e.violation, started_at=shield_call_start_time, - completed_at=datetime.now().astimezone().isoformat(), + completed_at=datetime.now(timezone.utc).isoformat(), ), ) ) @@ -433,7 +433,7 @@ class ChatAgent(ShieldRunnerMixin): turn_id=turn_id, violation=None, started_at=shield_call_start_time, - completed_at=datetime.now().astimezone().isoformat(), + completed_at=datetime.now(timezone.utc).isoformat(), ), ) ) @@ -472,7 +472,7 @@ class ChatAgent(ShieldRunnerMixin): client_tools[tool.name] = tool while True: step_id = str(uuid.uuid4()) - inference_start_time = datetime.now().astimezone().isoformat() + inference_start_time = 
datetime.now(timezone.utc).isoformat() yield AgentTurnResponseStreamChunk( event=AgentTurnResponseEvent( payload=AgentTurnResponseStepStartPayload( @@ -582,7 +582,7 @@ class ChatAgent(ShieldRunnerMixin): turn_id=turn_id, model_response=copy.deepcopy(message), started_at=inference_start_time, - completed_at=datetime.now().astimezone().isoformat(), + completed_at=datetime.now(timezone.utc).isoformat(), ), ) ) @@ -653,7 +653,7 @@ class ChatAgent(ShieldRunnerMixin): turn_id=turn_id, tool_calls=[tool_call], tool_responses=[], - started_at=datetime.now().astimezone().isoformat(), + started_at=datetime.now(timezone.utc).isoformat(), ), ) yield message @@ -670,7 +670,7 @@ class ChatAgent(ShieldRunnerMixin): "input": message.model_dump_json(), }, ) as span: - tool_execution_start_time = datetime.now().astimezone().isoformat() + tool_execution_start_time = datetime.now(timezone.utc).isoformat() tool_call = message.tool_calls[0] tool_result = await self.execute_tool_call_maybe( session_id, @@ -708,7 +708,7 @@ class ChatAgent(ShieldRunnerMixin): ) ], started_at=tool_execution_start_time, - completed_at=datetime.now().astimezone().isoformat(), + completed_at=datetime.now(timezone.utc).isoformat(), ), ) ) diff --git a/llama_stack/providers/inline/agents/meta_reference/persistence.py b/llama_stack/providers/inline/agents/meta_reference/persistence.py index 2c04305fd..e7d7d1828 100644 --- a/llama_stack/providers/inline/agents/meta_reference/persistence.py +++ b/llama_stack/providers/inline/agents/meta_reference/persistence.py @@ -7,7 +7,7 @@ import json import logging import uuid -from datetime import datetime +from datetime import datetime, timezone from typing import List, Optional from pydantic import BaseModel @@ -36,7 +36,7 @@ class AgentPersistence: session_info = AgentSessionInfo( session_id=session_id, session_name=name, - started_at=datetime.now(), + started_at=datetime.now(timezone.utc), ) await self.kvstore.set( key=f"session:{self.agent_id}:{session_id}", diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index a1bebaa4c..85b351262 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -12,7 +12,7 @@ from llama_stack.apis.agents import Agents, StepType from llama_stack.apis.benchmarks import Benchmark from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets -from llama_stack.apis.inference import Inference, UserMessage +from llama_stack.apis.inference import Inference, SystemMessage, UserMessage from llama_stack.apis.scoring import Scoring from llama_stack.distribution.datatypes import Api from llama_stack.providers.datatypes import BenchmarksProtocolPrivate @@ -118,7 +118,7 @@ class MetaReferenceEvalImpl( for i, x in tqdm(enumerate(input_rows)): assert ColumnName.chat_completion_input.value in x, "Invalid input row" input_messages = json.loads(x[ColumnName.chat_completion_input.value]) - input_messages = [UserMessage(**x) for x in input_messages] + input_messages = [UserMessage(**x) for x in input_messages if x["role"] == "user"] # NOTE: only single-turn agent generation is supported. 
Create a new session for each input row session_create_response = await self.agents_api.create_agent_session(agent_id, f"session-{i}") @@ -168,10 +168,11 @@ class MetaReferenceEvalImpl( generations.append({ColumnName.generated_answer.value: response.completion_message.content}) elif ColumnName.chat_completion_input.value in x: chat_completion_input_json = json.loads(x[ColumnName.chat_completion_input.value]) - input_messages = [UserMessage(**x) for x in chat_completion_input_json] + input_messages = [UserMessage(**x) for x in chat_completion_input_json if x["role"] == "user"] messages = [] if candidate.system_message: messages.append(candidate.system_message) + messages += [SystemMessage(**x) for x in chat_completion_input_json if x["role"] == "system"] messages += input_messages response = await self.inference_api.chat_completion( model_id=candidate.model, diff --git a/llama_stack/providers/inline/post_training/torchtune/post_training.py b/llama_stack/providers/inline/post_training/torchtune/post_training.py index 3a1affc91..2c129ef41 100644 --- a/llama_stack/providers/inline/post_training/torchtune/post_training.py +++ b/llama_stack/providers/inline/post_training/torchtune/post_training.py @@ -3,7 +3,7 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from datetime import datetime +from datetime import datetime, timezone from typing import Any, Dict, Optional from llama_stack.apis.datasetio import DatasetIO @@ -64,7 +64,7 @@ class TorchtunePostTrainingImpl: job_status_response = PostTrainingJobStatusResponse( job_uuid=job_uuid, status=JobStatus.scheduled, - scheduled_at=datetime.now(), + scheduled_at=datetime.now(timezone.utc), ) self.jobs[job_uuid] = job_status_response @@ -84,7 +84,7 @@ class TorchtunePostTrainingImpl: ) job_status_response.status = JobStatus.in_progress - job_status_response.started_at = datetime.now() + job_status_response.started_at = datetime.now(timezone.utc) await recipe.setup() resources_allocated, checkpoints = await recipe.train() @@ -93,7 +93,7 @@ class TorchtunePostTrainingImpl: job_status_response.resources_allocated = resources_allocated job_status_response.checkpoints = checkpoints job_status_response.status = JobStatus.completed - job_status_response.completed_at = datetime.now() + job_status_response.completed_at = datetime.now(timezone.utc) except Exception: job_status_response.status = JobStatus.failed diff --git a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py index c88787f18..941c629e3 100644 --- a/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +++ b/llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py @@ -8,7 +8,7 @@ import gc import logging import os import time -from datetime import datetime +from datetime import datetime, timezone from functools import partial from pathlib import Path from typing import Any, Dict, List, Optional, Tuple @@ -532,7 +532,7 @@ class LoraFinetuningSingleDevice: checkpoint_path = await self.save_checkpoint(epoch=curr_epoch) checkpoint = Checkpoint( identifier=f"{self.model_id}-sft-{curr_epoch}", - created_at=datetime.now(), + created_at=datetime.now(timezone.utc), epoch=curr_epoch, post_training_job_id=self.job_uuid, path=checkpoint_path, diff --git a/llama_stack/providers/inline/scoring/basic/scoring.py 
b/llama_stack/providers/inline/scoring/basic/scoring.py index 00945b99d..599f5f98c 100644 --- a/llama_stack/providers/inline/scoring/basic/scoring.py +++ b/llama_stack/providers/inline/scoring/basic/scoring.py @@ -22,12 +22,19 @@ from llama_stack.providers.utils.common.data_schema_validator import ( ) from .config import BasicScoringConfig +from .scoring_fn.bfcl_scoring_fn import BFCLScoringFn from .scoring_fn.equality_scoring_fn import EqualityScoringFn from .scoring_fn.regex_parser_math_response_scoring_fn import RegexParserMathResponseScoringFn from .scoring_fn.regex_parser_scoring_fn import RegexParserScoringFn from .scoring_fn.subset_of_scoring_fn import SubsetOfScoringFn -FIXED_FNS = [EqualityScoringFn, SubsetOfScoringFn, RegexParserScoringFn, RegexParserMathResponseScoringFn] +FIXED_FNS = [ + EqualityScoringFn, + SubsetOfScoringFn, + RegexParserScoringFn, + RegexParserMathResponseScoringFn, + BFCLScoringFn, +] class BasicScoringImpl( diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py new file mode 100644 index 000000000..f37780f3e --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/bfcl_scoring_fn.py @@ -0,0 +1,93 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import json +import re +from typing import Any, Dict, Optional + +from llama_stack.apis.scoring import ScoringResultRow +from llama_stack.apis.scoring_functions import ScoringFnParams +from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn + +from ..utils.bfcl.ast_parser import decode_ast +from ..utils.bfcl.checker import ast_checker, is_empty_output +from .fn_defs.bfcl import bfcl + + +def postprocess(x: Dict[str, Any], test_category: str) -> Dict[str, Any]: + contain_func_call = False + error = None + error_type = None + checker_result = {} + try: + prediction = decode_ast(x["generated_answer"], x["language"]) or "" + contain_func_call = True + # if not is_function_calling_format_output(prediction): + if is_empty_output(prediction): + contain_func_call = False + error = "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability." + error_type = "ast_decoder:decoder_wrong_output_format" + else: + checker_result = ast_checker( + json.loads(x["function"]), + prediction, + json.loads(x["ground_truth"]), + x["language"], + test_category=test_category, + model_name="", + ) + except Exception as e: + prediction = "" + error = f"Invalid syntax. Failed to decode AST. {str(e)}" + error_type = "ast_decoder:decoder_failed" + return { + "prediction": prediction, + "contain_func_call": contain_func_call, + "valid": checker_result.get("valid", False), + "error": error or checker_result.get("error", ""), + "error_type": error_type or checker_result.get("error_type", ""), + } + + +def gen_valid(x: Dict[str, Any]) -> Dict[str, float]: + return {"valid": x["valid"]} + + +def gen_relevance_acc(x: Dict[str, Any]) -> Dict[str, float]: + # This function serves for both relevance and irrelevance tests, which share the exact opposite logic. + # If `test_category` is "irrelevance", the model is expected to output no function call. 
+ # No function call means either the AST decoding fails (an error message is generated) or the decoded AST does not contain any function call (such as an empty list, `[]`). + # If `test_category` is "relevance", the model is expected to output a function call, and an empty list doesn't count as a function call. + acc = not x["contain_func_call"] if "irrelevance" in x["id"] else x["contain_func_call"] + return {"valid": float(acc)} + + +class BFCLScoringFn(RegisteredBaseScoringFn): + """ + A scoring_fn for BFCL + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.supported_fn_defs_registry = { + bfcl.identifier: bfcl, + } + + async def score_row( + self, + input_row: Dict[str, Any], + scoring_fn_identifier: Optional[str] = "bfcl", + scoring_params: Optional[ScoringFnParams] = None, + ) -> ScoringResultRow: + test_category = re.sub(r"_[0-9_-]+$", "", input_row["id"]) + score_result = postprocess(input_row, test_category) + if test_category in {"irrelevance", "live_relevance", "live_irrelevance"}: + score = gen_relevance_acc(score_result)["valid"] + else: + score = gen_valid(score_result)["valid"] + return { + "score": float(score), + } diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/bfcl.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/bfcl.py new file mode 100644 index 000000000..392d92c86 --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/bfcl.py @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.apis.common.type_system import NumberType +from llama_stack.apis.scoring_functions import ( + AggregationFunctionType, + BasicScoringFnParams, + ScoringFn, +) + +bfcl = ScoringFn( + identifier="basic::bfcl", + description="BFCL complex scoring", + return_type=NumberType(), + provider_id="basic", + provider_resource_id="bfcl", + params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.accuracy]), +) diff --git a/llama_stack/providers/inline/scoring/basic/utils/bfcl/__init__.py b/llama_stack/providers/inline/scoring/basic/utils/bfcl/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py b/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py new file mode 100644 index 000000000..445cdfc77 --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/ast_parser.py @@ -0,0 +1,296 @@ +# ruff: noqa +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree.
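+# BFCL AST-parsing helpers: decode_ast() normalizes a model-generated function-call string (wrapping it in a list if needed), and ast_parse() converts it into a list of {function_name: {argument: value}} dicts (Python calls via the ast module, Java and JavaScript calls via tree-sitter) for the BFCL checker to compare against the ground truth.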
+import ast + +from .tree_sitter import get_parser + + +def parse_java_function_call(source_code): + if not source_code.endswith(";"): + source_code += ";" # Necessary for the parser not to register an error + parser = get_parser("java") + tree = parser.parse(bytes(source_code, "utf8")) + root_node = tree.root_node + + if root_node.has_error: + raise Exception("Error parsing java the source code.") + + def get_text(node): + """Returns the text represented by the node.""" + return source_code[node.start_byte : node.end_byte] + + def traverse_node(node, nested=False): + if node.type == "string_literal": + if nested: + return get_text(node) + # Strip surrounding quotes from string literals + return get_text(node)[1:-1] + elif node.type == "character_literal": + if nested: + return get_text(node) + # Strip surrounding single quotes from character literals + return get_text(node)[1:-1] + """Traverse the node to collect texts for complex structures.""" + if node.type in [ + "identifier", + "class_literal", + "type_identifier", + "method_invocation", + ]: + return get_text(node) + elif node.type == "array_creation_expression": + # Handle array creation expression specifically + type_node = node.child_by_field_name("type") + value_node = node.child_by_field_name("value") + type_text = traverse_node(type_node, True) + value_text = traverse_node(value_node, True) + return f"new {type_text}[]{value_text}" + elif node.type == "object_creation_expression": + # Handle object creation expression specifically + type_node = node.child_by_field_name("type") + arguments_node = node.child_by_field_name("arguments") + type_text = traverse_node(type_node, True) + if arguments_node: + # Process each argument carefully, avoiding unnecessary punctuation + argument_texts = [] + for child in arguments_node.children: + if child.type not in [ + ",", + "(", + ")", + ]: # Exclude commas and parentheses + argument_text = traverse_node(child, True) + argument_texts.append(argument_text) + arguments_text = ", ".join(argument_texts) + return f"new {type_text}({arguments_text})" + else: + return f"new {type_text}()" + elif node.type == "set": + # Handling sets specifically + items = [traverse_node(n, True) for n in node.children if n.type not in [",", "set"]] + return "{" + ", ".join(items) + "}" + + elif node.child_count > 0: + return "".join(traverse_node(child, True) for child in node.children) + else: + return get_text(node) + + def extract_arguments(args_node): + arguments = {} + for child in args_node.children: + if child.type == "assignment_expression": + # For named parameters + name_node, value_node = child.children[0], child.children[2] + name = get_text(name_node) + value = traverse_node(value_node) + if name in arguments: + if not isinstance(arguments[name], list): + arguments[name] = [arguments[name]] + arguments[name].append(value) + else: + arguments[name] = value + # arguments.append({'name': name, 'value': value}) + elif child.type in ["identifier", "class_literal", "set"]: + # For unnamed parameters and handling sets + value = traverse_node(child) + if None in arguments: + if not isinstance(arguments[None], list): + arguments[None] = [arguments[None]] + arguments[None].append(value) + else: + arguments[None] = value + return arguments + + def traverse(node): + if node.type == "method_invocation": + # Extract the function name and its arguments + method_name = get_text(node.child_by_field_name("name")) + class_name_node = node.child_by_field_name("object") + if class_name_node: + class_name = 
get_text(class_name_node) + function_name = f"{class_name}.{method_name}" + else: + function_name = method_name + arguments_node = node.child_by_field_name("arguments") + if arguments_node: + arguments = extract_arguments(arguments_node) + for key, value in arguments.items(): + if isinstance(value, list): + raise Exception("Error: Multiple arguments with the same name are not supported.") + return [{function_name: arguments}] + + else: + for child in node.children: + result = traverse(child) + if result: + return result + + result = traverse(root_node) + return result if result else {} + + +def parse_javascript_function_call(source_code): + if not source_code.endswith(";"): + source_code += ";" # Necessary for the parser not to register an error + parser = get_parser("javascript") + # Parse the source code + tree = parser.parse(bytes(source_code, "utf8")) + root_node = tree.root_node + if root_node.has_error: + raise Exception("Error js parsing the source code.") + + # Function to recursively extract argument details + def extract_arguments(node): + args = {} + for child in node.children: + if child.type == "assignment_expression": + # Extract left (name) and right (value) parts of the assignment + name = child.children[0].text.decode("utf-8") + value = child.children[2].text.decode("utf-8") + if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")): + value = value[1:-1] # Trim the quotation marks + if name in args: + if not isinstance(args[name], list): + args[name] = [args[name]] + args[name].append(value) + else: + args[name] = value + + elif child.type == "identifier" or child.type == "true": + # Handle non-named arguments and boolean values + value = child.text.decode("utf-8") + if None in args: + if not isinstance(args[None], list): + args[None] = [args[None]] + args[None].append(value) + else: + args[None] = value + return args + + # Find the function call and extract its name and arguments + if root_node.type == "program": + for child in root_node.children: + if child.type == "expression_statement": + for sub_child in child.children: + if sub_child.type == "call_expression": + function_name = sub_child.children[0].text.decode("utf8") + arguments_node = sub_child.children[1] + parameters = extract_arguments(arguments_node) + for key, value in parameters.items(): + if isinstance(value, list): + raise Exception("Error: Multiple arguments with the same name are not supported.") + result = [{function_name: parameters}] + return result + + +def ast_parse(input_str, language="Python"): + if language == "Python": + cleaned_input = input_str.strip("[]'") + parsed = ast.parse(cleaned_input, mode="eval") + extracted = [] + if isinstance(parsed.body, ast.Call): + extracted.append(resolve_ast_call(parsed.body)) + else: + for elem in parsed.body.elts: + extracted.append(resolve_ast_call(elem)) + return extracted + elif language == "Java": + return parse_java_function_call(input_str[1:-1]) # Remove the [ and ] from the string + elif language == "JavaScript": + return parse_javascript_function_call(input_str[1:-1]) + else: + raise NotImplementedError(f"Unsupported language: {language}") + + +def resolve_ast_call(elem): + # Handle nested attributes for deeply nested module paths + func_parts = [] + func_part = elem.func + while isinstance(func_part, ast.Attribute): + func_parts.append(func_part.attr) + func_part = func_part.value + if isinstance(func_part, ast.Name): + func_parts.append(func_part.id) + func_name = ".".join(reversed(func_parts)) + 
args_dict = {} + # Parse when args are simply passed as an unnamed dictionary arg + for arg in elem.args: + if isinstance(arg, ast.Dict): + for key, value in zip(arg.keys, arg.values): + if isinstance(key, ast.Constant): + arg_name = key.value + output = resolve_ast_by_type(value) + args_dict[arg_name] = output + for arg in elem.keywords: + output = resolve_ast_by_type(arg.value) + args_dict[arg.arg] = output + return {func_name: args_dict} + + +def resolve_ast_by_type(value): + if isinstance(value, ast.Constant): + if value.value is Ellipsis: + output = "..." + else: + output = value.value + elif isinstance(value, ast.UnaryOp): + output = -value.operand.value + elif isinstance(value, ast.List): + output = [resolve_ast_by_type(v) for v in value.elts] + elif isinstance(value, ast.Dict): + output = {resolve_ast_by_type(k): resolve_ast_by_type(v) for k, v in zip(value.keys, value.values)} + elif isinstance(value, ast.NameConstant): # Added this condition to handle boolean values + output = value.value + elif isinstance(value, ast.BinOp): # Added this condition to handle function calls as arguments + output = eval(ast.unparse(value)) + elif isinstance(value, ast.Name): + output = value.id + elif isinstance(value, ast.Call): + if len(value.keywords) == 0: + output = ast.unparse(value) + else: + output = resolve_ast_call(value) + elif isinstance(value, ast.Tuple): + output = tuple(resolve_ast_by_type(v) for v in value.elts) + elif isinstance(value, ast.Lambda): + output = eval(ast.unparse(value.body[0].value)) + elif isinstance(value, ast.Ellipsis): + output = "..." + elif isinstance(value, ast.Subscript): + try: + output = ast.unparse(value.body[0].value) + except: + output = ast.unparse(value.value) + "[" + ast.unparse(value.slice) + "]" + else: + raise Exception(f"Unsupported AST type: {type(value)}") + return output + + +def decode_ast(result, language="Python"): + func = result + func = func.replace("\n", "") # remove new line characters + if not func.startswith("["): + func = "[" + func + if not func.endswith("]"): + func = func + "]" + decoded_output = ast_parse(func, language) + return decoded_output + + +def decode_execute(result): + func = result + func = func.replace("\n", "") # remove new line characters + if not func.startswith("["): + func = "[" + func + if not func.endswith("]"): + func = func + "]" + decode_output = ast_parse(func) + execution_list = [] + for function_call in decode_output: + for key, value in function_call.items(): + execution_list.append(f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})") + return execution_list diff --git a/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py b/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py new file mode 100644 index 000000000..f6aab123c --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/checker.py @@ -0,0 +1,989 @@ +# ruff: noqa +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +import json +import re +import time +from typing import Any + +# Comment out for now until we actually use the rest checker in evals +# import requests # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function. + + +class NoAPIKeyError(Exception): + def __init__(self): + self.message = "❗️Please fill in the API keys in the function_credential_config.json file. 
If you do not provide the API keys, the executable test category results will be inaccurate." + super().__init__(self.message) + + +REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2 + + +JAVA_TYPE_CONVERSION = { + "byte": int, + "short": int, + "integer": int, + "float": float, + "double": float, + "long": int, + "boolean": bool, + "char": str, + "Array": list, + "ArrayList": list, + "Set": set, + "HashMap": dict, + "Hashtable": dict, + "Queue": list, # this can be `queue.Queue` as well, for simplicity we check with list + "Stack": list, + "String": str, + "any": str, +} + +JS_TYPE_CONVERSION = { + "String": str, + "integer": int, + "float": float, + "Bigint": int, + "Boolean": bool, + "dict": dict, + "array": list, + "any": str, +} + +# We switch to conditional import for the following two imports to avoid unnecessary installations. +# User doesn't need to setup the tree-sitter packages if they are not running the test for that language. +# from js_type_converter import js_type_converter +# from java_type_converter import java_type_converter + +PYTHON_TYPE_MAPPING = { + "string": str, + "integer": int, + "float": float, + "boolean": bool, + "array": list, + "tuple": list, + "dict": dict, + "any": str, +} + +# This is the list of types that we need to recursively check its values +PYTHON_NESTED_TYPE_CHECK_LIST = ["array", "tuple"] + + +NESTED_CONVERSION_TYPE_LIST = ["Array", "ArrayList", "array"] + + +#### Helper functions for AST #### +def find_description(func_descriptions, name): + if type(func_descriptions) == list: + for func_description in func_descriptions: + if func_description["name"] == name: + return func_description + return None + else: + # it is a dict, there is only one function + return func_descriptions + + +def get_possible_answer_type(possible_answer: list): + for answer in possible_answer: + if answer != "": # Optional parameter + return type(answer) + return None + + +def type_checker( + param: str, + value, + possible_answer: list, + expected_type_description: str, + expected_type_converted, + nested_type_converted, +): + # NOTE: This type checker only supports nested type checking for one level deep. + # We didn't implement recursive type checking for nested types, as it's not needed for the current use case and it's very complex. + + result: Any = { + "valid": True, + "error": [], + "is_variable": False, + "error_type": "type_error:simple", + } + + is_variable = False + # check for the case where a variable is used instead of a actual value. + # use the type in possible_answer as the expected type + possible_answer_type = get_possible_answer_type(possible_answer) + # if possible_answer only contains optional parameters, we can't determine the type + if possible_answer_type != None: + # we are being precise here. + # in fact, possible_answer_type should always be string, as that's how we treat varibale in possible_answer + if possible_answer_type != expected_type_converted: + is_variable = True + + # value is the same type as in function description + if type(value) == expected_type_converted: + # We don't need to do recursive check for simple types + if nested_type_converted == None: + result["is_variable"] = is_variable + return result + else: + for possible_answer_item in possible_answer: + flag = True # Each parameter should match to at least one possible answer type. + # Here, we assume that each item should be the same type. We could also relax it. 
+ if type(possible_answer_item) == list: + for value_item in value: + checker_result = type_checker( + param, + value_item, + possible_answer_item, + str(nested_type_converted), + nested_type_converted, + None, + ) + if not checker_result["valid"]: + flag = False + break + + if flag: + return {"valid": True, "error": [], "is_variable": is_variable} + + result["valid"] = False + result["error"] = [ + f"Nested type checking failed for parameter {repr(param)}. Expected outer type {expected_type_description} with inner type {str(nested_type_converted)}. Parameter value: {repr(value)}." + ] + result["error_type"] = "type_error:nested" + + # value is not as expected, check for the case where a variable is used instead of a actual value + # use the type in possible_answer as the expected type + possible_answer_type = get_possible_answer_type(possible_answer) + # if possible_answer only contains optional parameters, we can't determine the type + if possible_answer_type != None: + # we are being precise here. + # in fact, possible_answer_type should always be string, as that's how we treat varibale in possible_answer + if type(value) == possible_answer_type: + result["is_variable"] = True + return result + + result["valid"] = False + result["error"].append( + f"Incorrect type for parameter {repr(param)}. Expected type {expected_type_description}, got {type(value).__name__}. Parameter value: {repr(value)}." + ) + result["error_type"] = "type_error:simple" + return result + + +def standardize_string(input_string: str): + # This function standardizes the string by removing all the spaces, ",./-_*^" punctuation, and converting it to lowercase + # It will also convert all the single quotes to double quotes + # This is used to compare the model output with the possible answers + # We don't want to punish model for answer like April 1, 2024 vs April 1,2024, vs April 1 2024 + regex_string = r"[ \,\.\/\-\_\*\^]" + return re.sub(regex_string, "", input_string).lower().replace("'", '"') + + +def string_checker(param: str, model_output: str, possible_answer: list): + standardize_possible_answer = [] + standardize_model_output = standardize_string(model_output) + for i in range(len(possible_answer)): + if type(possible_answer[i]) == str: + standardize_possible_answer.append(standardize_string(possible_answer[i])) + + if standardize_model_output not in standardize_possible_answer: + return { + "valid": False, + "error": [ + f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}. Case insensitive." 
+ ], + "error_type": "value_error:string", + } + + return {"valid": True, "error": []} + + +def list_checker(param: str, model_output: list, possible_answer: list): + # Convert the tuple to a list + + standardize_model_output = list(model_output) + + # If the element in the list is a string, we need to standardize it + for i in range(len(standardize_model_output)): + if type(standardize_model_output[i]) == str: + standardize_model_output[i] = standardize_string(model_output[i]) + + standardize_possible_answer: Any = [] + # We also need to standardize the possible answers + for i in range(len(possible_answer)): + standardize_possible_answer.append([]) + for j in range(len(possible_answer[i])): + if type(possible_answer[i][j]) == str: + standardize_possible_answer[i].append(standardize_string(possible_answer[i][j])) + else: + standardize_possible_answer[i].append(possible_answer[i][j]) + + if standardize_model_output not in standardize_possible_answer: + return { + "valid": False, + "error": [ + f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}." + ], + "error_type": "value_error:list/tuple", + } + + return {"valid": True, "error": []} + + +def dict_checker(param: str, model_output: dict, possible_answers: list): + # This function works for simple dictionaries, but not dictionaries with nested dictionaries. + # The current dataset only contains simple dictionaries, so this is sufficient. + + result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"} + for i in range(len(possible_answers)): + if possible_answers[i] == "": + continue + + result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"} + + flag = True + + possible_answer = possible_answers[i] + # possible_anwer is a single dictionary + + for key, value in model_output.items(): + if key not in possible_answer: + result["valid"] = False + result["error"].append(f"Unexpected dict key parameter: '{key}'.") # type: ignore[attr-defined] + result["error_type"] = "value_error:dict_key" + flag = False + break + + standardize_value = value + # If the value is a string, we need to standardize it + if type(value) == str: + standardize_value = standardize_string(value) + + # We also need to standardize the possible answers if they are string + standardize_possible_answer = [] + for i in range(len(possible_answer[key])): + if type(possible_answer[key][i]) == str: + standardize_possible_answer.append(standardize_string(possible_answer[key][i])) + else: + standardize_possible_answer.append(possible_answer[key][i]) + + if standardize_value not in standardize_possible_answer: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Invalid value for parameter {repr(key)}: {repr(value)}. Expected one of {standardize_possible_answer}." 
+ ) + result["error_type"] = "value_error:dict_value" + flag = False + break + + for key, value in possible_answer.items(): + if key not in model_output and "" not in value: + result["valid"] = False + result["error"].append(f"Missing dict key parameter: '{key}'.") # type: ignore[attr-defined] + result["error_type"] = "value_error:dict_key" + flag = False + break + + if flag: + return {"valid": True, "error": []} + + return result + + +def list_dict_checker(param: str, model_output: list, possible_answers: list): + # This function takes in a list of dictionaries and checks if each dictionary is valid + # The order of the dictionaries in the list must match the order of the possible answers + + result = {"valid": False, "error": [], "error_type": "list_dict_checker:unclear"} + + for answer_index in range(len(possible_answers)): + flag = True # True means so far, all dictionaries are valid + + # Only proceed if the number of dictionaries in the list matches the number of dictionaries in the possible answers + if len(model_output) != len(possible_answers[answer_index]): + result["valid"] = False + result["error"] = ["Wrong number of dictionaries in the list."] + result["error_type"] = "value_error:list_dict_count" + flag = False + continue + + for dict_index in range(len(model_output)): + result = dict_checker( + param, + model_output[dict_index], + [possible_answers[answer_index][dict_index]], + ) + if not result["valid"]: + flag = False + break + if flag: + return {"valid": True, "error": []} + + return result + + +def simple_function_checker( + func_description: dict, + model_output: dict, + possible_answer: dict, + language: str, + model_name: str, +): + possible_answer = list(possible_answer.values())[0] + # Extract function name and parameters details + func_name = func_description["name"] + param_details = func_description["parameters"]["properties"] + required_params = func_description["parameters"]["required"] + + # Initialize a result dictionary + result = { + "valid": True, + "error": [], + "error_type": "simple_function_checker:unclear", + } + + # Check if function name matches + if func_name not in model_output: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Function name {repr(func_name)} not found in model output." 
+ ) + result["error_type"] = "simple_function_checker:wrong_func_name" + return result + + model_params = model_output[func_name] + + # Check for required parameters in model output + for param in required_params: + if param not in model_params: + result["valid"] = False + result["error"].append(f"Missing required parameter: {repr(param)}.") # type: ignore[attr-defined] + result["error_type"] = "simple_function_checker:missing_required" + return result + + # Validate types and values for each parameter in model output + for param, value in model_params.items(): + if param not in param_details or param not in possible_answer: + result["valid"] = False + result["error"].append(f"Unexpected parameter: {repr(param)}.") # type: ignore[attr-defined] + result["error_type"] = "simple_function_checker:unexpected_param" + return result + + full_param_details = param_details[param] + expected_type_description = full_param_details["type"] # This is a string + is_variable = False + nested_type_converted = None + + if language == "Java": + from evals.utils.bfcl.java_type_converter import java_type_converter + + expected_type_converted = JAVA_TYPE_CONVERSION[expected_type_description] + + if expected_type_description in JAVA_TYPE_CONVERSION: + if type(value) != str: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." + ) + result["error_type"] = "type_error:java" + return result + + if expected_type_description in NESTED_CONVERSION_TYPE_LIST: + nested_type = param_details[param]["items"]["type"] + nested_type_converted = JAVA_TYPE_CONVERSION[nested_type] + value = java_type_converter(value, expected_type_description, nested_type) + else: + value = java_type_converter(value, expected_type_description) + + elif language == "JavaScript": + from evals.utils.bfcl.js_type_converter import js_type_converter + + expected_type_converted = JS_TYPE_CONVERSION[expected_type_description] + + if expected_type_description in JS_TYPE_CONVERSION: + if type(value) != str: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}." + ) + result["error_type"] = "type_error:js" + return result + + if expected_type_description in NESTED_CONVERSION_TYPE_LIST: + nested_type = param_details[param]["items"]["type"] + nested_type_converted = JS_TYPE_CONVERSION[nested_type] + value = js_type_converter(value, expected_type_description, nested_type) + else: + value = js_type_converter(value, expected_type_description) + + elif language == "Python": + expected_type_converted = PYTHON_TYPE_MAPPING[expected_type_description] + if expected_type_description in PYTHON_NESTED_TYPE_CHECK_LIST: + nested_type = param_details[param]["items"]["type"] + nested_type_converted = PYTHON_TYPE_MAPPING[nested_type] + + # We convert all tuple value to list when the expected type is tuple. + # The conversion is necessary because any tuple in the possible answer would become a list after being processed through json.dump() and json.load(). + # This does introduce some false positive (eg, when the model provides a list value instead of tuple). We hope to find a better solution in the future. 
+ if expected_type_description == "tuple" and type(value) == tuple: + value = list(value) + + # Allow python auto conversion from int to float + if language == "Python" and expected_type_description == "float" and type(value) == int: + value = float(value) + + # Type checking + # In fact, we only check for Python here. + # Type check for other languages are handled by the type converter, and so their value (after conversion) is always correct. + type_check_result = type_checker( + param, + value, + possible_answer[param], + expected_type_description, + expected_type_converted, + nested_type_converted, + ) + is_variable = type_check_result["is_variable"] + if not type_check_result["valid"]: + return type_check_result + + # It doesn't make sense to special handle dictionaries and list of dictionaries if the value is a variable. + # We can just treat the variable as a string and use the normal flow. + if not is_variable: + # Special handle for dictionaries + if expected_type_converted == dict: + result = dict_checker(param, value, possible_answer[param]) + if not result["valid"]: + return result + continue + + # Special handle for list of dictionaries + elif expected_type_converted == list and nested_type_converted == dict: + result = list_dict_checker(param, value, possible_answer[param]) + if not result["valid"]: + return result + continue + + # Special handle for strings + elif expected_type_converted == str: + # We don't check for case sensitivity for string, as long as it's not a variable + result = string_checker(param, value, possible_answer[param]) + if not result["valid"]: + return result + continue + + elif expected_type_converted == list: + result = list_checker(param, value, possible_answer[param]) + if not result["valid"]: + return result + continue + + # Check if the value is within the possible answers + if value not in possible_answer[param]: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Invalid value for parameter {repr(param)}: {repr(value)}. Expected one of {possible_answer[param]}." + ) + result["error_type"] = "value_error:others" + return result + + # Check for optional parameters not provided but allowed + for param in possible_answer: + if param not in model_params and "" not in possible_answer[param]: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Optional parameter {repr(param)} not provided and not marked as optional." 
+ ) + result["error_type"] = "simple_function_checker:missing_optional" + return result + + return result + + +def parallel_function_checker_enforce_order( + func_descriptions: list, + model_output: list, + possible_answers: dict, + language: str, + model_name: str, +): + if len(model_output) != len(possible_answers): + return { + "valid": False, + "error": ["Wrong number of functions."], + "error_type": "parallel_function_checker_enforce_order:wrong_count", + } + + func_name_list = list(possible_answers.keys()) + possible_answers_list = [] + + for key, value in possible_answers.items(): + possible_answers_list.append({key: value}) + + for i in range(len(possible_answers_list)): + func_description = find_description(func_descriptions, func_name_list[i]) + + result = simple_function_checker( + func_description, + model_output[i], + possible_answers_list[i], + language, + model_name, + ) + if not result["valid"]: + return result + + return {"valid": True, "error": []} + + +def parallel_function_checker_no_order( + func_descriptions: list, + model_output: list, + possible_answers: list, + language: str, + model_name: str, +): + if len(model_output) != len(possible_answers): + return { + "valid": False, + "error": ["Wrong number of functions."], + "error_type": "parallel_function_checker_no_order:wrong_count", + } + + matched_indices = [] + + # We go throught the possible answers one by one, and eliminate the model output that matches the possible answer + # It must be this way because we need ground truth to fetch the correct function description + for i in range(len(possible_answers)): + # possible_answers[i] is a dictionary with only one key + func_name_expected = list(possible_answers[i].keys())[0] + func_description = find_description(func_descriptions, func_name_expected) + + all_errors = [] + + for index in range(len(model_output)): + if index in matched_indices: + continue + + result = simple_function_checker( + func_description, + model_output[index], + possible_answers[i], + language, + model_name, + ) + + if result["valid"]: + matched_indices.append(index) + break + else: + all_errors.append( + { + f"Model Result Index {index}": { + "sub_error": result["error"], + "sub_error_type": result["error_type"], + "model_output_item": model_output[index], + "possible_answer_item": possible_answers[i], + } + } + ) + + if not result["valid"]: + considered_indices = [i for i in range(len(model_output)) if i not in matched_indices] + all_errors.insert( + 0, + f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.", # type: ignore[arg-type] + ) + return { + "valid": False, + "error": all_errors, + "error_type": "parallel_function_checker_no_order:cannot_find_match", + } + + return {"valid": True, "error": []} + + +def multiple_function_checker( + func_descriptions: list, + model_output: list, + possible_answers: list, + language: str, + model_name: str, +): + if len(model_output) != len(possible_answers): + return { + "valid": False, + "error": ["Wrong number of functions."], + "error_type": "multiple_function_checker:wrong_count", + } + + # possible_answers is a list of only one dictionary with only one key + func_name_expected = list(possible_answers[0].keys())[0] + func_description = find_description(func_descriptions, func_name_expected) + return simple_function_checker( + func_description, + model_output[0], + possible_answers[0], + language, + model_name, + ) + + +def patten_matcher(exec_output, expected_result, 
function_call, is_sanity_check): + result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"} + + if type(exec_output) != type(expected_result): + return { + "valid": False, + "error": [ + f"Wrong execution result type for {repr(function_call)}. Expected type: {type(expected_result)}, but got: {type(exec_output)}." + ], + "error_type": "executable_checker:wrong_result_type", + "model_executed_output": exec_output, + } + if type(exec_output) == dict: + # We loose the requirement for the sanity check as the expected result used in the sanity check might not be the most up-to-date one. + # This happens when the key is a timestamp or a random number. + if is_sanity_check: + if len(exec_output) != len(expected_result): + return { + "valid": False, + "error": [ + f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}." + ], + "error_type": "executable_checker:wrong_result_type:dict_length", + "model_executed_output": exec_output, + } + else: + return result + + for key, value in expected_result.items(): + if key not in exec_output: + return { + "valid": False, + "error": [ + f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not found in the model output." + ], + "error_type": "executable_checker:wrong_result_type:dict_key_not_found", + "model_executed_output": exec_output, + } + for key, value in exec_output.items(): + if key not in expected_result: + return { + "valid": False, + "error": [ + f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not expected in the model output." + ], + "error_type": "executable_checker:wrong_result_type:dict_extra_key", + "model_executed_output": exec_output, + } + if type(exec_output) == list: + if len(exec_output) != len(expected_result): + return { + "valid": False, + "error": [ + f"Wrong execution result pattern for {repr(function_call)}. Expect type list, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}." + ], + "error_type": "executable_checker:wrong_result_type:list_length", + "model_executed_output": exec_output, + } + return result + + +#### Helper functions for Exec #### +def executable_checker_simple( + function_call: str, + expected_result, + expected_result_type: str, + is_sanity_check=False, +): + result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"} + + exec_dict: Any = {} + + try: + exec( + "from executable_python_function import *" + "\nresult=" + function_call, + exec_dict, + ) + exec_output = exec_dict["result"] + except NoAPIKeyError as e: + raise e + except Exception as e: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Error in execution: {repr(function_call)}. Error: {str(e)}" + ) + result["error_type"] = "executable_checker:execution_error" + return result + + # We need to special handle the case where the execution result is a tuple and convert it to a list + # Because when json is stored, the tuple is converted to a list, and so the expected result is a list when loaded from json + if isinstance(exec_output, tuple): + exec_output = list(exec_output) + + if expected_result_type == "exact_match": + if exec_output != expected_result: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Wrong execution result for {repr(function_call)}. 
Expected: {expected_result}, but got: {exec_output}." + ) + result["error_type"] = "executable_checker:wrong_result" + result["model_executed_output"] = exec_output + return result + + elif expected_result_type == "real_time_match": + # Allow for 5% difference + if (type(expected_result) == float or type(expected_result) == int) and ( + type(exec_output) == float or type(exec_output) == int + ): + if not ( + expected_result * (1 - REAL_TIME_MATCH_ALLOWED_DIFFERENCE) + <= exec_output + <= expected_result * (1 + REAL_TIME_MATCH_ALLOWED_DIFFERENCE) + ): + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. {REAL_TIME_MATCH_ALLOWED_DIFFERENCE * 100}% difference allowed." + ) + result["error_type"] = "executable_checker:wrong_result_real_time" + result["model_executed_output"] = exec_output + return result + else: + result["valid"] = False + result["error"].append( # type: ignore[attr-defined] + f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. Type needs to be float or int for real time match criteria." + ) + result["error_type"] = "executable_checker:wrong_result_real_time" + result["model_executed_output"] = exec_output + return result + + else: + # structural match + pattern_match_result = patten_matcher(exec_output, expected_result, function_call, is_sanity_check) + if not pattern_match_result["valid"]: + return pattern_match_result + + return result + + +def executable_checker_parallel_no_order( + decoded_result: list, expected_exec_result: list, expected_exec_result_type: list +): + if len(decoded_result) != len(expected_exec_result): + return { + "valid": False, + "error": [ + f"Wrong number of functions provided. Expected {len(expected_exec_result)}, but got {len(decoded_result)}." + ], + "error_type": "value_error:exec_result_count", + } + + matched_indices = [] + for i in range(len(expected_exec_result)): + all_errors = [] + for index in range(len(decoded_result)): + if index in matched_indices: + continue + + result = executable_checker_simple( + decoded_result[index], + expected_exec_result[i], + expected_exec_result_type[i], + False, + ) + + if result["valid"]: + matched_indices.append(index) + break + else: + all_errors.append( + { + f"Model Result Index {index}": { + "sub_error": result["error"], + "sub_error_type": result["error_type"], + "model_executed_output": ( + result["model_executed_output"] if "model_executed_output" in result else None + ), + } + } + ) + + if not result["valid"]: + considered_indices = [i for i in range(len(decoded_result)) if i not in matched_indices] + all_errors.insert( + 0, + f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.", # type: ignore[arg-type] + ) + return { + "valid": False, + "error": all_errors, + "error_type": "executable_checker:cannot_find_match", + } + + return {"valid": True, "error": [], "error_type": "executable_checker:unclear"} + + +#### Main function #### +def executable_checker_rest(func_call, idx): + # Move this here for now to avoid needing to read this file / fix paths to be relative to dataset_dir. Fix when it's actually needed / used. 
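+    # Hard-coded ground-truth file (one JSON response per line) used only by the REST executable checker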
+ EVAL_GROUND_TRUTH_PATH = "/mnt/wsfuse/fair_llm_v2/datasets/eval/bfcl/rest-eval-response_v5.jsonl" # Ground truth file for v5 for rest execution + with open(EVAL_GROUND_TRUTH_PATH, "r") as f: + EVAL_GROUND_TRUTH = f.readlines() + if "https://geocode.maps.co" in func_call: + time.sleep(2) + if "requests_get" in func_call: + func_call = func_call.replace("requests_get", "requests.get") + try: + response = eval(func_call) + except Exception as e: + return { + "valid": False, + "error": [f"Execution failed. {str(e)}"], + "error_type": "executable_checker_rest:execution_error", + } + + try: + if response.status_code == 200: + eval_GT_json = json.loads(EVAL_GROUND_TRUTH[idx]) + try: + if isinstance(eval_GT_json, dict): + if isinstance(response.json(), dict): + if set(eval_GT_json.keys()) == set(response.json().keys()): + return {"valid": True, "error": [], "error_type": ""} + return { + "valid": False, + "error": ["Key inconsistency"], + "error_type": "executable_checker_rest:wrong_key", + } + return { + "valid": False, + "error": [f"Expected dictionary, but got {type(response.json())}"], + "error_type": "executable_checker_rest:wrong_type", + } + + elif isinstance(eval_GT_json, list): + if isinstance(response.json(), list): + if len(eval_GT_json) != len(response.json()): + return { + "valid": False, + "error": [f"Response list length inconsistency."], + "error_type": "value_error:exec_result_rest_count", + } + + else: + for i in range(len(eval_GT_json)): + if set(eval_GT_json[i].keys()) != set(response.json()[i].keys()): + return { + "valid": False, + "error": [f"Key inconsistency"], + "error_type": "executable_checker_rest:wrong_key", + } + + return {"valid": True, "error": []} + else: + return { + "valid": False, + "error": [f"Expected list, but got {type(response.json())}"], + "error_type": "executable_checker_rest:wrong_type", + } + return { + "valid": False, + "error": [f"Expected dict or list, but got {type(response.json())}"], + "error_type": "executable_checker_rest:wrong_type", + } + except Exception as e: + return { + "valid": False, + "error": [ + f"Error in execution and type checking. Status code: {response.status_code}. Error: {str(e)}" + ], + "error_type": "executable_checker_rest:response_format_error", + } + else: + return { + "valid": False, + "error": [f"Execution result status code is not 200, got {response.status_code}"], + "error_type": "executable_checker_rest:wrong_status_code", + } + except Exception as e: + return { + "valid": False, + "error": [f"Cannot get status code of the response. 
Error: {str(e)}"], + "error_type": "executable_checker_rest:cannot_get_status_code", + } + + +def ast_checker(func_description, model_output, possible_answer, language, test_category, model_name): + if "parallel" in test_category: + return parallel_function_checker_no_order(func_description, model_output, possible_answer, language, model_name) + + elif "multiple" in test_category: + return multiple_function_checker(func_description, model_output, possible_answer, language, model_name) + + else: + if len(model_output) != 1: + return { + "valid": False, + "error": ["Wrong number of functions."], + "error_type": "simple_function_checker:wrong_count", + } + + return simple_function_checker( + func_description[0], + model_output[0], + possible_answer[0], + language, + model_name, + ) + + +def exec_checker(decoded_result: list, func_description: dict, test_category: str): + if "multiple" in test_category or "parallel" in test_category: + return executable_checker_parallel_no_order( + decoded_result, + func_description["execution_result"], + func_description["execution_result_type"], + ) + + else: + if len(decoded_result) != 1: + return { + "valid": False, + "error": ["Wrong number of functions."], + "error_type": "simple_exec_checker:wrong_count", + } + return executable_checker_simple( + decoded_result[0], + func_description["execution_result"][0], + func_description["execution_result_type"][0], + False, + ) + + +def is_empty_output(decoded_output): + # This function is a patch to the ast decoder for relevance detection + # Sometimes the ast decoder will parse successfully, but the input doens't really have a function call + # [], [{}], and anything that is not in function calling format is considered empty (and thus should be marked as correct) + if not is_function_calling_format_output(decoded_output): + return True + if len(decoded_output) == 0: + return True + if len(decoded_output) == 1 and len(decoded_output[0]) == 0: + return True + + +def is_function_calling_format_output(decoded_output): + # Ensure the output is a list of dictionaries + if type(decoded_output) == list: + for item in decoded_output: + if type(item) != dict: + return False + return True + return False diff --git a/llama_stack/providers/inline/scoring/basic/utils/bfcl/tree_sitter.py b/llama_stack/providers/inline/scoring/basic/utils/bfcl/tree_sitter.py new file mode 100644 index 000000000..ed97ee360 --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/utils/bfcl/tree_sitter.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +""" +Tree-sitter changes its API with unfortunate frequency. Modules that need it should +import it from here so that we can centrally manage things as necessary. +""" + +# These currently work with tree-sitter 0.23.0 +# NOTE: Don't import tree-sitter or any of the language modules in the main module +# because not all environments have them. Import lazily inside functions where needed. + +import importlib +import typing + +if typing.TYPE_CHECKING: + import tree_sitter + + +def get_language(language: str) -> "tree_sitter.Language": + import tree_sitter + + language_module_name = f"tree_sitter_{language}" + try: + language_module = importlib.import_module(language_module_name) + except ModuleNotFoundError as exc: + raise ValueError( + f"Language {language} is not found. 
Please install the tree-sitter-{language} package." + ) from exc + return tree_sitter.Language(language_module.language()) + + +def get_parser(language: str, **kwargs) -> "tree_sitter.Parser": + import tree_sitter + + lang = get_language(language) + return tree_sitter.Parser(lang, **kwargs) diff --git a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py index 2e3bd4d3a..42b538876 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/console_span_processor.py @@ -5,7 +5,7 @@ # the root directory of this source tree. import json -from datetime import datetime +from datetime import datetime, timezone from opentelemetry.sdk.trace import ReadableSpan from opentelemetry.sdk.trace.export import SpanProcessor @@ -34,7 +34,7 @@ class ConsoleSpanProcessor(SpanProcessor): if span.attributes and span.attributes.get("__autotraced__"): return - timestamp = datetime.utcfromtimestamp(span.start_time / 1e9).strftime("%H:%M:%S.%f")[:-3] + timestamp = datetime.fromtimestamp(span.start_time / 1e9, tz=timezone.utc).strftime("%H:%M:%S.%f")[:-3] print( f"{COLORS['dim']}{timestamp}{COLORS['reset']} " @@ -46,7 +46,7 @@ class ConsoleSpanProcessor(SpanProcessor): if span.attributes and span.attributes.get("__autotraced__"): return - timestamp = datetime.utcfromtimestamp(span.end_time / 1e9).strftime("%H:%M:%S.%f")[:-3] + timestamp = datetime.fromtimestamp(span.end_time / 1e9, tz=timezone.utc).strftime("%H:%M:%S.%f")[:-3] span_context = ( f"{COLORS['dim']}{timestamp}{COLORS['reset']} " @@ -74,7 +74,7 @@ class ConsoleSpanProcessor(SpanProcessor): print(f" {COLORS['dim']}{key}: {str_value}{COLORS['reset']}") for event in span.events: - event_time = datetime.utcfromtimestamp(event.timestamp / 1e9).strftime("%H:%M:%S.%f")[:-3] + event_time = datetime.fromtimestamp(event.timestamp / 1e9, tz=timezone.utc).strftime("%H:%M:%S.%f")[:-3] severity = event.attributes.get("severity", "info") message = event.attributes.get("message", event.name) diff --git a/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py b/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py index b39ee7716..5ed586fce 100644 --- a/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py +++ b/llama_stack/providers/inline/telemetry/meta_reference/sqlite_span_processor.py @@ -8,7 +8,7 @@ import json import os import sqlite3 import threading -from datetime import datetime +from datetime import datetime, timezone from opentelemetry.sdk.trace import SpanProcessor from opentelemetry.trace import Span @@ -124,8 +124,8 @@ class SQLiteSpanProcessor(SpanProcessor): trace_id, service_name, (span_id if not parent_span_id else None), - datetime.fromtimestamp(span.start_time / 1e9).isoformat(), - datetime.fromtimestamp(span.end_time / 1e9).isoformat(), + datetime.fromtimestamp(span.start_time / 1e9, timezone.utc).isoformat(), + datetime.fromtimestamp(span.end_time / 1e9, timezone.utc).isoformat(), ), ) @@ -143,8 +143,8 @@ class SQLiteSpanProcessor(SpanProcessor): trace_id, parent_span_id, span.name, - datetime.fromtimestamp(span.start_time / 1e9).isoformat(), - datetime.fromtimestamp(span.end_time / 1e9).isoformat(), + datetime.fromtimestamp(span.start_time / 1e9, timezone.utc).isoformat(), + datetime.fromtimestamp(span.end_time / 1e9, timezone.utc).isoformat(), 
json.dumps(dict(span.attributes)), span.status.status_code.name, span.kind.name, @@ -161,7 +161,7 @@ class SQLiteSpanProcessor(SpanProcessor): ( span_id, event.name, - datetime.fromtimestamp(event.timestamp / 1e9).isoformat(), + datetime.fromtimestamp(event.timestamp / 1e9, timezone.utc).isoformat(), json.dumps(dict(event.attributes)), ), ) diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py index d7b2dbdef..810591c1c 100644 --- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py +++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py @@ -168,7 +168,7 @@ def process_matplotlib_response(response, matplotlib_dump_dir: str): image_paths = [] for i, img in enumerate(images): # create new directory for each day to better organize data: - dump_dname = datetime.today().strftime("%Y-%m-%d") + dump_dname = datetime.today().strftime("%Y-%m-%d") # noqa: DTZ002 - we don't care about timezones here since we are displaying the date dump_dpath = Path(matplotlib_dump_dir, dump_dname) dump_dpath.mkdir(parents=True, exist_ok=True) # save image into a file diff --git a/llama_stack/providers/registry/eval.py b/llama_stack/providers/registry/eval.py index 6901c3741..755d30382 100644 --- a/llama_stack/providers/registry/eval.py +++ b/llama_stack/providers/registry/eval.py @@ -14,7 +14,7 @@ def available_providers() -> List[ProviderSpec]: InlineProviderSpec( api=Api.eval, provider_type="inline::meta-reference", - pip_packages=[], + pip_packages=["tree_sitter"], module="llama_stack.providers.inline.eval.meta_reference", config_class="llama_stack.providers.inline.eval.meta_reference.MetaReferenceEvalConfig", api_dependencies=[ diff --git a/llama_stack/providers/tests/safety/conftest.py b/llama_stack/providers/tests/safety/conftest.py index 3e8fcaeaa..a466b5dde 100644 --- a/llama_stack/providers/tests/safety/conftest.py +++ b/llama_stack/providers/tests/safety/conftest.py @@ -102,3 +102,4 @@ def pytest_generate_tests(metafunc): get_provider_fixture_overrides(metafunc.config, available_fixtures) or DEFAULT_PROVIDER_COMBINATIONS ) metafunc.parametrize("safety_stack", combinations, indirect=True) + diff --git a/llama_stack/providers/utils/common/data_schema_validator.py b/llama_stack/providers/utils/common/data_schema_validator.py index 3d14c4148..eb9d9dd60 100644 --- a/llama_stack/providers/utils/common/data_schema_validator.py +++ b/llama_stack/providers/utils/common/data_schema_validator.py @@ -23,6 +23,10 @@ class ColumnName(Enum): generated_answer = "generated_answer" context = "context" dialog = "dialog" + function = "function" + language = "language" + id = "id" + ground_truth = "ground_truth" VALID_SCHEMAS_FOR_SCORING = [ @@ -37,6 +41,15 @@ VALID_SCHEMAS_FOR_SCORING = [ ColumnName.generated_answer.value: StringType(), ColumnName.context.value: StringType(), }, + { + ColumnName.input_query.value: StringType(), + ColumnName.expected_answer.value: StringType(), + ColumnName.generated_answer.value: StringType(), + ColumnName.function.value: StringType(), + ColumnName.language.value: StringType(), + ColumnName.id.value: StringType(), + ColumnName.ground_truth.value: StringType(), + }, ] VALID_SCHEMAS_FOR_EVAL = [ @@ -50,6 +63,15 @@ VALID_SCHEMAS_FOR_EVAL = [ ColumnName.expected_answer.value: StringType(), ColumnName.completion_input.value: CompletionInputType(), }, + { + ColumnName.input_query.value: StringType(), + 
ColumnName.expected_answer.value: StringType(), + ColumnName.generated_answer.value: StringType(), + ColumnName.function.value: StringType(), + ColumnName.language.value: StringType(), + ColumnName.id.value: StringType(), + ColumnName.ground_truth.value: StringType(), + }, ] diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py index bef229080..607d1a918 100644 --- a/llama_stack/providers/utils/telemetry/tracing.py +++ b/llama_stack/providers/utils/telemetry/tracing.py @@ -11,7 +11,7 @@ import logging import queue import threading import uuid -from datetime import datetime +from datetime import datetime, timezone from functools import wraps from typing import Any, Callable, Dict, List, Optional @@ -86,7 +86,7 @@ class TraceContext: span_id=generate_short_uuid(), trace_id=self.trace_id, name=name, - start_time=datetime.now(), + start_time=datetime.now(timezone.utc), parent_span_id=current_span.span_id if current_span else None, attributes=attributes, ) @@ -203,7 +203,7 @@ class TelemetryHandler(logging.Handler): UnstructuredLogEvent( trace_id=span.trace_id, span_id=span.span_id, - timestamp=datetime.now(), + timestamp=datetime.now(timezone.utc), message=self.format(record), severity=severity(record.levelname), ) diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index 3abdd82a7..333a82f15 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -26,11 +26,15 @@ providers: namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/faiss_store.db safety: - provider_id: nvidia provider_type: remote::nvidia config: guardrails_service_url: ${env.GUARDRAILS_SERVICE_URL:http://localhost:7331} config_id: self-check + - provider_id: llama-guard + provider_type: inline::llama-guard + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -55,6 +59,13 @@ providers: namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs config:
diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py index 2b40797f9..b671c6175 100644 --- a/llama_stack/templates/open-benchmark/open_benchmark.py +++ b/llama_stack/templates/open-benchmark/open_benchmark.py @@ -226,6 +226,22 @@ def get_distribution_template() -> DistributionTemplate: "chat_completion_input": {"type": "string"}, }, ), + DatasetInput( + dataset_id="bfcl", + provider_id="huggingface", + url=URL(uri="https://huggingface.co/datasets/llamastack/bfcl_v3"), + metadata={ + "path": "llamastack/bfcl_v3", + "split": "train", + }, + dataset_schema={ + "function": {"type": "string"}, + "language": {"type": "string"}, + "ground_truth": {"type": "string"}, + "id": {"type": "string"}, + "chat_completion_input": {"type": "string"}, + }, + ), ] default_benchmarks = [ @@ -249,6 +265,11 @@ def get_distribution_template() -> DistributionTemplate: dataset_id="math_500", scoring_functions=["basic::regex_parser_math_response"], ), + BenchmarkInput( + benchmark_id="meta-reference-bfcl", + dataset_id="bfcl", + scoring_functions=["basic::bfcl"], + ), ] return DistributionTemplate( name=name, diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml index 5ef25435b..64b59d300 100644 --- a/llama_stack/templates/open-benchmark/run.yaml +++ b/llama_stack/templates/open-benchmark/run.yaml @@ -216,6 +216,24 @@ datasets: split: test dataset_id: math_500 provider_id: huggingface +- dataset_schema: + function: + type: string + language: + type: string + ground_truth: + type: string + id: + type: string + chat_completion_input: + type: string + url: + uri: https://huggingface.co/datasets/llamastack/bfcl_v3 + metadata: + path: llamastack/bfcl_v3 + split: train + dataset_id: bfcl + provider_id: huggingface scoring_fns: [] benchmarks: - dataset_id: simpleqa @@ -238,6 +256,11 @@ benchmarks: - basic::regex_parser_math_response metadata: {} benchmark_id: meta-reference-math-500 +- dataset_id: bfcl + scoring_functions: + - basic::bfcl + metadata: {} + benchmark_id: meta-reference-bfcl tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search diff --git a/llama_stack/templates/passthrough/__init__.py b/llama_stack/templates/passthrough/__init__.py new file mode 100644 index 000000000..9632c09fb --- /dev/null +++ b/llama_stack/templates/passthrough/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree.
+ +from .passthrough import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/passthrough/build.yaml b/llama_stack/templates/passthrough/build.yaml index 5fed5286e..fb1fb1066 100644 --- a/llama_stack/templates/passthrough/build.yaml +++ b/llama_stack/templates/passthrough/build.yaml @@ -1,9 +1,10 @@ version: '2' distribution_spec: - description: Use for running LLM inference with the endpoint that compatible with Llama Stack API + description: Use Passthrough hosted llama-stack endpoint for LLM inference providers: inference: - remote::passthrough + - inline::sentence-transformers vector_io: - inline::faiss - remote::chromadb @@ -26,6 +27,7 @@ distribution_spec: tool_runtime: - remote::brave-search - remote::tavily-search + - remote::wolfram-alpha - inline::code-interpreter - inline::rag-runtime - remote::model-context-protocol diff --git a/llama_stack/templates/passthrough/doc_template.md b/llama_stack/templates/passthrough/doc_template.md new file mode 100644 index 000000000..f9e88873d --- /dev/null +++ b/llama_stack/templates/passthrough/doc_template.md @@ -0,0 +1,35 @@ +--- +orphan: true +--- +# Passthrough Distribution + +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + +The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. + +{{ providers_table }} + +{% if run_config_env_vars %} +### Environment Variables + +The following environment variables can be configured: + +{% for var, (default_value, description) in run_config_env_vars.items() %} +- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) +{% endfor %} +{% endif %} + +{% if default_models %} +### Models + +The following models are available by default: + +{% for model in default_models %} +- `{{ model.model_id }} {{ model.doc_string }}` +{% endfor %} +{% endif %} diff --git a/llama_stack/templates/passthrough/passthrough.py b/llama_stack/templates/passthrough/passthrough.py new file mode 100644 index 000000000..cc3f55937 --- /dev/null +++ b/llama_stack/templates/passthrough/passthrough.py @@ -0,0 +1,201 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
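+"""Template definition for the passthrough distribution, which forwards inference requests to a remote Llama Stack-compatible endpoint."""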
+ +from pathlib import Path + +from llama_stack.apis.models.models import ModelType +from llama_stack.distribution.datatypes import ( + ModelInput, + Provider, + ShieldInput, + ToolGroupInput, +) +from llama_stack.providers.inline.inference.sentence_transformers import ( + SentenceTransformersInferenceConfig, +) +from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig +from llama_stack.providers.remote.inference.passthrough.config import ( + PassthroughImplConfig, +) +from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry +from llama_stack.templates.template import ( + DistributionTemplate, + RunConfigSettings, +) + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["remote::passthrough", "inline::sentence-transformers"], + "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], + "tool_runtime": [ + "remote::brave-search", + "remote::tavily-search", + "remote::wolfram-alpha", + "inline::code-interpreter", + "inline::rag-runtime", + "remote::model-context-protocol", + ], + } + + name = "passthrough" + + inference_provider = Provider( + provider_id="passthrough", + provider_type="remote::passthrough", + config=PassthroughImplConfig.sample_run_config(), + ) + embedding_provider = Provider( + provider_id="sentence-transformers", + provider_type="inline::sentence-transformers", + config=SentenceTransformersInferenceConfig.sample_run_config(), + ) + vector_io_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), + ) + + default_models = [ + ModelInput( + metadata={}, + model_id="meta-llama/Llama-3.1-8B-Instruct", + provider_id="passthrough", + provider_model_id="llama3.1-8b-instruct", + model_type=ModelType.llm, + ), + ModelInput( + metadata={}, + model_id="meta-llama/Llama-3.2-11B-Vision-Instruct", + provider_id="passthrough", + provider_model_id="llama3.2-11b-vision-instruct", + model_type=ModelType.llm, + ), + ] + + embedding_model = ModelInput( + model_id="all-MiniLM-L6-v2", + provider_id="sentence-transformers", + model_type=ModelType.embedding, + metadata={ + "embedding_dimension": 384, + }, + ) + default_tool_groups = [ + ToolGroupInput( + toolgroup_id="builtin::websearch", + provider_id="tavily-search", + ), + ToolGroupInput( + toolgroup_id="builtin::wolfram_alpha", + provider_id="wolfram-alpha", + ), + ToolGroupInput( + toolgroup_id="builtin::rag", + provider_id="rag-runtime", + ), + ToolGroupInput( + toolgroup_id="builtin::code_interpreter", + provider_id="code-interpreter", + ), + ] + + return DistributionTemplate( + name=name, + distro_type="self_hosted", + description="Use Passthrough hosted llama-stack endpoint for LLM inference", + container_image=None, + template_path=Path(__file__).parent / "doc_template.md", + providers=providers, + available_models_by_provider={ + "passthrough": [ + ProviderModelEntry( + provider_model_id="llama3.1-8b-instruct", + model_type=ModelType.llm, + ), + ProviderModelEntry( + provider_model_id="llama3.2-11b-vision-instruct", + model_type=ModelType.llm, + ), + ], + }, + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + 
"inference": [inference_provider, embedding_provider], + "vector_io": [vector_io_provider], + }, + default_models=default_models + [embedding_model], + default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], + default_tool_groups=default_tool_groups, + ), + "run-with-safety.yaml": RunConfigSettings( + provider_overrides={ + "inference": [ + inference_provider, + embedding_provider, + ], + "vector_io": [vector_io_provider], + "safety": [ + Provider( + provider_id="llama-guard", + provider_type="inline::llama-guard", + config={}, + ), + Provider( + provider_id="llama-guard-vision", + provider_type="inline::llama-guard", + config={}, + ), + Provider( + provider_id="code-scanner", + provider_type="inline::code-scanner", + config={}, + ), + ], + }, + default_models=[ + *default_models, + embedding_model, + ], + default_shields=[ + ShieldInput( + shield_id="meta-llama/Llama-Guard-3-8B", + provider_id="llama-guard", + ), + ShieldInput( + shield_id="meta-llama/Llama-Guard-3-11B-Vision", + provider_id="llama-guard-vision", + ), + ShieldInput( + shield_id="CodeScanner", + provider_id="code-scanner", + ), + ], + default_tool_groups=default_tool_groups, + ), + }, + run_config_env_vars={ + "LLAMA_STACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + "PASSTHROUGH_API_KEY": ( + "", + "Passthrough API Key", + ), + "PASSTHROUGH_URL": ( + "", + "Passthrough URL", + ), + }, + ) diff --git a/llama_stack/templates/passthrough/run-with-safety.yaml b/llama_stack/templates/passthrough/run-with-safety.yaml new file mode 100644 index 000000000..fbfa4afe7 --- /dev/null +++ b/llama_stack/templates/passthrough/run-with-safety.yaml @@ -0,0 +1,154 @@ +version: '2' +image_name: passthrough +apis: +- agents +- datasetio +- eval +- inference +- safety +- scoring +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: passthrough + provider_type: remote::passthrough + config: + url: ${env.PASSTHROUGH_URL} + api_key: ${env.PASSTHROUGH_API_KEY} + - provider_id: sentence-transformers + provider_type: inline::sentence-transformers + config: {} + vector_io: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + - provider_id: llama-guard-vision + provider_type: inline::llama-guard + config: {} + - provider_id: code-scanner + provider_type: inline::code-scanner + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/passthrough/trace_store.db} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/meta_reference_eval.db + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/huggingface_datasetio.db + - 
provider_id: localfs + provider_type: inline::localfs + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/localfs_datasetio.db + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: ${env.OPENAI_API_KEY:} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:} + max_results: 3 + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} + - provider_id: code-interpreter + provider_type: inline::code-interpreter + config: {} + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/registry.db +models: +- metadata: {} + model_id: meta-llama/Llama-3.1-8B-Instruct + provider_id: passthrough + provider_model_id: llama3.1-8b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: passthrough + provider_model_id: llama3.2-11b-vision-instruct + model_type: llm +- metadata: + embedding_dimension: 384 + model_id: all-MiniLM-L6-v2 + provider_id: sentence-transformers + model_type: embedding +shields: +- shield_id: meta-llama/Llama-Guard-3-8B + provider_id: llama-guard +- shield_id: meta-llama/Llama-Guard-3-11B-Vision + provider_id: llama-guard-vision +- shield_id: CodeScanner + provider_id: code-scanner +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha +- toolgroup_id: builtin::rag + provider_id: rag-runtime +- toolgroup_id: builtin::code_interpreter + provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/passthrough/run.yaml b/llama_stack/templates/passthrough/run.yaml index 2548faa5d..6956bc6e3 100644 --- a/llama_stack/templates/passthrough/run.yaml +++ b/llama_stack/templates/passthrough/run.yaml @@ -31,7 +31,8 @@ providers: safety: - provider_id: llama-guard provider_type: inline::llama-guard - config: {} + config: + excluded_categories: [] agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -50,14 +51,26 @@ providers: eval: - provider_id: meta-reference provider_type: inline::meta-reference - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/huggingface_datasetio.db - provider_id: localfs provider_type: inline::localfs - config: {} + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/localfs_datasetio.db scoring: - provider_id: basic provider_type: inline::basic @@ -80,6 +93,10 @@ 
providers: config: api_key: ${env.TAVILY_SEARCH_API_KEY:} max_results: 3 + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} - provider_id: code-interpreter provider_type: inline::code-interpreter config: {} @@ -91,7 +108,7 @@ providers: config: {} metadata_store: type: sqlite - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-llama}/registry.db + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/registry.db models: - metadata: {} model_id: meta-llama/Llama-3.1-8B-Instruct @@ -103,15 +120,22 @@ models: provider_id: passthrough provider_model_id: llama3.2-11b-vision-instruct model_type: llm +- metadata: + embedding_dimension: 384 + model_id: all-MiniLM-L6-v2 + provider_id: sentence-transformers + model_type: embedding shields: - shield_id: meta-llama/Llama-Guard-3-8B vector_dbs: [] datasets: [] scoring_fns: [] -eval_tasks: [] +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search +- toolgroup_id: builtin::wolfram_alpha + provider_id: wolfram-alpha - toolgroup_id: builtin::rag provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter diff --git a/pyproject.toml b/pyproject.toml index 055fa7a55..f96213fc4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -124,14 +124,15 @@ exclude = [ [tool.ruff.lint] select = [ - "B", # flake8-bugbear - "B9", # flake8-bugbear subset - "C", # comprehensions - "E", # pycodestyle - "F", # Pyflakes - "N", # Naming - "W", # Warnings - "I", # isort + "B", # flake8-bugbear + "B9", # flake8-bugbear subset + "C", # comprehensions + "E", # pycodestyle + "F", # Pyflakes + "N", # Naming + "W", # Warnings + "I", # isort + "DTZ", # datetime rules ] ignore = [ # The following ignores are desired by the project maintainers. @@ -145,6 +146,10 @@ ignore = [ "C901", # Complexity of the function is too high ] +# Ignore the following errors for the following files +[tool.ruff.lint.per-file-ignores] +"tests/**/*.py" = ["DTZ"] # Ignore datetime rules for tests + [tool.mypy] mypy_path = ["llama_stack"] packages = ["llama_stack"] @@ -170,6 +175,7 @@ exclude = [ "^llama_stack/apis/inspect/inspect\\.py$", "^llama_stack/apis/models/models\\.py$", "^llama_stack/apis/post_training/post_training\\.py$", + "^llama_stack/apis/providers/providers\\.py$", "^llama_stack/apis/resource\\.py$", "^llama_stack/apis/safety/safety\\.py$", "^llama_stack/apis/scoring/scoring\\.py$", diff --git a/requirements.txt b/requirements.txt index ae8a0af9f..3c382ad84 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ distro==1.9.0 exceptiongroup==1.2.2 ; python_full_version < '3.11' filelock==3.17.0 fire==0.7.0 -fsspec==2025.2.0 +fsspec==2024.12.0 h11==0.14.0 httpcore==1.0.7 httpx==0.28.1 diff --git a/scripts/unit-tests.sh b/scripts/unit-tests.sh new file mode 100755 index 000000000..dbc25e06b --- /dev/null +++ b/scripts/unit-tests.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +PYTHON_VERSION=${PYTHON_VERSION:-3.10} + +command -v uv >/dev/null 2>&1 || { echo >&2 "uv is required but it's not installed. Exiting."; exit 1; } + +uv python find $PYTHON_VERSION +FOUND_PYTHON=$? +if [ $FOUND_PYTHON -ne 0 ]; then + uv python install $PYTHON_VERSION +fi + +uv run --python $PYTHON_VERSION --with-editable . 
--with-editable ".[dev]" --with-editable ".[unit]" pytest -s -v tests/unit/ $@ diff --git a/tests/integration/agents/test_agents.py b/tests/integration/agents/test_agents.py index f6bde8927..61249ad17 100644 --- a/tests/integration/agents/test_agents.py +++ b/tests/integration/agents/test_agents.py @@ -10,8 +10,7 @@ from uuid import uuid4 import pytest from llama_stack_client.lib.agents.agent import Agent from llama_stack_client.lib.agents.event_logger import EventLogger -from llama_stack_client.types.agents.turn_create_params import Document as AgentDocument -from llama_stack_client.types.memory_insert_params import Document +from llama_stack_client.types.agents.turn_create_params import Document from llama_stack_client.types.shared_params.agent_config import AgentConfig, ToolConfig from llama_stack.apis.agents.agents import ( @@ -242,7 +241,7 @@ def test_code_interpreter_for_attachments(llama_stack_client_with_mocked_inferen codex_agent = Agent(llama_stack_client_with_mocked_inference, **agent_config) session_id = codex_agent.create_session(f"test-session-{uuid4()}") - inflation_doc = AgentDocument( + inflation_doc = Document( content="https://raw.githubusercontent.com/meta-llama/llama-stack-apps/main/examples/resources/inflation.csv", mime_type="text/csv", ) diff --git a/tests/integration/datasetio/test_datasetio.py b/tests/integration/datasetio/test_datasetio.py index f112071a6..fc81417e6 100644 --- a/tests/integration/datasetio/test_datasetio.py +++ b/tests/integration/datasetio/test_datasetio.py @@ -9,11 +9,31 @@ import mimetypes import os from pathlib import Path +<<<<<<< HEAD +======= +import pytest + +>>>>>>> upstream/main # How to run this test: # # LLAMA_STACK_CONFIG="template-name" pytest -v tests/integration/datasetio +<<<<<<< HEAD +======= +@pytest.fixture +def dataset_for_test(llama_stack_client): + dataset_id = "test_dataset" + register_dataset(llama_stack_client, dataset_id=dataset_id) + yield + # Teardown - this always runs, even if the test fails + try: + llama_stack_client.datasets.unregister(dataset_id) + except Exception as e: + print(f"Warning: Failed to unregister test_dataset: {e}") + + +>>>>>>> upstream/main def data_url_from_file(file_path: str) -> str: if not os.path.exists(file_path): raise FileNotFoundError(f"File not found: {file_path}") @@ -80,8 +100,12 @@ def test_register_unregister_dataset(llama_stack_client): assert len(response) == 0 +<<<<<<< HEAD def test_get_rows_paginated(llama_stack_client): register_dataset(llama_stack_client) +======= +def test_get_rows_paginated(llama_stack_client, dataset_for_test): +>>>>>>> upstream/main response = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", rows_in_page=3, diff --git a/tests/integration/fixtures/common.py b/tests/integration/fixtures/common.py index e410039e7..1878c9e88 100644 --- a/tests/integration/fixtures/common.py +++ b/tests/integration/fixtures/common.py @@ -52,6 +52,8 @@ def llama_stack_client_with_mocked_inference(llama_stack_client, request): If --record-responses is passed, it will call the real APIs and record the responses. 
""" + # TODO: will rework this to be more stable + return llama_stack_client if not isinstance(llama_stack_client, LlamaStackAsLibraryClient): logging.warning( "llama_stack_client_with_mocked_inference is not supported for this client, returning original client without mocking" diff --git a/tests/integration/fixtures/recorded_responses/chat_completion.json b/tests/integration/fixtures/recorded_responses/chat_completion.json index 8694cc271..37bb28ac2 100644 --- a/tests/integration/fixtures/recorded_responses/chat_completion.json +++ b/tests/integration/fixtures/recorded_responses/chat_completion.json @@ -12758,6 +12758,131 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. \", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"false\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, 
\"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " degrees Fahrenheit.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 139 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 23 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 162 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"false\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -12863,6 +12988,207 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": 
"ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"get_boiling_point\", \"parameters\": {\"liquid_name", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": \"polyjuice\", \"celcius\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"false\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "false", + "liquid_name": "polyjuice" + }, + "call_id": "b0413eb2-f446-4e09-910b-7d8ba4375c87", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 91 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 45 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 136 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -13044,6 +13370,207 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"type\": \"function_call\",\n \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "name\": \"get_boiling_point\",\n \"parameters\": {\n", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"liquid_name\": \"polyjuice\",\n \"celcius", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\": \"true\"\n }\n}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "62095a5a-c53c-4850-9f4f-b3a41699a32b", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 43 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 99 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant Always respond with tool calls no matter what. 
\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Get the boiling point of polyjuice with a tool call.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -13370,6 +13897,131 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": 
\"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100\u00b0C", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": ".", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 85 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 107 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", 
\"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -13600,6 +14252,131 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": 
\"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice is -100", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "\u00b0C.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 87 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 22 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 109 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and 
answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"celcius\": \"true\", \"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point_with_metadata\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point_with_metadata\"}}]}]": { "chunks": [ { @@ -13725,6 +14502,458 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", 
\"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\":", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"get_boiling_point\",", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " \"parameters\": {\"liquid_name\": \"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "polyjuice\", \"celcius\": \"true\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "139fe8b9-7bfc-4dcb-ac0d-da1d97257c6e", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": 
\"get_boiling_point_with_metadata\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_point_with_metadata\", \"parameters\": {\"liquid", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_name\": \"polyjuice\", \"celcius\": \"", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "true\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": 
"ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "celcius": "true", + "liquid_name": "polyjuice" + }, + "call_id": "49ab2b64-cbcb-4e71-b02c-99026116c45e", + "tool_name": "get_boiling_point_with_metadata" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 37 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 47 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Call get_boiling_point and answer What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -14250,7 +15479,7 @@ "data": { "event": { "delta": { - "text": " customer smiled and said \"hello\" to the friendly store", + "text": " customer smiled and said \"hello\" to the", "type": "text" }, "event_type": { @@ -14270,7 +15499,7 
@@ "data": { "event": { "delta": { - "text": " clerk.", + "text": " friendly store clerk.", "type": "text" }, "event_type": { @@ -17388,7 +18617,7 @@ "data": { "event": { "delta": { - "text": " error message indicates that there is an issue with", + "text": " error message indicates that there is an issue with the", "type": "text" }, "event_type": { @@ -17408,7 +18637,7 @@ "data": { "event": { "delta": { - "text": " the import statement. However, the code provided does not contain any", + "text": " import statement. However, the code provided does", "type": "text" }, "event_type": { @@ -17428,7 +18657,7 @@ "data": { "event": { "delta": { - "text": " import statements that would cause this error.\n\nTo provide a more accurate", + "text": " not contain any import statements that would cause this error.\n\nTo provide", "type": "text" }, "event_type": { @@ -17448,7 +18677,7 @@ "data": { "event": { "delta": { - "text": " answer, I would need to know the contents of the CSV file", + "text": " a more accurate answer, I would need to know the contents of the", "type": "text" }, "event_type": { @@ -17468,7 +18697,7 @@ "data": { "event": { "delta": { - "text": " or more information about the error message.\n\nHowever, based on the", + "text": " CSV file or more information about the error message.\n\nHowever, based on", "type": "text" }, "event_type": { @@ -17488,7 +18717,7 @@ "data": { "event": { "delta": { - "text": " code provided, it seems like the code is trying to load a", + "text": " the code provided, it seems like the intention is to load a CSV", "type": "text" }, "event_type": { @@ -17508,7 +18737,7 @@ "data": { "event": { "delta": { - "text": " CSV file and print some basic information about it. If the file", + "text": " file and print some basic information about it. 
If the file is not", "type": "text" }, "event_type": { @@ -17528,7 +18757,7 @@ "data": { "event": { "delta": { - "text": " is not found or there is an issue with the file path,", + "text": " found or there is an issue with the file path, this could cause", "type": "text" }, "event_type": { @@ -17548,7 +18777,7 @@ "data": { "event": { "delta": { - "text": " this could cause an error.\n\nHere is a", + "text": " an error.\n\nHere is an updated version of the code that includes some", "type": "text" }, "event_type": { @@ -17568,7 +18797,7 @@ "data": { "event": { "delta": { - "text": " revised version of the code that includes some error", + "text": " error handling:\n\n```\nimport pandas as pd\n", "type": "text" }, "event_type": { @@ -17588,7 +18817,7 @@ "data": { "event": { "delta": { - "text": " handling:\n\n```\nimport pandas as pd\nimport code_interpreter", + "text": "import code_interpreter\n\ntry:\n #", "type": "text" }, "event_type": { @@ -17608,7 +18837,7 @@ "data": { "event": { "delta": { - "text": "\n\ntry:\n # Load the CSV file", + "text": " Load the CSV file\n df = pd.read_csv(\"/", "type": "text" }, "event_type": { @@ -17628,7 +18857,7 @@ "data": { "event": { "delta": { - "text": "\n df = pd.read_csv(\"/var/folders/cz", + "text": "var/folders/cz/vyh7y1d11", "type": "text" }, "event_type": { @@ -17648,7 +18877,7 @@ "data": { "event": { "delta": { - "text": "/vyh7y1d11xg", + "text": "xg881lsxsshnc5c0000gn/T/tmpmy", "type": "text" }, "event_type": { @@ -17668,7 +18897,7 @@ "data": { "event": { "delta": { - "text": "881lsxsshnc5", + "text": "lybr76/IEQ51fUginflation.csv\")\n\n ", "type": "text" }, "event_type": { @@ -17688,7 +18917,7 @@ "data": { "event": { "delta": { - "text": "c0000gn/T/tmpflpgiagc/", + "text": " # Print the first few rows of the dataframe\n print(df", "type": "text" }, "event_type": { @@ -17708,7 +18937,7 @@ "data": { "event": { "delta": { - "text": "8S20Zj2Oinflation.csv\")\n\n ", + "text": ".head())\n\n # Print the data", "type": "text" }, "event_type": { @@ -17728,7 +18957,7 @@ "data": { "event": { "delta": { - "text": " # Print the first few rows of the dataframe\n print(df.head", + "text": " types of each column\n print(df.dtypes)\n\n #", "type": "text" }, "event_type": { @@ -17748,7 +18977,7 @@ "data": { "event": { "delta": { - "text": "())\n\n # Print the data types of each column\n print", + "text": " Print the summary statistics of the dataframe\n", "type": "text" }, "event_type": { @@ -17768,7 +18997,7 @@ "data": { "event": { "delta": { - "text": "(df.dtypes)\n\n # Print the", + "text": " print(df.describe())\n\nexcept FileNotFoundError:\n print(\"The", "type": "text" }, "event_type": { @@ -17788,7 +19017,7 @@ "data": { "event": { "delta": { - "text": " summary statistics of the dataframe\n ", + "text": " file was not found.\")\nexcept pd.errors.Empty", "type": "text" }, "event_type": { @@ -17808,7 +19037,7 @@ "data": { "event": { "delta": { - "text": " print(df.describe())\n\nexcept FileNotFoundError:\n print(\"The file", + "text": "DataError:\n print(\"The file", "type": "text" }, "event_type": { @@ -17828,7 +19057,7 @@ "data": { "event": { "delta": { - "text": " was not found.\")\nexcept pd.errors.EmptyDataError", + "text": " is empty.\")\nexcept pd.errors.ParserError:\n", "type": "text" }, "event_type": { @@ -17848,7 +19077,7 @@ "data": { "event": { "delta": { - "text": ":\n print(\"The file is empty.\")\nexcept pd.errors.ParserError", + "text": " print(\"An error occurred while parsing the file.\")\n", "type": "text" }, "event_type": { @@ -17868,7 
+19097,7 @@ "data": { "event": { "delta": { - "text": ":\n print(\"An error occurred while parsing the", + "text": "except Exception as e:\n print(\"An error occurred: \",", "type": "text" }, "event_type": { @@ -17888,7 +19117,7 @@ "data": { "event": { "delta": { - "text": " file.\")\nexcept Exception as e:\n print", + "text": " str(e))\n```\n\nThis code will", "type": "text" }, "event_type": { @@ -17908,7 +19137,7 @@ "data": { "event": { "delta": { - "text": "(\"An error occurred: \", str(e))\n``", + "text": " catch specific exceptions that could occur when loading the CSV file and print", "type": "text" }, "event_type": { @@ -17928,47 +19157,7 @@ "data": { "event": { "delta": { - "text": "`\n\nThis code will catch specific exceptions that could occur when loading the", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " CSV file and print a more", - "type": "text" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "text": " informative error message.", + "text": " a more informative error message.", "type": "text" }, "event_type": { @@ -18007,17 +19196,17 @@ { "metric": "prompt_tokens", "unit": null, - "value": 393 + "value": 389 }, { "metric": "completion_tokens", "unit": null, - "value": 331 + "value": 328 }, { "metric": "total_tokens", "unit": null, - "value": 724 + "value": 717 } ] } @@ -18083,7 +19272,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\nimport code_interpreter\n\n", + "tool_call": "import pandas as pd\nimport code_interpreter", "type": "tool_call" }, "event_type": { @@ -18108,7 +19297,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "# Load the CSV file\ndf = pd.read_csv(\"/var/f", + "tool_call": "\n\n# Load the CSV file", "type": "tool_call" }, "event_type": { @@ -18133,7 +19322,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "olders/cz/vyh7y1d11xg881lsx", + "tool_call": "\ndf = pd.read_csv(\"/var/folders/c", "type": "tool_call" }, "event_type": { @@ -18158,7 +19347,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "sshnc5c0000gn/T/tmpfl", + "tool_call": "z/vyh7y1d11xg881lsxsshnc", "type": "tool_call" }, "event_type": { @@ -18183,7 +19372,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "pgiagc/8S20Zj2Oinflation", + "tool_call": "5c0000gn/T/tmpmylybr76/IE", "type": "tool_call" }, "event_type": { @@ -18208,7 +19397,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ".csv\")\n\n# Print the first few rows of the", + "tool_call": "Q51fUginflation.csv\")\n\n# Print the first few", "type": "tool_call" }, "event_type": { @@ -18233,7 +19422,7 @@ "__module__": 
"llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " dataframe\nprint(df.head())\n\n# Print the data types of each", + "tool_call": " rows of the dataframe\nprint(df.head())\n\n# Print the data", "type": "tool_call" }, "event_type": { @@ -18258,7 +19447,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " column\nprint(df.dtypes)\n\n#", + "tool_call": " types of each column\nprint(df.dtypes)\n\n", "type": "tool_call" }, "event_type": { @@ -18283,7 +19472,32 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " Print the summary statistics of the dataframe\nprint(df.describe())", + "tool_call": "# Print the summary statistics of the", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " dataframe\nprint(df.describe())", "type": "tool_call" }, "event_type": { @@ -18310,9 +19524,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/8S20Zj2Oinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpmylybr76/IEQ51fUginflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" }, - "call_id": "e999a578-cbd8-4bb8-bc53-deb2fff1ffce", + "call_id": "c4c54781-a26e-427d-aea8-6d4b9829bbcc", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -18361,7 +19575,7 @@ { "metric": "prompt_tokens", "unit": null, - "value": 215 + "value": 213 }, { "metric": "completion_tokens", @@ -18371,7 +19585,7 @@ { "metric": "total_tokens", "unit": null, - "value": 225 + "value": 223 } ] } @@ -18462,7 +19676,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " CSV file\ndf = pd.read", + "tool_call": " CSV file\ndf = pd.read_csv(\"/var/folders/cz/v", "type": "tool_call" }, "event_type": { @@ -18487,7 +19701,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_csv(\"/var/folders/cz/vyh", + "tool_call": "yh7y1d11xg881lsx", "type": "tool_call" }, "event_type": { @@ -18512,7 +19726,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "7y1d11xg881lsxsshnc5c", + "tool_call": "sshnc5c0000gn/T/tmpmylybr76", "type": "tool_call" }, "event_type": { @@ -18537,7 +19751,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "0000gn/T/tmpflpgiagc/8S", + "tool_call": 
"/IEQ51fUginflation.csv\")\n\n", "type": "tool_call" }, "event_type": { @@ -18562,7 +19776,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "20Zj2Oinflation.csv\")\n\n# Print the first", + "tool_call": "# Print the first few rows of the dataframe", "type": "tool_call" }, "event_type": { @@ -18587,7 +19801,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " few rows of the dataframe\nprint(df.head())\n\n#", + "tool_call": "\nprint(df.head())\n\n# Print the data types of", "type": "tool_call" }, "event_type": { @@ -18612,7 +19826,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " Print the data types of each column\nprint", + "tool_call": " each column\nprint(df.dtypes)\n\n# Print the summary", "type": "tool_call" }, "event_type": { @@ -18637,32 +19851,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "(df.dtypes)\n\n# Print the summary statistics of the dataframe", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "\nprint(df.describe())", + "tool_call": " statistics of the dataframe\nprint(df.describe())", "type": "tool_call" }, "event_type": { @@ -18689,9 +19878,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/8S20Zj2Oinflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" + "code": "import pandas as pd\nimport code_interpreter\n\n# Load the CSV file\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpmylybr76/IEQ51fUginflation.csv\")\n\n# Print the first few rows of the dataframe\nprint(df.head())\n\n# Print the data types of each column\nprint(df.dtypes)\n\n# Print the summary statistics of the dataframe\nprint(df.describe())" }, - "call_id": "ea72d524-2d0f-4220-a898-4c295315235e", + "call_id": "1f1ed34a-bffb-459d-9f64-eb66d13b2aa5", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -21730,7 +22919,7 @@ "data": { "event": { "delta": { - "text": " code will create a line plot of the average yearly inflation over time. The", + "text": " code will create a line plot of the average yearly inflation over", "type": "text" }, "event_type": { @@ -21750,7 +22939,7 @@ "data": { "event": { "delta": { - "text": " x-axis represents the year, and the y-axis represents the average yearly inflation", + "text": " time. The x-axis represents the year, and the y-axis", "type": "text" }, "event_type": { @@ -21770,7 +22959,7 @@ "data": { "event": { "delta": { - "text": ". The plot will show the trend of average yearly inflation over the years", + "text": " represents the average yearly inflation. 
The plot will show the trend of", "type": "text" }, "event_type": { @@ -21790,7 +22979,7 @@ "data": { "event": { "delta": { - "text": ".", + "text": " average yearly inflation over the years.", "type": "text" }, "event_type": { @@ -21829,7 +23018,7 @@ { "metric": "prompt_tokens", "unit": null, - "value": 635 + "value": 631 }, { "metric": "completion_tokens", @@ -21839,7 +23028,7 @@ { "metric": "total_tokens", "unit": null, - "value": 691 + "value": 687 } ] } @@ -21905,7 +23094,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "import pandas as pd\nimport matplotlib.pyplot as", + "tool_call": "import pandas as pd\nimport matplotlib", "type": "tool_call" }, "event_type": { @@ -21930,7 +23119,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " plt\n\n# Load data\ndf = pd.read_csv(\"/var/f", + "tool_call": ".pyplot as plt\n\n# Load data", "type": "tool_call" }, "event_type": { @@ -21955,7 +23144,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "olders/cz/vyh7y1d11xg881lsx", + "tool_call": "\ndf = pd.read_csv(\"/var/folders/c", "type": "tool_call" }, "event_type": { @@ -21980,7 +23169,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "sshnc5c0000gn/T/tmpflpgiagc/", + "tool_call": "z/vyh7y1d11xg881", "type": "tool_call" }, "event_type": { @@ -22005,7 +23194,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "2VkeqrPlinflation.csv\")\n\n# Calculate average yearly inflation", + "tool_call": "lsxsshnc5c0000gn/T/tmpmy", "type": "tool_call" }, "event_type": { @@ -22030,7 +23219,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\ndf['Average'] = df[['Jan', 'Feb', '", + "tool_call": "lybr76/Dhwctgpwinflation.csv\")\n\n#", "type": "tool_call" }, "event_type": { @@ -22055,7 +23244,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "Mar', 'Apr', 'May', 'Jun', 'Jul',", + "tool_call": " Calculate average yearly inflation\ndf['Average'] = df[['", "type": "tool_call" }, "event_type": { @@ -22080,7 +23269,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 'Aug', 'Sep', 'Oct', 'Nov', 'Dec", + "tool_call": "Jan', 'Feb', 'Mar', 'Apr", "type": "tool_call" }, "event_type": { @@ -22105,7 +23294,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize", + "tool_call": "', 'May', 'Jun', 'Jul', '", "type": "tool_call" }, "event_type": { @@ -22130,7 +23319,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "=(10,6))\nplt.plot(df['Year'], df['Average", + "tool_call": "Aug', 'Sep', 'Oct', 'Nov', 'Dec", "type": "tool_call" }, "event_type": { @@ -22155,7 +23344,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly", + "tool_call": "']].mean(axis=1)\n\n# Plot time series\nplt", "type": "tool_call" }, "event_type": { @@ -22180,7 +23369,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " Inflation')\nplt.title('Average Yearly Inflation Over", + "tool_call": ".figure(figsize=(10,6))\n", "type": "tool_call" }, "event_type": { @@ 
-22205,7 +23394,82 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " Time')\nplt.grid(True)\nplt.show()", + "tool_call": "plt.plot(df['Year'], df['Average'])\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "plt.title('Average Yearly Inflation Over Time')\nplt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": ".grid(True)\nplt.show()", "type": "tool_call" }, "event_type": { @@ -22232,9 +23496,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/2VkeqrPlinflation.csv\")\n\n# Calculate average yearly inflation\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(df['Year'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\nplt.title('Average Yearly Inflation Over Time')\nplt.grid(True)\nplt.show()" + "code": "import pandas as pd\nimport matplotlib.pyplot as plt\n\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpmylybr76/Dhwctgpwinflation.csv\")\n\n# Calculate average yearly inflation\ndf['Average'] = df[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']].mean(axis=1)\n\n# Plot time series\nplt.figure(figsize=(10,6))\nplt.plot(df['Year'], df['Average'])\nplt.xlabel('Year')\nplt.ylabel('Average Yearly Inflation')\nplt.title('Average Yearly Inflation Over Time')\nplt.grid(True)\nplt.show()" }, - "call_id": "f82fa3fd-e3be-4cb7-9298-8b4625cf709e", + "call_id": "73dbb112-a028-48fd-8664-a6c408d1f13d", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -22283,7 +23547,7 @@ { "metric": "prompt_tokens", "unit": null, - 
"value": 454 + "value": 452 }, { "metric": "completion_tokens", @@ -22293,7 +23557,7 @@ { "metric": "total_tokens", "unit": null, - "value": 464 + "value": 462 } ] } @@ -24036,7 +25300,7 @@ "data": { "event": { "delta": { - "text": " CSV file contains 10 rows and 13 columns. The columns are", + "text": " CSV file contains 10 rows and 13 columns. The columns", "type": "text" }, "event_type": { @@ -24056,7 +25320,7 @@ "data": { "event": { "delta": { - "text": " named 'Year', 'Jan', 'Feb', 'Mar', '", + "text": " are named 'Year', 'Jan', 'Feb', 'Mar", "type": "text" }, "event_type": { @@ -24076,7 +25340,7 @@ "data": { "event": { "delta": { - "text": "Apr', 'May', 'Jun', 'Jul', 'Aug',", + "text": "', 'Apr', 'May', 'Jun', 'Jul',", "type": "text" }, "event_type": { @@ -24096,7 +25360,7 @@ "data": { "event": { "delta": { - "text": " 'Sep', 'Oct', 'Nov', 'Dec'. The data", + "text": " 'Aug', 'Sep', 'Oct',", "type": "text" }, "event_type": { @@ -24116,7 +25380,7 @@ "data": { "event": { "delta": { - "text": " types of these columns are int64 for 'Year", + "text": " 'Nov', 'Dec'. The", "type": "text" }, "event_type": { @@ -24136,7 +25400,7 @@ "data": { "event": { "delta": { - "text": "' and float64 for the rest.\n\nIt appears that this CSV file", + "text": " data types of these columns are int64 for", "type": "text" }, "event_type": { @@ -24156,7 +25420,7 @@ "data": { "event": { "delta": { - "text": " contains monthly inflation rates for different years. The 'Year' column represents", + "text": " 'Year' and float64 for the rest", "type": "text" }, "event_type": { @@ -24176,7 +25440,7 @@ "data": { "event": { "delta": { - "text": " the year, and the rest of the columns represent the inflation rates", + "text": ".\n\nIt appears that this CSV file contains monthly inflation rates for", "type": "text" }, "event_type": { @@ -24196,7 +25460,7 @@ "data": { "event": { "delta": { - "text": " for each month of the", + "text": " different years. 
The 'Year' column represents the year,", "type": "text" }, "event_type": { @@ -24216,7 +25480,27 @@ "data": { "event": { "delta": { - "text": " year.", + "text": " and the rest of the columns represent the inflation rates for each", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " month of the year.", "type": "text" }, "event_type": { @@ -24255,7 +25539,7 @@ { "metric": "prompt_tokens", "unit": null, - "value": 327 + "value": 325 }, { "metric": "completion_tokens", @@ -24265,7 +25549,7 @@ { "metric": "total_tokens", "unit": null, - "value": 452 + "value": 450 } ] } @@ -24356,7 +25640,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "_csv(\"/var/folders/cz/vyh7", + "tool_call": "_csv(\"/var/folders/cz/vyh7y1", "type": "tool_call" }, "event_type": { @@ -24381,7 +25665,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "y1d11xg881lsxsshnc5c000", + "tool_call": "d11xg881lsxsshnc", "type": "tool_call" }, "event_type": { @@ -24406,7 +25690,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "0gn/T/tmpflpgiagc/2VkeqrPlinflation", + "tool_call": "5c0000gn/T/tmpmyly", "type": "tool_call" }, "event_type": { @@ -24431,7 +25715,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ".csv\")\n# Rows\nprint(\"Number of rows and columns in", + "tool_call": "br76/Dhwctgpwinflation.csv\")\n#", "type": "tool_call" }, "event_type": { @@ -24456,7 +25740,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are", + "tool_call": " Rows\nprint(\"Number of rows and columns in the data:\",", "type": "tool_call" }, "event_type": { @@ -24481,7 +25765,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ":\", len(df.columns))\n# Column names\nprint(\"Columns of the data", + "tool_call": " df.shape)\n# Columns\nprint(\"Columns of the data are", "type": "tool_call" }, "event_type": { @@ -24506,7 +25790,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of", + "tool_call": ":\", len(df.columns))\n# Column names\nprint(\"Columns of", "type": "tool_call" }, "event_type": { @@ -24531,7 +25815,57 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " the columns are:\", df.dtypes)", + "tool_call": " the data are:\", df.columns)\n# Column dt", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": 
"in_progress" + }, + "tool_call": "ypes\nprint(\"Datatype of the columns", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " are:\", df.dtypes)", "type": "tool_call" }, "event_type": { @@ -24558,9 +25892,9 @@ }, "tool_call": { "arguments": { - "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpflpgiagc/2VkeqrPlinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" + "code": "import pandas as pd\n# Load data\ndf = pd.read_csv(\"/var/folders/cz/vyh7y1d11xg881lsxsshnc5c0000gn/T/tmpmylybr76/Dhwctgpwinflation.csv\")\n# Rows\nprint(\"Number of rows and columns in the data:\", df.shape)\n# Columns\nprint(\"Columns of the data are:\", len(df.columns))\n# Column names\nprint(\"Columns of the data are:\", df.columns)\n# Column dtypes\nprint(\"Datatype of the columns are:\", df.dtypes)" }, - "call_id": "b8aab119-7997-428e-81ab-e6aa163f7acc", + "call_id": "f1d86c1d-75bd-43f3-9117-a906e41598f8", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -29279,6 +30613,1666 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c735\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. 
Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ef2c1\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. 
Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:4857b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. 
_glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:ef2c1\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:ef2c1\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ef2c1\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c735\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ef2c1\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:4857b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"knowledge", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "_search\", \"parameters\": {\"query\":", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " \"How to use LoRA in Tor", + "type": "text" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "chtune\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "8414f84a-98b1-41eb-90bd-bce084da79eb", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:8c735\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. 
For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. 
.. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:ef2c1\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:4857b\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. 
-----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Tor", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "chtune based on the documentation you provided.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": 
"ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:9050a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. 
This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:c4e00\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. 
See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:15efa\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:c4e00\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:c4e00\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:c4e00\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:9050a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. 
Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:c4e00\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. 
note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:15efa\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "{\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "type\": \"function\", \"name\": \"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "knowledge_search\", \"parameters\": {\"", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "query\": \"How to use LoRA in", + "type": "text" + }, + "event_type": 
{ + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Torchtune\"}}", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "query": "How to use LoRA in Torchtune" + }, + "call_id": "0784780b-c3dc-4f4a-a37f-e75e83e9be61", + "tool_name": "knowledge_search" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 117 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 40 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 157 + } + ] + } + } + ], + "type": "generator" + }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:9050a\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. 
For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. 
.. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:c4e00\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:15efa\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. 
-----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search for information in a database.\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for. Can be a natural language sentence or keywords.\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"knowledge_search\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'m ready to help you answer questions about Torchtune based on the documentation", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " you provided. 
What's your first question?", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 75 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 35 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 110 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"Torchtune documentation\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:a4c57\\nContent: conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any\\ncustom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset\\nbuilder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify\\n``conversation_column`` and ``conversation_style``. Our data follows the ``\\\"sharegpt\\\"`` format, so\\nwe can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should\\nlook like so:\\n\\n.. code-block:: python\\n\\n from torchtune.datasets import chat_dataset\\n from torchtune.models.llama3 import llama3_tokenizer\\n\\n tokenizer = llama3_tokenizer(\\\"\")\\n ds = chat_dataset(\\n tokenizer=tokenizer,\\n source=\\\"json\\\",\\n data_files=\\\"data/my_data.json\\\",\\n split=\\\"train\\\",\\n conversation_column=\\\"dialogue\\\",\\n conversation_style=\\\"sharegpt\\\",\\n )\\n\\n.. code-block:: yaml\\n\\n # In config\\n tokenizer:\\n _component_: torchtune.models.llama3.llama3_tokenizer\\n path: dataset:\\n _component_: torchtune.datasets.chat_dataset\\n source: json\\n data_files: data/my_data.json\\n split: train\\n conversation_column: dialogue\\n conversation_style: sharegpt\\n\\n.. 
note::\\n You can pass in any keyword argument for `load_dataset `_ into all our\\n Dataset classes and they will honor them. This is useful for common parameters\\n such as specifying the data split with :code:`split` or configuration with\\n :code:`name`\\n\\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. 
This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 5:\\nDocument_id:392a8\\nContent: etune\\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.use_dora=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n use_dora: True\\n\\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\\neven more memory savings!\\n\\n.. code-block:: bash\\n\\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\\\\n model.apply_lora_to_mlp=True \\\\\\n model.lora_attn_modules=[\\\"q_proj\\\",\\\"k_proj\\\",\\\"v_proj\\\"] \\\\\\n model.lora_rank=16 \\\\\\n model.lora_alpha=32 \\\\\\n model.use_dora=True \\\\\\n model.quantize_base=True\\n\\n.. code-block:: yaml\\n\\n model:\\n _component_: torchtune.models.lora_llama3_8b\\n apply_lora_to_mlp: True\\n lora_attn_modules: [\\\"q_proj\\\", \\\"k_proj\\\", \\\"v_proj\\\"]\\n lora_rank: 16\\n lora_alpha: 32\\n use_dora: True\\n quantize_base: True\\n\\n\\n.. note::\\n\\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\\n\\n.. _glossary_distrib:\\n\\n\\n.. TODO\\n\\n.. Distributed\\n.. -----------\\n\\n.. .. _glossary_fsdp:\\n\\n.. Fully Sharded Data Parallel (FSDP)\\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n.. All our ``_distributed`` recipes use `FSDP `.\\n.. .. _glossary_fsdp2:\\n\\n\", \"type\": \"text\"}, {\"text\": \"END of knowledge_search tool results.\\n\", \"type\": \"text\"}], \"role\": \"tool\", \"tool_name\": \"knowledge_search\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"I'm ready to help you answer questions about Torchtune based on the documentation you provided. 
What's your first question?\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": []}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Tell me how to use LoRA\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"query\": \"How to use LoRA in Torchtune\"}, \"call_id\": \"\", \"tool_name\": \"knowledge_search\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": [{\"text\": \"knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n\", \"type\": \"text\"}, {\"text\": \"Result 1:\\nDocument_id:46132\\nContent: .. _lora_finetune_label:\\n\\n============================\\nFine-Tuning Llama2 with LoRA\\n============================\\n\\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\\nIf you already know what LoRA is and want to get straight to running\\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\\n\\n.. grid:: 2\\n\\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\\n\\n * What LoRA is and how it saves memory during finetuning\\n * An overview of LoRA components in torchtune\\n * How to run a LoRA finetune using torchtune\\n * How to experiment with different LoRA configurations\\n\\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\\n\\n * Be familiar with :ref:`torchtune`\\n * Make sure to :ref:`install torchtune`\\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\\n\\nWhat is LoRA?\\n-------------\\n\\n`LoRA `_ is an adapter-based method for\\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\\ntransformer models, in which case it is common to add the low-rank matrices\\nto some of the linear projections in each transformer layer's self-attention.\\n\\n.. note::\\n\\n If you're unfamiliar, check out these references for the `definition of rank `_\\n and discussion of `low-rank approximations `_.\\n\\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\\nyou can expect to see memory savings due to a substantial reduction in the\\nnumber of parameters with gradients. When using an optimizer with momentum,\\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\\n See :ref:`below` for how to do this.\\n\\nLet's inspect each of these models a bit more closely.\\n\\n.. 
code-block:: bash\\n\\n # Print the first layer's self-attention in the usual Llama2 model\\n >>> print(base_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\\n (pos_embeddings): RotaryPositionalEmbeddings()\\n )\\n\\n # Print the same for Llama2 with LoRA weights\\n >>> print(lora_model.layers[0].attn)\\n MultiHeadAttention(\\n (q_proj): LoRALinear(\\n (dropout): Dropout(p=0.0, inplace=False)\\n \\n\", \"type\": \"text\"}, {\"text\": \"Result 3:\\nDocument_id:46132\\nContent: 06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\\n\\n.. code-block:: bash\\n\\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\\n\\n.. note::\\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\\n or by directly modifying the :code:`7B_lora.yaml` file. See our \\\"\\\":ref:`config_tutorial_label`\\\" recipe\\n for more details on how you can easily clone and modify torchtune configs.\\n\\n.. note::\\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\\n and (b) the memory constraints of your hardware.\\n\\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\\n\\n.. code-block:: yaml\\n\\n # Model Arguments\\n model:\\n _component_: lora_llama2_7b\\n lora_attn_modules: ['q_proj', 'v_proj']\\n lora_rank: 8\\n lora_alpha: 16\\n ...\\n\\nWe see that the\\n\", \"type\": \"text\"}, {\"text\": \"Result 4:\\nDocument_id:46132\\nContent: from our Llama2\\nmodel without any wrappers or custom checkpoint conversion logic.\\n\\n.. code-block:: python\\n\\n # Assuming that base_model already has the pretrained Llama2 weights,\\n # this will directly load them into your LoRA model without any conversion necessary.\\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\\n\\n.. note::\\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\\n :func:`validate_missing_and_unexpected_for_lora() `.\\n\\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\\n\\n.. _setting_trainable_params:\\n\\n.. 
code-block:: python\\n\\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\\n\\n # Fetch all params from the model that are associated with LoRA.\\n lora_params = get_adapter_params(lora_model)\\n\\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\\n set_trainable_params(lora_model, lora_params)\\n\\n # Print the total number of parameters\\n total_params = sum([p.numel() for p in lora_model.parameters()])\\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\\n print(\\n f\\\"\\\"\\\"\\n {total_params} total params,\\n {trainable_params}\\\" trainable params,\\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\\n \\\"\\\"\\\"\\n )\\n\\n 6742609920 total params,\\n 4194304 trainable params,\\n 0.06% of all params are trainable.\\n\\n.. note::\\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\\n of in the recipe.\\n\\n\\n.. _lora_recipe_label:\\n\\nLoRA finetuning recipe in torchtune\\n-----------------------------------\\n\\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe \", \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"{\\\"query\\\": \\\"Meta founder\\\", \\\"top_k\\\": [{\\\"title\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/mark-zuckerberg/\\\", \\\"content\\\": \\\"Mark Zuckerberg, Founder, Chairman and Chief Executive Officer | Meta Meta Quest Ray-Ban Meta Meta Horizon Meta AI Meta Verified Meta Pay Meta Horizon Workrooms Meta and you Learn about our community Shop Meta Meta Quest Meta Portal Meta Horizon Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004. In October 2021, Facebook rebranded to Meta to reflect all of its products and services across its family of apps and a focus on developing social experiences for the metaverse \\\\u2014 moving beyond 2D screens toward immersive experiences like augmented and virtual reality to help build the next evolution in social technology. Shop Ray-Ban Meta glassesRay-Ban StoriesPrivacy informationSupported countries \\\\u00a9 2025 Meta\\\", \\\"score\\\": 0.81595254, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta - Leadership & Governance\\\", \\\"url\\\": \\\"https://investor.atmeta.com/leadership-and-governance/\\\", \\\"content\\\": \\\"Mr. Andreessen was a co-founder of Netscape Communications Corporation, a software company, serving in various positions, including Chief Technology Officer and Executive Vice President of Products. Ms. Killefer also served as Assistant Secretary for Management, Chief Financial Officer, and Chief Operating Officer of the U.S. Department of the Treasury from 1997 to 2000 and as a member of the IRS Oversight Board from 2000 to 2005, including as Chair of the IRS Oversight Board from 2002 to 2004. Ms. 
Travis has served as Executive Vice President and Chief Financial Officer of The Estee Lauder Companies Inc., a global manufacturer and marketer of skin care, makeup, fragrance and hair care products, since August 2012.\\\", \\\"score\\\": 0.46759978, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Executives - Meta\\\", \\\"url\\\": \\\"https://about.meta.com/media-gallery/executives/\\\", \\\"content\\\": \\\"Meta leadership: images of senior executives for download to use in articles about the company. ... Mark Zuckerberg, Founder, Chairman and Chief Executive Officer. Nick Clegg, President, Global Affairs. Joel Kaplan, Chief Global Affairs Officer. Susan Li, Chief Financial Officer.\\\", \\\"score\\\": 0.46482924, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Meta Platforms - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Meta_Platforms\\\", \\\"content\\\": \\\"Following a period of intense scrutiny and damaging whistleblower leaks, news started to emerge on October 21, 2021, about Facebook's plan to rebrand the company and change its name.[15][54] In the Q3 2021 Earnings Call on October 25, Mark Zuckerberg discussed the ongoing criticism of the company's social services and the way it operates, and pointed to the pivoting efforts to building the metaverse \\\\u2013 without mentioning the rebranding and the name change.[55] The metaverse vision and the name change from Facebook, Inc. to Meta Platforms was introduced at Facebook Connect on October 28, 2021.[16] Based on Facebook's PR campaign, the name change reflects the company's shifting long term focus of building the metaverse, a digital extension of the physical world by social media, virtual reality and augmented reality features.[16][56]\\\", \\\"score\\\": 0.14999175, \\\"raw_content\\\": null}, {\\\"title\\\": \\\"Mark Zuckerberg - Wikipedia\\\", \\\"url\\\": \\\"https://en.wikipedia.org/wiki/Mark_Zuckerberg\\\", \\\"content\\\": \\\"They began dating in 2003.[175] In September 2010, Chan, who was a medical student at the University of California, San Francisco at the time,[176] moved into his rented house in Palo Alto, California.[177][178] They married on May 19, 2012, in the grounds of his mansion in an event that also celebrated her graduation from medical school.[179][180] Zuckerberg revealed in July 2015 that they were expecting a baby girl and that Chan had previously experienced three miscarriages.[181] Their first daughter was born in December 2015.[182] They announced in a Chinese New Year video that their daughter's Chinese name is Chen Mingyu (Chinese: \\\\u9648\\\\u660e\\\\u5b87).[183] Their second daughter was born in August 2017.[184] Zuckerberg and his wife welcomed their third daughter in March 2023 and announced the news across his social media pages.[185] The couple also have a Puli dog named Beast,[186] who has over two million followers on Facebook.[187] Zuckerberg commissioned the visual artist Daniel Arsham to build a 7-foot-tall sculpture of his wife, which was unveiled in 2024.[188]\\\", \\\"score\\\": 0.036911618, \\\"raw_content\\\": null}]}\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": 
{\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " founder of Meta is Mark Zuckerberg.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 1101 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 18 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 1119 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"Search the web and tell me who the founder of Meta is.\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, 
\"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { @@ -35576,32 +38645,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "brave_search.call(query", - "type": "tool_call" - }, - "event_type": { - "__enum__": "ChatCompletionResponseEventType", - "__module__": "llama_stack.apis.inference.inference", - "value": "progress" - }, - "logprobs": null, - "stop_reason": null - }, - "metrics": null - } - }, - { - "__module__": "llama_stack.apis.inference.inference", - "__pydantic__": "ChatCompletionResponseStreamChunk", - "data": { - "event": { - "delta": { - "parse_status": { - "__enum__": "ToolCallParseStatus", - "__module__": "llama_stack.apis.common.content_types", - "value": "in_progress" - }, - "tool_call": "=\"Meta founder\")", + "tool_call": "brave_search.call(query=\"Meta founder\")", "type": "tool_call" }, "event_type": { @@ -35630,7 +38674,7 @@ "arguments": { "query": "Meta founder" }, - "call_id": "b81c41ae-5eb7-41b7-b466-78eb25a91bb7", + "call_id": "a9a452ac-a1a1-4414-a107-4cdc283f4129", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -36172,6 +39216,191 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": 
\"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point`", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " is not able to find the boiling point", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " of polyjuice as it is a", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " fictional liquid from the Harry Potter series. 
The function is only able", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " to find the boiling point of real liquids.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 56 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 126 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, 
\"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -36357,6 +39586,151 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + 
"data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not able to find the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " boiling point of polyjuice as it is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 38 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": 
\"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { @@ -36482,6 +39856,151 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", 
\"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "The", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " function `get_boiling_point` is not able to find", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the boiling point of polyjuice as it is", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " not a real liquid.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 70 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 38 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 108 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": 
\"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"CompletionMessage\", \"data\": {\"content\": \"\", \"role\": \"assistant\", \"stop_reason\": {\"__enum__\": \"StopReason\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"end_of_turn\"}, \"tool_calls\": [{\"arguments\": {\"liquid_name\": \"polyjuice\"}, \"call_id\": \"\", \"tool_name\": \"get_boiling_point\"}]}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolResponseMessage\", \"data\": {\"call_id\": \"\", \"content\": \"-100\", \"role\": \"tool\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -36627,6 +40146,206 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a 
liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_boiling", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_point\", \"parameters\": {\"liquid_name\": \"polyjuice", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": 
"918c5630-abc9-4500-ac0b-b630e0743561", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": \"get_boiling_point\", \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -36802,6 +40521,206 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, 
\"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name\": \"get_bo", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "iling_point\", \"parameters\": {\"liquid_name\": \"polyjuice", + "type": "tool_call" + }, + 
"event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": "364ad4a8-2e6e-4afb-8c81-1cf98774758a", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"auto\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": 
null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}, {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Search the web for information\", \"parameters\": {\"query\": {\"default\": null, \"description\": \"The query to search for\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": {\"__enum__\": \"BuiltinTool\", \"__module__\": \"llama_stack.models.llama.datatypes\", \"value\": \"brave_search\"}}}]}]": { "chunks": [ { @@ -37002,6 +40921,231 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"none\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" 
+ }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " couldn't find any information on", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " the boiling point of Polyjuice.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Polyjuice is a magical potion in the", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " Harry Potter series that allows the drinker to transform into", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " someone else. It's not a physical substance", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": " with a boiling point. 
If you have any other questions, I", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "'d be happy to help.", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 73 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 103 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"none\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -37207,6 +41351,206 @@ ], "type": "generator" }, + "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": 
\"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"str\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { + "chunks": [ + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "start" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "started" + }, + "tool_call": "", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "{\"type\": \"function\", \"name", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\": \"get_boiling_point\", \"parameters\": {\"liquid", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + 
"__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "_name\": \"polyjuice\"}}", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "succeeded" + }, + "tool_call": { + "arguments": { + "liquid_name": "polyjuice" + }, + "call_id": "b41fafca-4559-4a0a-b49b-f4edf893d08a", + "tool_name": "get_boiling_point" + }, + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "text": "", + "type": "text" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "complete" + }, + "logprobs": null, + "stop_reason": { + "__enum__": "StopReason", + "__module__": "llama_stack.models.llama.datatypes", + "value": "end_of_turn" + } + }, + "metrics": [ + { + "metric": "prompt_tokens", + "unit": null, + "value": 30 + }, + { + "metric": "completion_tokens", + "unit": null, + "value": 10 + }, + { + "metric": "total_tokens", + "unit": null, + "value": 40 + } + ] + } + } + ], + "type": "generator" + }, "[[\"meta-llama/Llama-3.1-8B-Instruct\", [{\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"SystemMessage\", \"data\": {\"content\": \"You are a helpful assistant\", \"role\": \"system\"}}, {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"UserMessage\", \"data\": {\"content\": \"What is the boiling point of polyjuice?\", \"context\": null, \"role\": \"user\"}}]], {\"response_format\": null, \"sampling_params\": {\"__module__\": \"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"SamplingParams\", \"data\": {\"max_tokens\": 0, \"repetition_penalty\": 1.0, \"strategy\": {\"temperature\": 0.0001, \"top_p\": 0.9, \"type\": \"top_p\"}}}, \"stream\": true, \"tool_config\": {\"__module__\": \"llama_stack.apis.inference.inference\", \"__pydantic__\": \"ToolConfig\", \"data\": {\"system_message_behavior\": {\"__enum__\": \"SystemMessageBehavior\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"append\"}, \"tool_choice\": {\"__enum__\": \"ToolChoice\", \"__module__\": \"llama_stack.apis.inference.inference\", \"value\": \"required\"}, \"tool_prompt_format\": null}}, \"tool_prompt_format\": null, \"tools\": [{\"__module__\": 
\"llama_stack.models.llama.datatypes\", \"__pydantic__\": \"ToolDefinition\", \"data\": {\"description\": \"Returns the boiling point of a liquid in Celcius or Fahrenheit\", \"parameters\": {\"celcius\": {\"default\": true, \"description\": \"Whether to return the boiling point in Celcius\", \"param_type\": \"bool\", \"required\": false}, \"liquid_name\": {\"default\": null, \"description\": \"The name of the liquid\", \"param_type\": \"string\", \"required\": true}}, \"tool_name\": \"get_boiling_point\"}}]}]": { "chunks": [ { @@ -37764,7 +42108,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "def is_prime(n):\n if n <= 1:\n return False", + "tool_call": "def is_prime(n):\n if n <= 1:\n", "type": "tool_call" }, "event_type": { @@ -37789,7 +42133,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n if n <= 3:\n ", + "tool_call": " return False\n if n <= 3:\n return", "type": "tool_call" }, "event_type": { @@ -37814,7 +42158,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " return True\n if n % 2 ==", + "tool_call": " True\n if n % 2 == 0 or", "type": "tool_call" }, "event_type": { @@ -37839,7 +42183,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " 0 or n % 3 == 0:\n return False", + "tool_call": " n % 3 == 0", "type": "tool_call" }, "event_type": { @@ -37864,7 +42208,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\n i = 5\n while i * i <= n:\n ", + "tool_call": ":\n return False\n i = 5\n while", "type": "tool_call" }, "event_type": { @@ -37889,7 +42233,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " if n % i == 0 or n % (i + 2", + "tool_call": " i * i <= n:\n if n % i == ", "type": "tool_call" }, "event_type": { @@ -37914,7 +42258,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": ") == 0:\n return False\n i += 6\n ", + "tool_call": "0 or n % (i + 2) == ", "type": "tool_call" }, "event_type": { @@ -37939,7 +42283,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " return True\n\ndef get_nth_prime(n):\n count = 0\n ", + "tool_call": "0:\n return False\n i", "type": "tool_call" }, "event_type": { @@ -37964,7 +42308,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " num = 2\n while True:\n if is_prime(num):\n ", + "tool_call": " += 6\n return True\n\ndef get_nth_prime(n", "type": "tool_call" }, "event_type": { @@ -37989,7 +42333,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " count += 1\n if count == n:\n return num\n", + "tool_call": "):\n count = 0\n", "type": "tool_call" }, "event_type": { @@ -38014,7 +42358,107 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " num += 1\n\nprint(get_nth_prime(100))", + "tool_call": " num = 2\n while", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + 
"delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " True:\n if is_prime(num):\n count += 1", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "\n if count == n:\n return num\n", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": " num += 1\n\nprint(get_nth_prime", + "type": "tool_call" + }, + "event_type": { + "__enum__": "ChatCompletionResponseEventType", + "__module__": "llama_stack.apis.inference.inference", + "value": "progress" + }, + "logprobs": null, + "stop_reason": null + }, + "metrics": null + } + }, + { + "__module__": "llama_stack.apis.inference.inference", + "__pydantic__": "ChatCompletionResponseStreamChunk", + "data": { + "event": { + "delta": { + "parse_status": { + "__enum__": "ToolCallParseStatus", + "__module__": "llama_stack.apis.common.content_types", + "value": "in_progress" + }, + "tool_call": "(100))", "type": "tool_call" }, "event_type": { @@ -38043,7 +42487,7 @@ "arguments": { "code": "def is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n\ndef get_nth_prime(n):\n count = 0\n num = 2\n while True:\n if is_prime(num):\n count += 1\n if count == n:\n return num\n num += 1\n\nprint(get_nth_prime(100))" }, - "call_id": "d8ece88b-7b3e-4f72-9555-5a928c27012c", + "call_id": "a1296d7e-6ca3-4056-b43f-19a9663e8bcb", "tool_name": { "__enum__": "BuiltinTool", "__module__": "llama_stack.models.llama.datatypes", @@ -38548,7 +42992,7 @@ "data": { "event": { "delta": { - "text": "type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "text": "type\": \"function\", \"name\": \"knowledge_search\", \"", "type": "text" }, "event_type": { @@ -38568,7 +43012,7 @@ "data": { "event": { "delta": { - "text": "\": {\"query\": \"Perplexity company founding", + "text": "parameters\": {\"query\": \"Perplexity", "type": "text" }, "event_type": { @@ -38588,7 +43032,7 @@ "data": { "event": { "delta": { - "text": " date\"}}", + "text": " company founding date\"}}", "type": "text" }, "event_type": { @@ -38617,7 +43061,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "7f40db23-2182-4006-9234-4c5b7dac978f", + "call_id": "75b712aa-fdeb-48bb-be40-c7fcd06242b6", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -38738,7 +43182,7 @@ "__module__": 
"llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\", \"parameters", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search\",", "type": "tool_call" }, "event_type": { @@ -38763,7 +43207,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "\": {\"query\": \"Perplexity company founding date\"}}", + "tool_call": " \"parameters\": {\"query\": \"Perplexity company founding date\"}}", "type": "tool_call" }, "event_type": { @@ -38792,7 +43236,7 @@ "arguments": { "query": "Perplexity company founding date" }, - "call_id": "7f65affe-6ecb-4db5-b70f-71e05e28c310", + "call_id": "3d505e8e-fe35-486e-9661-27f67702621d", "tool_name": "knowledge_search" }, "type": "tool_call" @@ -39177,7 +43621,7 @@ "data": { "event": { "delta": { - "text": " NBA was created on August 3,", + "text": " NBA was created on August 3, 1949, with", "type": "text" }, "event_type": { @@ -39197,7 +43641,7 @@ "data": { "event": { "delta": { - "text": " 1949, with the merger of the Basketball Association of", + "text": " the merger of the Basketball Association of America (", "type": "text" }, "event_type": { @@ -39217,7 +43661,7 @@ "data": { "event": { "delta": { - "text": " America (BAA) and the National Basketball League (NBL", + "text": "BAA) and the National Basketball League (NBL", "type": "text" }, "event_type": { @@ -39352,7 +43796,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "{\"type\": \"function\", \"name\": \"", + "tool_call": "{\"type\": \"function\", \"name\": \"knowledge_search", "type": "tool_call" }, "event_type": { @@ -39377,7 +43821,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": "knowledge_search\", \"parameters\": {\"query\": \"when", + "tool_call": "\", \"parameters\": {\"query\": \"when was the n", "type": "tool_call" }, "event_type": { @@ -39402,7 +43846,7 @@ "__module__": "llama_stack.apis.common.content_types", "value": "in_progress" }, - "tool_call": " was the nba created\"}}", + "tool_call": "ba created\"}}", "type": "tool_call" }, "event_type": { @@ -39431,7 +43875,7 @@ "arguments": { "query": "when was the nba created" }, - "call_id": "0f4d0151-e44c-443a-8101-e0ac92c9d45f", + "call_id": "03ce919a-d1b5-4120-896e-433e79910757", "tool_name": "knowledge_search" }, "type": "tool_call" diff --git a/tests/integration/fixtures/recorded_responses/invoke_tool.json b/tests/integration/fixtures/recorded_responses/invoke_tool.json index 8db8ad966..f3a2cfbcb 100644 --- a/tests/integration/fixtures/recorded_responses/invoke_tool.json +++ b/tests/integration/fixtures/recorded_responses/invoke_tool.json @@ -167,23 +167,23 @@ "type": "text" }, { - "text": "Result 1:\nDocument_id:15b86\nContent: .. _lora_finetune_label:\n\n============================\nFine-Tuning Llama2 with LoRA\n============================\n\nThis guide will teach you about `LoRA `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. 
grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. 
code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", + "text": "Result 2:\nDocument_id:c4e00\nContent: LoRA to Llama2 models\n------------------------------\n\nWith torchtune, we can easily apply LoRA to Llama2 with a variety of different configurations.\nLet's take a look at how to construct Llama2 models in torchtune with and without LoRA.\n\n.. code-block:: python\n\n from torchtune.models.llama2 import llama2_7b, lora_llama2_7b\n\n # Build Llama2 without any LoRA layers\n base_model = llama2_7b()\n\n # The default settings for lora_llama2_7b will match those for llama2_7b\n # We just need to define which layers we want LoRA applied to.\n # Within each self-attention, we can choose from [\"q_proj\", \"k_proj\", \"v_proj\", and \"output_proj\"].\n # We can also set apply_lora_to_mlp=True or apply_lora_to_output=True to apply LoRA to other linear\n # layers outside of the self-attention.\n lora_model = lora_llama2_7b(lora_attn_modules=[\"q_proj\", \"v_proj\"])\n\n.. note::\n\n Calling :func:`lora_llama_2_7b ` alone will not handle the definition of which parameters are trainable.\n See :ref:`below` for how to do this.\n\nLet's inspect each of these models a bit more closely.\n\n.. code-block:: bash\n\n # Print the first layer's self-attention in the usual Llama2 model\n >>> print(base_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (output_proj): Linear(in_features=4096, out_features=4096, bias=False)\n (pos_embeddings): RotaryPositionalEmbeddings()\n )\n\n # Print the same for Llama2 with LoRA weights\n >>> print(lora_model.layers[0].attn)\n MultiHeadAttention(\n (q_proj): LoRALinear(\n (dropout): Dropout(p=0.0, inplace=False)\n \n", "type": "text" }, { - "text": "Result 3:\nDocument_id:15b86\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. 
This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 3:\nDocument_id:c4e00\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:15b86\nContent: from our Llama2\nmodel without any wrappers or custom checkpoint conversion logic.\n\n.. code-block:: python\n\n # Assuming that base_model already has the pretrained Llama2 weights,\n # this will directly load them into your LoRA model without any conversion necessary.\n lora_model.load_state_dict(base_model.state_dict(), strict=False)\n\n.. note::\n Whenever loading weights with :code:`strict=False`, you should verify that any missing or extra keys in\n the loaded :code:`state_dict` are as expected. torchtune's LoRA recipes do this by default via\n :func:`validate_missing_and_unexpected_for_lora() `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. 
_setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `.\n\nOnce we've loaded the base model weights, we also want to set only LoRA parameters to trainable.\n\n.. _setting_trainable_params:\n\n.. code-block:: python\n\n from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n\n # Fetch all params from the model that are associated with LoRA.\n lora_params = get_adapter_params(lora_model)\n\n # Set requires_grad=True on lora_params, and requires_grad=False on all others.\n set_trainable_params(lora_model, lora_params)\n\n # Print the total number of parameters\n total_params = sum([p.numel() for p in lora_model.parameters()])\n trainable_params = sum([p.numel() for p in lora_model.parameters() if p.requires_grad])\n print(\n f\"\"\"\n {total_params} total params,\n {trainable_params}\" trainable params,\n {(100.0 * trainable_params / total_params):.2f}% of all params are trainable.\n \"\"\"\n )\n\n 6742609920 total params,\n 4194304 trainable params,\n 0.06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_ into all our\n Dataset classes and they will honor them. This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_ into all our\n Dataset classes and they will honor them. 
This is useful for common parameters\n such as specifying the data split with :code:`split` or configuration with\n :code:`name`\n\nIf you needed to add a prompt template, you would simply pass it into the tokenizer.\nSince we're fine-tuning Llama3, the tokenizer will handle all formatting for\nus and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`,\nuse a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format\nall messages according to their `recommendations `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `_, a parameter-efficient finetuning technique,\nand show you how you can use torchtune to finetune a Llama2 model with LoRA.\nIf you already know what LoRA is and want to get straight to running\nyour own LoRA finetune in torchtune, you can jump to :ref:`LoRA finetuning recipe in torchtune`.\n\n.. grid:: 2\n\n .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn\n\n * What LoRA is and how it saves memory during finetuning\n * An overview of LoRA components in torchtune\n * How to run a LoRA finetune using torchtune\n * How to experiment with different LoRA configurations\n\n .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites\n\n * Be familiar with :ref:`torchtune`\n * Make sure to :ref:`install torchtune`\n * Make sure you have downloaded the :ref:`Llama2-7B model weights`\n\nWhat is LoRA?\n-------------\n\n`LoRA `_ is an adapter-based method for\nparameter-efficient finetuning that adds trainable low-rank decomposition matrices to different layers of a neural network,\nthen freezes the network's remaining parameters. LoRA is most commonly applied to\ntransformer models, in which case it is common to add the low-rank matrices\nto some of the linear projections in each transformer layer's self-attention.\n\n.. 
note::\n\n If you're unfamiliar, check out these references for the `definition of rank `_\n and discussion of `low-rank approximations `_.\n\nBy finetuning with LoRA (as opposed to finetuning all model parameters),\nyou can expect to see memory savings due to a substantial reduction in the\nnumber of parameters with gradients. When using an optimizer with momentum,\nlike `AdamW `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 3:\nDocument_id:15efa\nContent: ` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { - "text": "Result 4:\nDocument_id:15b86\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", + "text": "Result 4:\nDocument_id:c4e00\nContent: 06% of all params are trainable.\n\n.. note::\n If you are directly using the LoRA recipe (as detailed :ref:`here`), you need only pass the\n relevant checkpoint path. Loading model weights and setting trainable parameters will be taken care\n of in the recipe.\n\n\n.. _lora_recipe_label:\n\nLoRA finetuning recipe in torchtune\n-----------------------------------\n\nFinally, we can put it all together and finetune a model using torchtune's `LoRA recipe `_.\nMake sure that you have first downloaded the Llama2 weights and tokenizer by following :ref:`these instructions`.\nYou can then run the following command to perform a LoRA finetune of Llama2-7B with two GPUs (each having VRAM of at least 16GB):\n\n.. 
code-block:: bash\n\n tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora\n\n.. note::\n Make sure to point to the location of your Llama2 weights and tokenizer. This can be done\n either by adding :code:`checkpointer.checkpoint_files=[my_model_checkpoint_path] tokenizer_checkpoint=my_tokenizer_checkpoint_path`\n or by directly modifying the :code:`7B_lora.yaml` file. See our \"\":ref:`config_tutorial_label`\" recipe\n for more details on how you can easily clone and modify torchtune configs.\n\n.. note::\n You can modify the value of :code:`nproc_per_node` depending on (a) the number of GPUs you have available,\n and (b) the memory constraints of your hardware.\n\nThe preceding command will run a LoRA finetune with torchtune's factory settings, but we may want to experiment a bit.\nLet's take a closer look at some of the :code:`lora_finetune_distributed` config.\n\n.. code-block:: yaml\n\n # Model Arguments\n model:\n _component_: lora_llama2_7b\n lora_attn_modules: ['q_proj', 'v_proj']\n lora_rank: 8\n lora_alpha: 16\n ...\n\nWe see that the\n", "type": "text" }, { - "text": "Result 5:\nDocument_id:83901\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", + "text": "Result 5:\nDocument_id:15efa\nContent: etune\n:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:\n\n.. code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.use_dora=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n use_dora: True\n\nSince DoRA extends LoRA, the parameters for :ref:`customizing LoRA ` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize=True`` to reap\neven more memory savings!\n\n.. 
code-block:: bash\n\n tune run lora_finetune_single_device --config llama3/8B_lora_single_device \\\n model.apply_lora_to_mlp=True \\\n model.lora_attn_modules=[\"q_proj\",\"k_proj\",\"v_proj\"] \\\n model.lora_rank=16 \\\n model.lora_alpha=32 \\\n model.use_dora=True \\\n model.quantize_base=True\n\n.. code-block:: yaml\n\n model:\n _component_: torchtune.models.lora_llama3_8b\n apply_lora_to_mlp: True\n lora_attn_modules: [\"q_proj\", \"k_proj\", \"v_proj\"]\n lora_rank: 16\n lora_alpha: 32\n use_dora: True\n quantize_base: True\n\n\n.. note::\n\n Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap\n out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.\n\n.. _glossary_distrib:\n\n\n.. TODO\n\n.. Distributed\n.. -----------\n\n.. .. _glossary_fsdp:\n\n.. Fully Sharded Data Parallel (FSDP)\n.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n.. All our ``_distributed`` recipes use `FSDP `.\n.. .. _glossary_fsdp2:\n\n", "type": "text" }, { @@ -428,11 +428,11 @@ "error_message": null, "metadata": { "document_ids": [ - "bbddbe62-508d-4c8d-9455-3b60bc2825a5", - "15b8638f-b1b6-4f58-adfa-eb6644c47de3", - "83901b53-33d4-4f5e-8145-b94c783e9f61", - "15b8638f-b1b6-4f58-adfa-eb6644c47de3", - "83901b53-33d4-4f5e-8145-b94c783e9f61" + "9050ae1c-eba1-4846-b550-2db1957fee7d", + "c4e00391-aeb8-4d32-ac41-ae3242f38a19", + "15efa3d7-f804-4d31-ab05-a5524d82b96a", + "c4e00391-aeb8-4d32-ac41-ae3242f38a19", + "15efa3d7-f804-4d31-ab05-a5524d82b96a" ] } } diff --git a/tests/integration/inference/test_vision_inference.py b/tests/integration/inference/test_vision_inference.py index 984e563d7..9f6fb0478 100644 --- a/tests/integration/inference/test_vision_inference.py +++ b/tests/integration/inference/test_vision_inference.py @@ -36,7 +36,7 @@ def test_image_chat_completion_non_streaming(client_with_models, vision_model_id "type": "image", "image": { "url": { - "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/api/inference/dog.png" + "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png" }, }, }, @@ -65,7 +65,7 @@ def test_image_chat_completion_streaming(client_with_models, vision_model_id): "type": "image", "image": { "url": { - "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/api/inference/dog.png" + "uri": "https://raw.githubusercontent.com/meta-llama/llama-stack/main/tests/integration/inference/dog.png" }, }, }, diff --git a/tests/integration/providers/__init__.py b/tests/integration/providers/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/tests/integration/providers/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/tests/integration/providers/test_providers.py b/tests/integration/providers/test_providers.py new file mode 100644 index 000000000..174d01b5c --- /dev/null +++ b/tests/integration/providers/test_providers.py @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +import pytest +from llama_stack_client import LlamaStackClient + +from llama_stack import LlamaStackAsLibraryClient + + +class TestProviders: + @pytest.mark.asyncio + def test_list(self, llama_stack_client: LlamaStackAsLibraryClient | LlamaStackClient): + provider_list = llama_stack_client.providers.list() + assert provider_list is not None diff --git a/tests/integration/scoring/test_scoring.py b/tests/integration/scoring/test_scoring.py index 2fcdf54e2..970a96f40 100644 --- a/tests/integration/scoring/test_scoring.py +++ b/tests/integration/scoring/test_scoring.py @@ -10,6 +10,19 @@ import pytest from ..datasetio.test_datasetio import register_dataset +@pytest.fixture +def rag_dataset_for_test(llama_stack_client): + dataset_id = "test_dataset" + register_dataset(llama_stack_client, for_rag=True, dataset_id=dataset_id) + yield # This is where the test function will run + + # Teardown - this always runs, even if the test fails + try: + llama_stack_client.datasets.unregister(dataset_id) + except Exception as e: + print(f"Warning: Failed to unregister test_dataset: {e}") + + @pytest.fixture def sample_judge_prompt_template(): return "Output a number response in the following format: Score: , where is the number between 0 and 9." @@ -79,9 +92,7 @@ def test_scoring_functions_register( # TODO: add unregister api for scoring functions -def test_scoring_score(llama_stack_client): - register_dataset(llama_stack_client, for_rag=True) - +def test_scoring_score(llama_stack_client, rag_dataset_for_test): # scoring individual rows rows = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", @@ -115,9 +126,9 @@ def test_scoring_score(llama_stack_client): assert len(response.results[x].score_rows) == 5 -def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge_prompt_template, judge_model_id): - register_dataset(llama_stack_client, for_rag=True) - +def test_scoring_score_with_params_llm_as_judge( + llama_stack_client, sample_judge_prompt_template, judge_model_id, rag_dataset_for_test +): # scoring individual rows rows = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", @@ -167,9 +178,8 @@ def test_scoring_score_with_params_llm_as_judge(llama_stack_client, sample_judge ], ) def test_scoring_score_with_aggregation_functions( - llama_stack_client, sample_judge_prompt_template, judge_model_id, provider_id + llama_stack_client, sample_judge_prompt_template, judge_model_id, provider_id, rag_dataset_for_test ): - register_dataset(llama_stack_client, for_rag=True) rows = llama_stack_client.datasetio.get_rows_paginated( dataset_id="test_dataset", rows_in_page=3, diff --git a/uv.lock b/uv.lock index 9ec3680f8..73b32326d 100644 --- a/uv.lock +++ b/uv.lock @@ -701,6 +701,7 @@ sdist = { url = "https://files.pythonhosted.org/packages/6b/b6/82c7e601d6d3c3278 [[package]] name = "frozenlist" version = "1.5.0" +<<<<<<< HEAD source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/8f/ed/0f4cec13a93c02c47ec32d81d11c0c1efbadf4a471e3f3ce7cad366cbbd3/frozenlist-1.5.0.tar.gz", hash = "sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817", size = 39930 } wheels = [ @@ -770,10 +771,86 @@ wheels = [ [[package]] name = "fsspec" version = "2025.2.0" +======= +>>>>>>> upstream/main source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b5/79/68612ed99700e6413de42895aa725463e821a6b3be75c87fcce1b4af4c70/fsspec-2025.2.0.tar.gz", hash = 
"sha256:1c24b16eaa0a1798afa0337aa0db9b256718ab2a89c425371f5628d22c3b6afd", size = 292283 } +sdist = { url = "https://files.pythonhosted.org/packages/8f/ed/0f4cec13a93c02c47ec32d81d11c0c1efbadf4a471e3f3ce7cad366cbbd3/frozenlist-1.5.0.tar.gz", hash = "sha256:81d5af29e61b9c8348e876d442253723928dce6433e0e76cd925cd83f1b4b817", size = 39930 } wheels = [ - { url = "https://files.pythonhosted.org/packages/e2/94/758680531a00d06e471ef649e4ec2ed6bf185356a7f9fbfbb7368a40bd49/fsspec-2025.2.0-py3-none-any.whl", hash = "sha256:9de2ad9ce1f85e1931858535bc882543171d197001a0a5eb2ddc04f1781ab95b", size = 184484 }, + { url = "https://files.pythonhosted.org/packages/54/79/29d44c4af36b2b240725dce566b20f63f9b36ef267aaaa64ee7466f4f2f8/frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a", size = 94451 }, + { url = "https://files.pythonhosted.org/packages/47/47/0c999aeace6ead8a44441b4f4173e2261b18219e4ad1fe9a479871ca02fc/frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb", size = 54301 }, + { url = "https://files.pythonhosted.org/packages/8d/60/107a38c1e54176d12e06e9d4b5d755b677d71d1219217cee063911b1384f/frozenlist-1.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:15538c0cbf0e4fa11d1e3a71f823524b0c46299aed6e10ebb4c2089abd8c3bec", size = 52213 }, + { url = "https://files.pythonhosted.org/packages/17/62/594a6829ac5679c25755362a9dc93486a8a45241394564309641425d3ff6/frozenlist-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e79225373c317ff1e35f210dd5f1344ff31066ba8067c307ab60254cd3a78ad5", size = 240946 }, + { url = "https://files.pythonhosted.org/packages/7e/75/6c8419d8f92c80dd0ee3f63bdde2702ce6398b0ac8410ff459f9b6f2f9cb/frozenlist-1.5.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9272fa73ca71266702c4c3e2d4a28553ea03418e591e377a03b8e3659d94fa76", size = 264608 }, + { url = "https://files.pythonhosted.org/packages/88/3e/82a6f0b84bc6fb7e0be240e52863c6d4ab6098cd62e4f5b972cd31e002e8/frozenlist-1.5.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:498524025a5b8ba81695761d78c8dd7382ac0b052f34e66939c42df860b8ff17", size = 261361 }, + { url = "https://files.pythonhosted.org/packages/fd/85/14e5f9ccac1b64ff2f10c927b3ffdf88772aea875882406f9ba0cec8ad84/frozenlist-1.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:92b5278ed9d50fe610185ecd23c55d8b307d75ca18e94c0e7de328089ac5dcba", size = 231649 }, + { url = "https://files.pythonhosted.org/packages/ee/59/928322800306f6529d1852323014ee9008551e9bb027cc38d276cbc0b0e7/frozenlist-1.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f3c8c1dacd037df16e85227bac13cca58c30da836c6f936ba1df0c05d046d8d", size = 241853 }, + { url = "https://files.pythonhosted.org/packages/7d/bd/e01fa4f146a6f6c18c5d34cab8abdc4013774a26c4ff851128cd1bd3008e/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f2ac49a9bedb996086057b75bf93538240538c6d9b38e57c82d51f75a73409d2", size = 243652 }, + { url = "https://files.pythonhosted.org/packages/a5/bd/e4771fd18a8ec6757033f0fa903e447aecc3fbba54e3630397b61596acf0/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e66cc454f97053b79c2ab09c17fbe3c825ea6b4de20baf1be28919460dd7877f", size = 241734 }, + { url = 
"https://files.pythonhosted.org/packages/21/13/c83821fa5544af4f60c5d3a65d054af3213c26b14d3f5f48e43e5fb48556/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:5a3ba5f9a0dfed20337d3e966dc359784c9f96503674c2faf015f7fe8e96798c", size = 260959 }, + { url = "https://files.pythonhosted.org/packages/71/f3/1f91c9a9bf7ed0e8edcf52698d23f3c211d8d00291a53c9f115ceb977ab1/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:6321899477db90bdeb9299ac3627a6a53c7399c8cd58d25da094007402b039ab", size = 262706 }, + { url = "https://files.pythonhosted.org/packages/4c/22/4a256fdf5d9bcb3ae32622c796ee5ff9451b3a13a68cfe3f68e2c95588ce/frozenlist-1.5.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:76e4753701248476e6286f2ef492af900ea67d9706a0155335a40ea21bf3b2f5", size = 250401 }, + { url = "https://files.pythonhosted.org/packages/af/89/c48ebe1f7991bd2be6d5f4ed202d94960c01b3017a03d6954dd5fa9ea1e8/frozenlist-1.5.0-cp310-cp310-win32.whl", hash = "sha256:977701c081c0241d0955c9586ffdd9ce44f7a7795df39b9151cd9a6fd0ce4cfb", size = 45498 }, + { url = "https://files.pythonhosted.org/packages/28/2f/cc27d5f43e023d21fe5c19538e08894db3d7e081cbf582ad5ed366c24446/frozenlist-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:189f03b53e64144f90990d29a27ec4f7997d91ed3d01b51fa39d2dbe77540fd4", size = 51622 }, + { url = "https://files.pythonhosted.org/packages/79/43/0bed28bf5eb1c9e4301003b74453b8e7aa85fb293b31dde352aac528dafc/frozenlist-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:fd74520371c3c4175142d02a976aee0b4cb4a7cc912a60586ffd8d5929979b30", size = 94987 }, + { url = "https://files.pythonhosted.org/packages/bb/bf/b74e38f09a246e8abbe1e90eb65787ed745ccab6eaa58b9c9308e052323d/frozenlist-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2f3f7a0fbc219fb4455264cae4d9f01ad41ae6ee8524500f381de64ffaa077d5", size = 54584 }, + { url = "https://files.pythonhosted.org/packages/2c/31/ab01375682f14f7613a1ade30149f684c84f9b8823a4391ed950c8285656/frozenlist-1.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f47c9c9028f55a04ac254346e92977bf0f166c483c74b4232bee19a6697e4778", size = 52499 }, + { url = "https://files.pythonhosted.org/packages/98/a8/d0ac0b9276e1404f58fec3ab6e90a4f76b778a49373ccaf6a563f100dfbc/frozenlist-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0996c66760924da6e88922756d99b47512a71cfd45215f3570bf1e0b694c206a", size = 276357 }, + { url = "https://files.pythonhosted.org/packages/ad/c9/c7761084fa822f07dac38ac29f841d4587570dd211e2262544aa0b791d21/frozenlist-1.5.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a2fe128eb4edeabe11896cb6af88fca5346059f6c8d807e3b910069f39157869", size = 287516 }, + { url = "https://files.pythonhosted.org/packages/a1/ff/cd7479e703c39df7bdab431798cef89dc75010d8aa0ca2514c5b9321db27/frozenlist-1.5.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1a8ea951bbb6cacd492e3948b8da8c502a3f814f5d20935aae74b5df2b19cf3d", size = 283131 }, + { url = "https://files.pythonhosted.org/packages/59/a0/370941beb47d237eca4fbf27e4e91389fd68699e6f4b0ebcc95da463835b/frozenlist-1.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de537c11e4aa01d37db0d403b57bd6f0546e71a82347a97c6a9f0dcc532b3a45", size = 261320 }, + { url = 
"https://files.pythonhosted.org/packages/b8/5f/c10123e8d64867bc9b4f2f510a32042a306ff5fcd7e2e09e5ae5100ee333/frozenlist-1.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c2623347b933fcb9095841f1cc5d4ff0b278addd743e0e966cb3d460278840d", size = 274877 }, + { url = "https://files.pythonhosted.org/packages/fa/79/38c505601ae29d4348f21706c5d89755ceded02a745016ba2f58bd5f1ea6/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cee6798eaf8b1416ef6909b06f7dc04b60755206bddc599f52232606e18179d3", size = 269592 }, + { url = "https://files.pythonhosted.org/packages/19/e2/39f3a53191b8204ba9f0bb574b926b73dd2efba2a2b9d2d730517e8f7622/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f5f9da7f5dbc00a604fe74aa02ae7c98bcede8a3b8b9666f9f86fc13993bc71a", size = 265934 }, + { url = "https://files.pythonhosted.org/packages/d5/c9/3075eb7f7f3a91f1a6b00284af4de0a65a9ae47084930916f5528144c9dd/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:90646abbc7a5d5c7c19461d2e3eeb76eb0b204919e6ece342feb6032c9325ae9", size = 283859 }, + { url = "https://files.pythonhosted.org/packages/05/f5/549f44d314c29408b962fa2b0e69a1a67c59379fb143b92a0a065ffd1f0f/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:bdac3c7d9b705d253b2ce370fde941836a5f8b3c5c2b8fd70940a3ea3af7f4f2", size = 287560 }, + { url = "https://files.pythonhosted.org/packages/9d/f8/cb09b3c24a3eac02c4c07a9558e11e9e244fb02bf62c85ac2106d1eb0c0b/frozenlist-1.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:03d33c2ddbc1816237a67f66336616416e2bbb6beb306e5f890f2eb22b959cdf", size = 277150 }, + { url = "https://files.pythonhosted.org/packages/37/48/38c2db3f54d1501e692d6fe058f45b6ad1b358d82cd19436efab80cfc965/frozenlist-1.5.0-cp311-cp311-win32.whl", hash = "sha256:237f6b23ee0f44066219dae14c70ae38a63f0440ce6750f868ee08775073f942", size = 45244 }, + { url = "https://files.pythonhosted.org/packages/ca/8c/2ddffeb8b60a4bce3b196c32fcc30d8830d4615e7b492ec2071da801b8ad/frozenlist-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:0cc974cc93d32c42e7b0f6cf242a6bd941c57c61b618e78b6c0a96cb72788c1d", size = 51634 }, + { url = "https://files.pythonhosted.org/packages/79/73/fa6d1a96ab7fd6e6d1c3500700963eab46813847f01ef0ccbaa726181dd5/frozenlist-1.5.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:31115ba75889723431aa9a4e77d5f398f5cf976eea3bdf61749731f62d4a4a21", size = 94026 }, + { url = "https://files.pythonhosted.org/packages/ab/04/ea8bf62c8868b8eada363f20ff1b647cf2e93377a7b284d36062d21d81d1/frozenlist-1.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7437601c4d89d070eac8323f121fcf25f88674627505334654fd027b091db09d", size = 54150 }, + { url = "https://files.pythonhosted.org/packages/d0/9a/8e479b482a6f2070b26bda572c5e6889bb3ba48977e81beea35b5ae13ece/frozenlist-1.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7948140d9f8ece1745be806f2bfdf390127cf1a763b925c4a805c603df5e697e", size = 51927 }, + { url = "https://files.pythonhosted.org/packages/e3/12/2aad87deb08a4e7ccfb33600871bbe8f0e08cb6d8224371387f3303654d7/frozenlist-1.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feeb64bc9bcc6b45c6311c9e9b99406660a9c05ca8a5b30d14a78555088b0b3a", size = 282647 }, + { url = "https://files.pythonhosted.org/packages/77/f2/07f06b05d8a427ea0060a9cef6e63405ea9e0d761846b95ef3fb3be57111/frozenlist-1.5.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:683173d371daad49cffb8309779e886e59c2f369430ad28fe715f66d08d4ab1a", size = 289052 }, + { url = "https://files.pythonhosted.org/packages/bd/9f/8bf45a2f1cd4aa401acd271b077989c9267ae8463e7c8b1eb0d3f561b65e/frozenlist-1.5.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7d57d8f702221405a9d9b40f9da8ac2e4a1a8b5285aac6100f3393675f0a85ee", size = 291719 }, + { url = "https://files.pythonhosted.org/packages/41/d1/1f20fd05a6c42d3868709b7604c9f15538a29e4f734c694c6bcfc3d3b935/frozenlist-1.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30c72000fbcc35b129cb09956836c7d7abf78ab5416595e4857d1cae8d6251a6", size = 267433 }, + { url = "https://files.pythonhosted.org/packages/af/f2/64b73a9bb86f5a89fb55450e97cd5c1f84a862d4ff90d9fd1a73ab0f64a5/frozenlist-1.5.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000a77d6034fbad9b6bb880f7ec073027908f1b40254b5d6f26210d2dab1240e", size = 283591 }, + { url = "https://files.pythonhosted.org/packages/29/e2/ffbb1fae55a791fd6c2938dd9ea779509c977435ba3940b9f2e8dc9d5316/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5d7f5a50342475962eb18b740f3beecc685a15b52c91f7d975257e13e029eca9", size = 273249 }, + { url = "https://files.pythonhosted.org/packages/2e/6e/008136a30798bb63618a114b9321b5971172a5abddff44a100c7edc5ad4f/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:87f724d055eb4785d9be84e9ebf0f24e392ddfad00b3fe036e43f489fafc9039", size = 271075 }, + { url = "https://files.pythonhosted.org/packages/ae/f0/4e71e54a026b06724cec9b6c54f0b13a4e9e298cc8db0f82ec70e151f5ce/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:6e9080bb2fb195a046e5177f10d9d82b8a204c0736a97a153c2466127de87784", size = 285398 }, + { url = "https://files.pythonhosted.org/packages/4d/36/70ec246851478b1c0b59f11ef8ade9c482ff447c1363c2bd5fad45098b12/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9b93d7aaa36c966fa42efcaf716e6b3900438632a626fb09c049f6a2f09fc631", size = 294445 }, + { url = "https://files.pythonhosted.org/packages/37/e0/47f87544055b3349b633a03c4d94b405956cf2437f4ab46d0928b74b7526/frozenlist-1.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:52ef692a4bc60a6dd57f507429636c2af8b6046db8b31b18dac02cbc8f507f7f", size = 280569 }, + { url = "https://files.pythonhosted.org/packages/f9/7c/490133c160fb6b84ed374c266f42800e33b50c3bbab1652764e6e1fc498a/frozenlist-1.5.0-cp312-cp312-win32.whl", hash = "sha256:29d94c256679247b33a3dc96cce0f93cbc69c23bf75ff715919332fdbb6a32b8", size = 44721 }, + { url = "https://files.pythonhosted.org/packages/b1/56/4e45136ffc6bdbfa68c29ca56ef53783ef4c2fd395f7cbf99a2624aa9aaa/frozenlist-1.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:8969190d709e7c48ea386db202d708eb94bdb29207a1f269bab1196ce0dcca1f", size = 51329 }, + { url = "https://files.pythonhosted.org/packages/da/3b/915f0bca8a7ea04483622e84a9bd90033bab54bdf485479556c74fd5eaf5/frozenlist-1.5.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:7a1a048f9215c90973402e26c01d1cff8a209e1f1b53f72b95c13db61b00f953", size = 91538 }, + { url = "https://files.pythonhosted.org/packages/c7/d1/a7c98aad7e44afe5306a2b068434a5830f1470675f0e715abb86eb15f15b/frozenlist-1.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dd47a5181ce5fcb463b5d9e17ecfdb02b678cca31280639255ce9d0e5aa67af0", size = 52849 }, + { url = 
"https://files.pythonhosted.org/packages/3a/c8/76f23bf9ab15d5f760eb48701909645f686f9c64fbb8982674c241fbef14/frozenlist-1.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1431d60b36d15cda188ea222033eec8e0eab488f39a272461f2e6d9e1a8e63c2", size = 50583 }, + { url = "https://files.pythonhosted.org/packages/1f/22/462a3dd093d11df623179d7754a3b3269de3b42de2808cddef50ee0f4f48/frozenlist-1.5.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6482a5851f5d72767fbd0e507e80737f9c8646ae7fd303def99bfe813f76cf7f", size = 265636 }, + { url = "https://files.pythonhosted.org/packages/80/cf/e075e407fc2ae7328155a1cd7e22f932773c8073c1fc78016607d19cc3e5/frozenlist-1.5.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44c49271a937625619e862baacbd037a7ef86dd1ee215afc298a417ff3270608", size = 270214 }, + { url = "https://files.pythonhosted.org/packages/a1/58/0642d061d5de779f39c50cbb00df49682832923f3d2ebfb0fedf02d05f7f/frozenlist-1.5.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:12f78f98c2f1c2429d42e6a485f433722b0061d5c0b0139efa64f396efb5886b", size = 273905 }, + { url = "https://files.pythonhosted.org/packages/ab/66/3fe0f5f8f2add5b4ab7aa4e199f767fd3b55da26e3ca4ce2cc36698e50c4/frozenlist-1.5.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce3aa154c452d2467487765e3adc730a8c153af77ad84096bc19ce19a2400840", size = 250542 }, + { url = "https://files.pythonhosted.org/packages/f6/b8/260791bde9198c87a465224e0e2bb62c4e716f5d198fc3a1dacc4895dbd1/frozenlist-1.5.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b7dc0c4338e6b8b091e8faf0db3168a37101943e687f373dce00959583f7439", size = 267026 }, + { url = "https://files.pythonhosted.org/packages/2e/a4/3d24f88c527f08f8d44ade24eaee83b2627793fa62fa07cbb7ff7a2f7d42/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:45e0896250900b5aa25180f9aec243e84e92ac84bd4a74d9ad4138ef3f5c97de", size = 257690 }, + { url = "https://files.pythonhosted.org/packages/de/9a/d311d660420b2beeff3459b6626f2ab4fb236d07afbdac034a4371fe696e/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:561eb1c9579d495fddb6da8959fd2a1fca2c6d060d4113f5844b433fc02f2641", size = 253893 }, + { url = "https://files.pythonhosted.org/packages/c6/23/e491aadc25b56eabd0f18c53bb19f3cdc6de30b2129ee0bc39cd387cd560/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:df6e2f325bfee1f49f81aaac97d2aa757c7646534a06f8f577ce184afe2f0a9e", size = 267006 }, + { url = "https://files.pythonhosted.org/packages/08/c4/ab918ce636a35fb974d13d666dcbe03969592aeca6c3ab3835acff01f79c/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:140228863501b44b809fb39ec56b5d4071f4d0aa6d216c19cbb08b8c5a7eadb9", size = 276157 }, + { url = "https://files.pythonhosted.org/packages/c0/29/3b7a0bbbbe5a34833ba26f686aabfe982924adbdcafdc294a7a129c31688/frozenlist-1.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7707a25d6a77f5d27ea7dc7d1fc608aa0a478193823f88511ef5e6b8a48f9d03", size = 264642 }, + { url = "https://files.pythonhosted.org/packages/ab/42/0595b3dbffc2e82d7fe658c12d5a5bafcd7516c6bf2d1d1feb5387caa9c1/frozenlist-1.5.0-cp313-cp313-win32.whl", hash = "sha256:31a9ac2b38ab9b5a8933b693db4939764ad3f299fcaa931a3e605bc3460e693c", size = 44914 }, + { url = 
"https://files.pythonhosted.org/packages/17/c4/b7db1206a3fea44bf3b838ca61deb6f74424a8a5db1dd53ecb21da669be6/frozenlist-1.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:11aabdd62b8b9c4b84081a3c246506d1cddd2dd93ff0ad53ede5defec7886b28", size = 51167 }, + { url = "https://files.pythonhosted.org/packages/c6/c8/a5be5b7550c10858fcf9b0ea054baccab474da77d37f1e828ce043a3a5d4/frozenlist-1.5.0-py3-none-any.whl", hash = "sha256:d994863bba198a4a518b467bb971c56e1db3f180a25c6cf7bb1949c267f748c3", size = 11901 }, +] + +[[package]] +name = "fsspec" +version = "2024.12.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/11/de70dee31455c546fbc88301971ec03c328f3d1138cfba14263f651e9551/fsspec-2024.12.0.tar.gz", hash = "sha256:670700c977ed2fb51e0d9f9253177ed20cbde4a3e5c0283cc5385b5870c8533f", size = 291600 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/86/5486b0188d08aa643e127774a99bac51ffa6cf343e3deb0583956dca5b22/fsspec-2024.12.0-py3-none-any.whl", hash = "sha256:b520aed47ad9804237ff878b504267a3b0b441e97508bd6d2d8774e3db85cee2", size = 183862 }, +] + +[package.optional-dependencies] +http = [ + { name = "aiohttp" }, ] [package.optional-dependencies]