Merge branch 'main' into chroma

2025-12-03 18:00:36 +00:00 · 2025-09-19 22:53:03 +09:00 · 2025-09-19 22:53:03 +09:00 · c71bcd5479
commit c71bcd5479
parent aaea9fed12 4c2fcb6b51
124 changed files with 25574 additions and 2425 deletions
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@ -2,10 +2,10 @@ blank_issues_enabled: false

 contact_links:
  - name: Have you read the docs?
-    url: https://llama-stack.readthedocs.io/en/latest/index.html
+    url: https://llamastack.github.io/latest/index.html
    about: Much help can be found in the docs
  - name: Start a discussion
-    url: https://github.com/meta-llama/llama-stack/discussions/new
+    url: https://github.com/llamastack/llama-stack/discussions/new/
    about: Start a discussion on a topic
  - name: Chat on Discord
    url: https://discord.gg/llama-stack
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -47,11 +47,21 @@ jobs:
        run: npm ci
        working-directory: llama_stack/ui

-      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
+      - name: Run pre-commit
+        id: precommit
+        uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
+        continue-on-error: true
        env:
          SKIP: no-commit-to-branch
          RUFF_OUTPUT_FORMAT: github

+      - name: Check pre-commit results
+        if: steps.precommit.outcome == 'failure'
+        run: |
+          echo "::error::Pre-commit hooks failed. Please run 'pre-commit run --all-files' locally and commit the fixes."
+          echo "::warning::Some pre-commit hooks failed. Check the output above for details."
+          exit 1
+
      - name: Debug
        run: |
          echo "github.ref: ${{ github.ref }}"
@ -79,17 +89,23 @@ jobs:
            echo "No changes to commit"
          fi

-      - name: Verify if there are any diff files after pre-commit
+      - name: Verify no uncommitted changes
        if: github.actor != 'dependabot[bot]'
        run: |
-          git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
+          if ! git diff --exit-code; then
+            echo "::error::There are uncommitted changes after pre-commit. Please run 'pre-commit run --all-files' locally and commit the fixes."
+            echo "::warning::Files with changes:"
+            git diff --name-status
+            exit 1
+          fi

      - name: Verify if there are any new files after pre-commit
        if: github.actor != 'dependabot[bot]'
        run: |
          unstaged_files=$(git ls-files --others --exclude-standard)
          if [ -n "$unstaged_files" ]; then
-            echo "There are uncommitted new files, run pre-commit locally and commit again"
+            echo "::error::There are new untracked files after pre-commit. Please run 'pre-commit run --all-files' locally and commit the fixes."
+            echo "::warning::New files:"
            echo "$unstaged_files"
            exit 1
          fi
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -187,7 +187,7 @@ Note that the provider "description" field will be used to generate the provider

 ### Building the Documentation

-If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.
+If you are making changes to the documentation at [https://llamastack.github.io/latest/](https://llamastack.github.io/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.

 ```bash
 # This rebuilds the documentation pages.
--- a/README.md
+++ b/README.md
@ -7,7 +7,7 @@
 [![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
 [![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)

-[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
+[**Quick Start**](https://llamastack.github.io/latest/getting_started/index.html) | [**Documentation**](https://llamastack.github.io/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)


 ### ✨🎉 Llama 4 Support  🎉✨
@ -109,7 +109,7 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on

 ### API Providers
 Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.
-Please checkout for [full list](https://llama-stack.readthedocs.io/en/latest/providers/index.html)
+Please check out the [full list](https://llamastack.github.io/latest/providers/index.html).

 | API Provider Builder | Environments | Agents | Inference | VectorIO | Safety | Telemetry | Post Training | Eval | DatasetIO |
 |:--------------------:|:------------:|:------:|:---------:|:--------:|:------:|:---------:|:-------------:|:----:|:--------:|
@ -140,7 +140,7 @@ Please checkout for [full list](https://llama-stack.readthedocs.io/en/latest/pro
 |     NVIDIA NEMO      | Hosted | | ✅ | ✅ | | | ✅ | ✅ | ✅ |
 |        NVIDIA        | Hosted | | | | | | ✅ | ✅ | ✅ |

-> **Note**: Additional providers are available through external packages. See [External Providers](https://llama-stack.readthedocs.io/en/latest/providers/external.html) documentation.
+> **Note**: Additional providers are available through external packages. See [External Providers](https://llamastack.github.io/latest/providers/external/index.html) documentation.

 ### Distributions

@ -149,24 +149,24 @@ Here are some of the distributions we support:

 |               **Distribution**                |                                                                    **Llama Stack Docker**                                                                     |                                                 Start This Distribution                                                  |
 |:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|
-|                Starter Distribution                 |           [llamastack/distribution-starter](https://hub.docker.com/repository/docker/llamastack/distribution-starter/general)           |      [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/starter.html)      |
-|                Meta Reference                 |           [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general)           |      [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html)      |
+|                Starter Distribution                 |           [llamastack/distribution-starter](https://hub.docker.com/repository/docker/llamastack/distribution-starter/general)           |      [Guide](https://llamastack.github.io/latest/distributions/self_hosted_distro/starter.html)      |
+|                Meta Reference                 |           [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general)           |      [Guide](https://llamastack.github.io/latest/distributions/self_hosted_distro/meta-reference-gpu.html)      |
 |                   PostgreSQL                  |                [llamastack/distribution-postgres-demo](https://hub.docker.com/repository/docker/llamastack/distribution-postgres-demo/general)                |                  |

 ### Documentation

-Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest/index.html) page for more details.
+Please check out our [Documentation](https://llamastack.github.io/latest/index.html) page for more details.

 * CLI references
-    * [llama (server-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/index.html): Guide for using the `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
-    * [llama (client-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_stack_client_cli_reference.html): Guide for using the `llama-stack-client` CLI, which allows you to query information about the distribution.
+    * [llama (server-side) CLI Reference](https://llamastack.github.io/latest/references/llama_cli_reference/index.html): Guide for using the `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
+    * [llama (client-side) CLI Reference](https://llamastack.github.io/latest/references/llama_stack_client_cli_reference.html): Guide for using the `llama-stack-client` CLI, which allows you to query information about the distribution.
 * Getting Started
-    * [Quick guide to start a Llama Stack server](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).
+    * [Quick guide to start a Llama Stack server](https://llamastack.github.io/latest/getting_started/index.html).
    * [Jupyter notebook](./docs/getting_started.ipynb) to walk-through how to use simple text and vision inference llama_stack_client APIs
    * The complete Llama Stack lesson [Colab notebook](https://colab.research.google.com/drive/1dtVmxotBsI4cGZQNsJRYPrLiDeT0Wnwt) of the new [Llama 3.2 course on Deeplearning.ai](https://learn.deeplearning.ai/courses/introducing-multimodal-llama-3-2/lesson/8/llama-stack).
    * A [Zero-to-Hero Guide](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) that guide you through all the key components of llama stack with code samples.
 * [Contributing](CONTRIBUTING.md)
-    * [Adding a new API Provider](https://llama-stack.readthedocs.io/en/latest/contributing/new_api_provider.html) to walk-through how to add a new API provider.
+    * [Adding a new API Provider](https://llamastack.github.io/latest/contributing/new_api_provider.html) to walk-through how to add a new API provider.

 ### Llama Stack Client SDKs

--- a/benchmarking/k8s-benchmark/apply.sh
+++ b/benchmarking/k8s-benchmark/apply.sh
@ -17,11 +17,8 @@ export POSTGRES_PASSWORD=llamastack
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

-export MOCK_INFERENCE_MODEL=mock-inference
-
-export MOCK_INFERENCE_URL=openai-mock-service:8080
-
 export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL
+export LLAMA_STACK_WORKERS=4

 set -euo pipefail
 set -x
--- a/benchmarking/k8s-benchmark/stack-configmap.yaml
+++ b/benchmarking/k8s-benchmark/stack-configmap.yaml
@ -23,6 +23,14 @ data:
      - provider_id: sentence-transformers
        provider_type: inline::sentence-transformers
        config: {}
+      files:
+      - provider_id: meta-reference-files
+        provider_type: inline::localfs
+        config:
+          storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+          metadata_store:
+            type: sqlite
+            db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
      vector_io:
      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
        provider_type: remote::chromadb
--- a/benchmarking/k8s-benchmark/stack-k8s.yaml.template
+++ b/benchmarking/k8s-benchmark/stack-k8s.yaml.template
@ -52,9 +52,20 @@ spec:
          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
        - name: VLLM_TLS_VERIFY
          value: "false"
-        command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
+        - name: LLAMA_STACK_LOGGING
+          value: "all=WARNING"
+        - name: LLAMA_STACK_CONFIG
+          value: "/etc/config/stack_run_config.yaml"
+        - name: LLAMA_STACK_WORKERS
+          value: "${LLAMA_STACK_WORKERS}"
+        command: ["uvicorn", "llama_stack.core.server.server:create_app", "--host", "0.0.0.0", "--port", "8323", "--workers", "${LLAMA_STACK_WORKERS}", "--factory"]
        ports:
          - containerPort: 8323
+        resources:
+          requests:
+            cpu: "${LLAMA_STACK_WORKERS}"
+          limits:
+            cpu: "${LLAMA_STACK_WORKERS}"
        volumeMounts:
          - name: llama-storage
            mountPath: /root/.llama
--- a/docs/README.md
+++ b/docs/README.md
@ -1,6 +1,6 @@
 # Llama Stack Documentation

-Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [ReadTheDocs page](https://llama-stack.readthedocs.io/en/latest/index.html).
+Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [GitHub Pages site](https://llamastack.github.io/latest/getting_started/index.html).

 ## Render locally

--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
@ -1380,6 +1380,40 @@
                        }
                    }
                ]
+            },
+            "delete": {
+                "responses": {
+                    "200": {
+                        "description": "OK"
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "Benchmarks"
+                ],
+                "description": "Unregister a benchmark.",
+                "parameters": [
+                    {
+                        "name": "benchmark_id",
+                        "in": "path",
+                        "description": "The ID of the benchmark to unregister.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
            }
        },
        "/v1/openai/v1/chat/completions/{completion_id}": {
@ -1620,6 +1654,40 @@
                        }
                    }
                ]
+            },
+            "delete": {
+                "responses": {
+                    "200": {
+                        "description": "OK"
+                    },
+                    "400": {
+                        "$ref": "#/components/responses/BadRequest400"
+                    },
+                    "429": {
+                        "$ref": "#/components/responses/TooManyRequests429"
+                    },
+                    "500": {
+                        "$ref": "#/components/responses/InternalServerError500"
+                    },
+                    "default": {
+                        "$ref": "#/components/responses/DefaultError"
+                    }
+                },
+                "tags": [
+                    "ScoringFunctions"
+                ],
+                "description": "Unregister a scoring function.",
+                "parameters": [
+                    {
+                        "name": "scoring_fn_id",
+                        "in": "path",
+                        "description": "The ID of the scoring function to unregister.",
+                        "required": true,
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                ]
            }
        },
        "/v1/shields/{identifier}": {
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
@ -954,6 +954,30 @@ paths:
          required: true
          schema:
            type: string
+    delete:
+      responses:
+        '200':
+          description: OK
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Benchmarks
+      description: Unregister a benchmark.
+      parameters:
+        - name: benchmark_id
+          in: path
+          description: The ID of the benchmark to unregister.
+          required: true
+          schema:
+            type: string
  /v1/openai/v1/chat/completions/{completion_id}:
    get:
      responses:
@ -1119,6 +1143,31 @@ paths:
          required: true
          schema:
            type: string
+    delete:
+      responses:
+        '200':
+          description: OK
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - ScoringFunctions
+      description: Unregister a scoring function.
+      parameters:
+        - name: scoring_fn_id
+          in: path
+          description: >-
+            The ID of the scoring function to unregister.
+          required: true
+          schema:
+            type: string
  /v1/shields/{identifier}:
    get:
      responses:
--- a/docs/getting_started.ipynb
+++ b/docs/getting_started.ipynb
@ -11,11 +11,11 @@
        "\n",
        "# Llama Stack - Building AI Applications\n",
        "\n",
-        "<img src=\"https://llama-stack.readthedocs.io/en/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
+        "<img src=\"https://llamastack.github.io/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
        "\n",
        "[Llama Stack](https://github.com/meta-llama/llama-stack) defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Service Providers providing their implementations.\n",
        "\n",
-        "Read more about the project here: https://llama-stack.readthedocs.io/en/latest/index.html\n",
+        "Read more about the project here: https://llamastack.github.io/latest/getting_started/index.html\n",
        "\n",
        "In this guide, we will showcase how you can build LLM-powered agentic applications using Llama Stack.\n",
        "\n",
@ -75,7 +75,7 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 1,
+      "execution_count": null,
      "id": "J2kGed0R5PSf",
      "metadata": {
        "colab": {
@ -242,7 +242,7 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 4,
+      "execution_count": null,
      "id": "E1UFuJC570Tk",
      "metadata": {
        "colab": {
@ -1177,7 +1177,7 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 13,
+      "execution_count": null,
      "id": "WS8Gu5b0APHs",
      "metadata": {
        "colab": {
@ -1249,7 +1249,7 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 14,
+      "execution_count": null,
      "id": "GvLWltzZCNkg",
      "metadata": {
        "colab": {
@ -2154,7 +2154,7 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 21,
+      "execution_count": null,
      "id": "vttLbj_YO01f",
      "metadata": {
        "colab": {
@ -2283,7 +2283,7 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 22,
+      "execution_count": null,
      "id": "4iCO59kP20Zs",
      "metadata": {
        "colab": {
@ -2846,7 +2846,7 @@
    },
    {
      "cell_type": "code",
-      "execution_count": 29,
+      "execution_count": null,
      "id": "44e05e16",
      "metadata": {},
      "outputs": [
@ -2880,8 +2880,7 @@
        "!curl -O https://raw.githubusercontent.com/meta-llama/llama-models/refs/heads/main/Llama_Repo.jpeg\n",
        "\n",
        "from IPython.display import Image\n",
-        "Image(\"Llama_Repo.jpeg\", width=256, height=256)\n",
-        "\n"
+        "Image(\"Llama_Repo.jpeg\", width=256, height=256)\n"
      ]
    },
    {
--- a/docs/getting_started_llama4.ipynb
+++ b/docs/getting_started_llama4.ipynb
@ -11,11 +11,11 @@
        "\n",
        "# Getting Started with Llama 4 in Llama Stack\n",
        "\n",
-        "<img src=\"https://llama-stack.readthedocs.io/en/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
+        "<img src=\"https://llamastack.github.io/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
        "\n",
        "[Llama Stack](https://github.com/meta-llama/llama-stack) defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Service Providers providing their implementations.\n",
        "\n",
-        "Read more about the project here: https://llama-stack.readthedocs.io/en/latest/index.html\n",
+        "Read more about the project here: https://llamastack.github.io/latest/index.html\n",
        "\n",
        "In this guide, we will showcase how you can get started with using Llama 4 in Llama Stack.\n",
        "\n",
--- a/docs/getting_started_llama_api.ipynb
+++ b/docs/getting_started_llama_api.ipynb
@ -11,11 +11,11 @@
        "\n",
        "# Getting Started with Llama 4 in Llama Stack\n",
        "\n",
-          "<img src=\"https://llama-stack.readthedocs.io/en/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
+        "<img src=\"https://llamastack.github.io/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
        "\n",
        "[Llama Stack](https://github.com/meta-llama/llama-stack) defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Service Providers providing their implementations.\n",
        "\n",
-          "Read more about the project here: https://llama-stack.readthedocs.io/en/latest/index.html\n",
+        "Read more about the project here: https://llamastack.github.io/latest/\n",
        "\n",
        "In this guide, we will showcase how you can get started with using Llama 4 in Llama Stack.\n",
        "\n",
@ -332,7 +332,7 @@
    },
    {
      "cell_type": "code",
-        "execution_count": 3,
+      "execution_count": null,
      "id": "E1UFuJC570Tk",
      "metadata": {
        "colab": {
--- a/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
+++ b/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
@ -14,7 +14,7 @@
        "We will also showcase how to leverage existing Llama stack [inference APIs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/apis/inference/inference.py) (ollama as provider) to get the new model's output and the [eval APIs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/apis/eval/eval.py) to help you better measure the new model performance. We hope the flywheel of post-training -> eval -> inference can greatly empower agentic apps development.\n",
        "\n",
        "\n",
-        "- Read more about Llama Stack: https://llama-stack.readthedocs.io/en/latest/introduction/index.html\n",
+        "- Read more about Llama Stack: https://llamastack.github.io/latest/index.html\n",
        "- Read more about post training APIs definition: https://github.com/meta-llama/llama-stack/blob/main/llama_stack/apis/post_training/post_training.py\n",
        "\n",
        "\n",
@ -3632,7 +3632,7 @@
      },
      "source": [
        "#### 1.2. Kick-off eval job\n",
-        "- More details on Llama-stack eval: https://llama-stack.readthedocs.io/en/latest/benchmark_evaluations/index.html\n",
+        "- More details on Llama-stack eval: https://llamastack.github.io/latest/references/evals_reference/index.html\n",
        "  - Define an EvalCandidate\n",
        "  - Run evaluate on datasets (we choose brainstrust's answer-similarity as scoring function with OpenAI's model as judge model)\n",
        "\n",
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
@ -12,7 +12,7 @@
        "\n",
        "This notebook will walk you through the main sets of APIs we offer with Llama Stack for supporting running benchmark evaluations of your with working examples to explore the possibilities that Llama Stack opens up for you.\n",
        "\n",
-        "Read more about Llama Stack: https://llama-stack.readthedocs.io/en/latest/index.html"
+        "Read more about Llama Stack: https://llamastack.github.io/latest/index.html"
      ]
    },
    {
--- a/docs/notebooks/nvidia/tool_calling/2_finetuning_and_inference.ipynb
+++ b/docs/notebooks/nvidia/tool_calling/2_finetuning_and_inference.ipynb
@ -373,7 +373,7 @@
    "    metadata={\n",
    "        \"format\": \"json\",\n",
    "        \"description\": \"Tool calling xLAM dataset in OpenAI ChatCompletions format\",\n",
-    "        \"provider\": \"nvidia\"\n",
+    "        \"provider_id\": \"nvidia\"\n",
    "    }\n",
    ")\n",
    "print(response)"
--- a/docs/quick_start.ipynb
+++ b/docs/quick_start.ipynb
@ -11,7 +11,7 @@
        "\n",
        "# Llama Stack - Building AI Applications\n",
        "\n",
-        "<img src=\"https://llama-stack.readthedocs.io/en/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
+        "<img src=\"https://llamastack.github.io/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
        "\n",
        "Get started with Llama Stack in minutes!\n",
        "\n",
@ -150,7 +150,7 @@
        "def run_llama_stack_server_background():\n",
        "    log_file = open(\"llama_stack_server.log\", \"w\")\n",
        "    process = subprocess.Popen(\n",
-        "        f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter --image-type venv",
+        "        f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter --image-type venv\",\n",
        "        shell=True,\n",
        "        stdout=log_file,\n",
        "        stderr=log_file,\n",
--- a/docs/source/apis/api_leveling.md
+++ b/docs/source/apis/api_leveling.md
@ -0,0 +1,94 @@
+# Llama Stack API Stability Leveling
+
+In order to provide a stable experience in Llama Stack, the various APIs need different stability levels indicating the level of support, backward compatibility, and overall production readiness.
+
+## Different Levels
+
+### v1alpha
+
+- Little to no expectation of support between versions
+- Breaking changes are permitted
+- Datatypes and parameters can break
+- Routes can be added and removed
+
+#### Graduation Criteria
+
+- an API can graduate from `v1alpha` to `v1beta` if the team has identified the extent of the non-optional routes and the shape of their parameters/return types for the API, e.g., `/v1/openai/chat/completions`. Optional types can change.
+- CRUD must stay stable once in `v1beta`. This is a commitment to backward compatibility, guaranteeing that most code you write against the v1beta version will not break during future updates. We may make additive changes (like adding a new, optional field to a response), but we will not make breaking changes (like renaming an existing "modelName" field to "name", changing an ID's data type from an integer to a string, or altering an endpoint URL).
+- for OpenAI APIs, a comparison to the OpenAI spec for the specific API can be done to ensure completeness.
+
+### v1beta
+
+- API routes remain consistent between versions
+- Parameters and return types are not ensured between versions
+- API, besides minor fixes and adjustments, should be _almost_ v1. Changes should not be drastic.
+
+#### Graduation Criteria
+
+- an API can graduate from `v1beta` to `v1` if the API surface and datatypes are complete as identified by the team. The parameters and return types that are mandatory for each route are stable. All aspects of graduating from `v1alpha` to `v1beta` apply as well.
+- Optional parameters, routes, or parts of the return type can be added after graduating to `v1`
+
+### v1 (stable)
+
+- Considered stable
+- Backwards compatible between Z-streams
+  - Y-stream breaking changes must go through the proper approval and announcement process.
+- Datatypes for a route and its return types cannot change between Z-streams
+  - Y-stream datatype changes should be sparing, unless the changes are additional net-new parameters
+- Must have proper conformance testing as outlined in https://github.com/llamastack/llama-stack/issues/3237
+
+### v2+ (Major Versions)
+
+Introducing a new major version like `/v2` is a significant and disruptive event that should be treated as a last resort. It is reserved for essential changes to a stable `/v1` API that are fundamentally backward-incompatible and cannot be implemented through additive, non-breaking changes or breaking changes across X/Y-Stream releases (x.y.z).
+
+If a `/v2` version is deemed absolutely necessary, it must adhere to the following protocol to ensure a sane and predictable transition for users:
+
+#### Lifecycle Progression
+
+A new major version must follow the same stability lifecycle as `/v1`. It will be introduced as `/v2alpha`, mature to `/v2beta`, and finally become stable as `/v2`.
+
+#### Coexistence
+
+The new `/v2` API must be introduced alongside the existing `/v1` API and run in parallel. It must not replace the `/v1` API immediately.
+
+#### Deprecation Policy
+
+When a `/v2` API is introduced, a clear and generous deprecation policy for the `/v1` API must be published simultaneously. This policy must outline the timeline for the eventual removal of the `/v1` API, giving users ample time to migrate.
+
+### API Stability vs. Provider Stability
+
+The leveling introduced in this document relates to the stability of the API and not specifically the providers within the API.
+
+Providers can iterate as much as they want on functionality as long as they work within the bounds of an API. If they need to change the API, then the API should not be `/v1`, or those breaking changes can only happen on a y-stream release basis.
+
+### Approval and Announcement Process for Breaking Changes
+
+- **PR Labeling**: Any pull request that introduces a breaking API change must be clearly labeled with `breaking-change`.
+- **PR Title/Commit**: Any pull request that introduces a breaking API change must contain `BREAKING CHANGE` in the title and commit footer. Alternatively, the commit can include `!`, e.g., `feat(api)!: title goes here`. This is outlined in the [conventional commits documentation](https://www.conventionalcommits.org/en/v1.0.0/#specification).
+- **Maintainer Review**: At least one maintainer must explicitly acknowledge the breaking change during review by applying the `breaking-change` label. An approval must come with this label or the acknowledgement this label has already been applied.
+- **Announcement**: Breaking changes require inclusion in release notes and, if applicable, a separate communication (e.g., Discord, GitHub Issues, or GitHub Discussions) prior to release.
+
+If a PR has proper approvals, labels, and commit/title hygiene, the failing API conformance tests will be bypassed.
+
+
+## Enforcement
+
+### Migration of API routes under `/v1alpha`, `/v1beta`, and `/v1`
+
+Instead of placing every API under `/v1`, any API that is not fully stable or complete should go under `/v1alpha` or `/v1beta`. For example, at the time of this writing, `post_training` belongs here, as well as any OpenAI-compatible API whose surface does not exactly match the upstream OpenAI API it mimics.
+
+This migration is crucial as we get Llama Stack in the hands of users who intend to productize various APIs. A clear view of what is stable and what is actively being developed will enable users to pick and choose various APIs to build their products on.
+
+This migration will be a breaking change for any API moving out of `/v1`. Ideally, this should happen before 0.3.0 and especially 1.0.0.
+
+### `x-stability` tags in the OpenAPI spec for oasdiff
+
+`x-stability` tags allow tools like oasdiff to enforce different rules for different stability levels; these tags should match the routes: [oasdiff stability](https://github.com/oasdiff/oasdiff/blob/main/docs/STABILITY.md)
+
+### Testing
+
+The testing of each stable API is already outlined in [issue #3237](https://github.com/llamastack/llama-stack/issues/3237) and is being worked on. These sorts of conformance tests should apply primarily to `/v1` APIs only, with `/v1alpha` and `/v1beta` having any tests the maintainers see fit as well as basic testing to ensure the routing works properly.
+
+### New APIs going forward
+
+Any subsequently introduced APIs should be introduced as `/v1alpha`.
--- a/docs/source/distributions/self_hosted_distro/nvidia.md
+++ b/docs/source/distributions/self_hosted_distro/nvidia.md
@ -11,6 +11,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
 | agents | `inline::meta-reference` |
 | datasetio | `inline::localfs`, `remote::nvidia` |
 | eval | `remote::nvidia` |
+| files | `inline::localfs` |
 | inference | `remote::nvidia` |
 | post_training | `remote::nvidia` |
 | safety | `remote::nvidia` |
--- a/docs/source/providers/inference/remote_watsonx.md
+++ b/docs/source/providers/inference/remote_watsonx.md
@ -9,8 +9,8 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `url` | `<class 'str'>` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
-| `api_key` | `pydantic.types.SecretStr \| None` | No |  | The watsonx API key, only needed of using the hosted service |
-| `project_id` | `str \| None` | No |  | The Project ID key, only needed of using the hosted service |
+| `api_key` | `pydantic.types.SecretStr \| None` | No |  | The watsonx API key |
+| `project_id` | `str \| None` | No |  | The Project ID key |
 | `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |

 ## Sample Configuration
--- a/docs/zero_to_hero_guide/00_Inference101.ipynb
+++ b/docs/zero_to_hero_guide/00_Inference101.ipynb
@ -9,7 +9,7 @@
        "\n",
        "This document provides instructions on how to use Llama Stack's `chat_completion` function for generating text using the `Llama3.2-3B-Instruct` model. \n",
        "\n",
-        "Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).\n",
+        "Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llamastack.github.io/latest/getting_started/index.html).\n",
        "\n",
        "\n",
        "### Table of Contents\n",
--- a/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb
+++ b/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb
@ -10,7 +10,7 @@
    "This guide provides a streamlined setup to switch between local and cloud clients for text generation with Llama Stack’s `chat_completion` API. This setup enables automatic fallback to a cloud instance if the local client is unavailable.\n",
    "\n",
    "### Prerequisites\n",
-    "Before you begin, please ensure Llama Stack is installed and the distribution is set up by following the [Getting Started Guide](https://llama-stack.readthedocs.io/en/latest/). You will need to run two distributions, a local and a cloud distribution, for this demo to work.\n",
+    "Before you begin, please ensure Llama Stack is installed and the distribution is set up by following the [Getting Started Guide](https://llamastack.github.io/latest/getting_started/index.html). You will need to run two distributions, a local and a cloud distribution, for this demo to work.\n",
    "\n",
    "### Implementation"
   ]
--- a/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb
+++ b/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb
@ -11,7 +11,7 @@
        "\n",
        "This interactive guide covers prompt engineering & best practices with Llama 3.2 and Llama Stack.\n",
        "\n",
-        "Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html)."
+        "Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llamastack.github.io/latest/getting_started/index.html)."
      ]
    },
    {
--- a/docs/zero_to_hero_guide/03_Image_Chat101.ipynb
+++ b/docs/zero_to_hero_guide/03_Image_Chat101.ipynb
@ -7,7 +7,7 @@
      "source": [
        "## Getting Started with LlamaStack Vision API\n",
        "\n",
-        "Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).\n",
+        "Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llamastack.github.io/latest/getting_started/index.html).\n",
        "\n",
        "Let's import the necessary packages"
      ]
--- a/docs/zero_to_hero_guide/05_Memory101.ipynb
+++ b/docs/zero_to_hero_guide/05_Memory101.ipynb
@ -26,7 +26,7 @@
        "A running instance of the Llama Stack server (we'll use localhost in \n",
        "this tutorial)\n",
        "\n",
-        "Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).\n",
+        "Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llamastack.github.io/latest/getting_started/index.html).\n",
        "\n",
        "Let's start by installing the required packages:"
      ]
--- a/docs/zero_to_hero_guide/06_Safety101.ipynb
+++ b/docs/zero_to_hero_guide/06_Safety101.ipynb
@ -6,7 +6,7 @@
      "source": [
        "## Safety API 101\n",
        "\n",
-        "This document talks about the Safety APIs in Llama Stack. Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).\n",
+        "This document talks about the Safety APIs in Llama Stack. Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llamastack.github.io/latest/getting_started/index.html).\n",
        "\n",
        "As outlined in our [Responsible Use Guide](https://www.llama.com/docs/how-to-guides/responsible-use-guide-resources/), LLM apps should deploy appropriate system level safeguards to mitigate safety and security risks of LLM system, similar to the following diagram:\n",
        "\n",
--- a/docs/zero_to_hero_guide/07_Agents101.ipynb
+++ b/docs/zero_to_hero_guide/07_Agents101.ipynb
@ -6,7 +6,7 @@
      "source": [
        "## Agentic API 101\n",
        "\n",
-        "This document talks about the Agentic APIs in Llama Stack. Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).\n",
+        "This document talks about the Agentic APIs in Llama Stack. Before you begin, please ensure Llama Stack is installed and set up by following the [Getting Started Guide](https://llamastack.github.io/latest/getting_started/index.html).\n",
        "\n",
        "Starting Llama 3.1 you can build agentic applications capable of:\n",
        "\n",
--- a/docs/zero_to_hero_guide/README.md
+++ b/docs/zero_to_hero_guide/README.md
@ -9,13 +9,18 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
 > If you'd prefer not to set up a local server, explore our notebook on [tool calling with the Together API](Tool_Calling101_Using_Together_Llama_Stack_Server.ipynb). This notebook will show you how to leverage together.ai's Llama Stack Server API, allowing you to get started with Llama Stack without the need for a locally built and running server.

 ## Table of Contents
-1. [Setup and run ollama](#setup-ollama)
-2. [Install Dependencies and Set Up Environment](#install-dependencies-and-set-up-environment)
-3. [Build, Configure, and Run Llama Stack](#build-configure-and-run-llama-stack)
-4. [Test with llama-stack-client CLI](#test-with-llama-stack-client-cli)
-5. [Test with curl](#test-with-curl)
-6. [Test with Python](#test-with-python)
-7. [Next Steps](#next-steps)
+- [Llama Stack: from Zero to Hero](#llama-stack-from-zero-to-hero)
+  - [Table of Contents](#table-of-contents)
+  - [Setup ollama](#setup-ollama)
+  - [Install Dependencies and Set Up Environment](#install-dependencies-and-set-up-environment)
+  - [Build, Configure, and Run Llama Stack](#build-configure-and-run-llama-stack)
+  - [Test with `llama-stack-client` CLI](#test-with-llama-stack-client-cli)
+  - [Test with `curl`](#test-with-curl)
+  - [Test with Python](#test-with-python)
+    - [1. Create Python Script (`test_llama_stack.py`)](#1-create-python-script-test_llama_stackpy)
+    - [2. Create a Chat Completion Request in Python](#2-create-a-chat-completion-request-in-python)
+    - [3. Run the Python Script](#3-run-the-python-script)
+  - [Next Steps](#next-steps)

 ---

@ -242,7 +247,7 @@ This command initializes the model to interact with your local Llama Stack insta
 ## Next Steps

 **Explore Other Guides**: Dive deeper into specific topics by following these guides:
- [Understanding Distribution](https://llama-stack.readthedocs.io/en/latest/concepts/index.html#distributions)
+- [Understanding Distribution](https://llamastack.github.io/latest/concepts/index.html#distributions)
 - [Inference 101](00_Inference101.ipynb)
 - [Local and Cloud Model Toggling 101](01_Local_Cloud_Inference101.ipynb)
 - [Prompt Engineering](02_Prompt_Engineering101.ipynb)
@ -259,7 +264,7 @@ This command initializes the model to interact with your local Llama Stack insta
  - [Swift SDK](https://github.com/meta-llama/llama-stack-client-swift)
  - [Kotlin SDK](https://github.com/meta-llama/llama-stack-client-kotlin)

-**Advanced Configuration**: Learn how to customize your Llama Stack distribution by referring to the [Building a Llama Stack Distribution](https://llama-stack.readthedocs.io/en/latest/distributions/building_distro.html) guide.
+**Advanced Configuration**: Learn how to customize your Llama Stack distribution by referring to the [Building a Llama Stack Distribution](https://llamastack.github.io/latest/distributions/building_distro.html) guide.

 **Explore Example Apps**: Check out [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) for example applications built using Llama Stack.

--- a/llama_stack/apis/benchmarks/benchmarks.py
+++ b/llama_stack/apis/benchmarks/benchmarks.py
@ -93,3 +93,11 @@ class Benchmarks(Protocol):
        :param metadata: The metadata to use for the benchmark.
        """
        ...
+
+    @webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE")
+    async def unregister_benchmark(self, benchmark_id: str) -> None:
+        """Unregister a benchmark.
+
+        Declared as a DELETE on ``/eval/benchmarks/{benchmark_id}``. This is only the
+        Protocol-level interface stub, so the body is intentionally ``...``.
+
+        :param benchmark_id: The ID of the benchmark to unregister.
+        """
+        ...
--- a/llama_stack/apis/scoring_functions/scoring_functions.py
+++ b/llama_stack/apis/scoring_functions/scoring_functions.py
@ -197,3 +197,11 @@ class ScoringFunctions(Protocol):
        :param params: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
        """
        ...
+
+    # NOTE(review): the ``:path`` suffix presumably lets the ID contain "/" characters
+    # (Starlette-style path convertor) — confirm that ``webmethod`` passes it through.
+    @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="DELETE")
+    async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
+        """Unregister a scoring function.
+
+        Declared as a DELETE on ``/scoring-functions/{scoring_fn_id:path}``. This is only
+        the Protocol-level interface stub, so the body is intentionally ``...``.
+
+        :param scoring_fn_id: The ID of the scoring function to unregister.
+        """
+        ...
--- a/llama_stack/cli/verify_download.py
+++ b/llama_stack/cli/verify_download.py
@ -48,15 +48,12 @@ def setup_verify_download_parser(parser: argparse.ArgumentParser) -> None:
    parser.set_defaults(func=partial(run_verify_cmd, parser=parser))


-def calculate_md5(filepath: Path, chunk_size: int = 8192) -> str:
-    # NOTE: MD5 is used here only for download integrity verification,
-    # not for security purposes
-    # TODO: switch to SHA256
-    md5_hash = hashlib.md5(usedforsecurity=False)
+def calculate_sha256(filepath: Path, chunk_size: int = 8192) -> str:
+    sha256_hash = hashlib.sha256()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
-            md5_hash.update(chunk)
-    return md5_hash.hexdigest()
+            sha256_hash.update(chunk)
+    return sha256_hash.hexdigest()


 def load_checksums(checklist_path: Path) -> dict[str, str]:
@ -64,10 +61,10 @@ def load_checksums(checklist_path: Path) -> dict[str, str]:
    with open(checklist_path) as f:
        for line in f:
            if line.strip():
-                md5sum, filepath = line.strip().split("  ", 1)
+                sha256sum, filepath = line.strip().split("  ", 1)
                # Remove leading './' if present
                filepath = filepath.lstrip("./")
-                checksums[filepath] = md5sum
+                checksums[filepath] = sha256sum
    return checksums


@ -88,7 +85,7 @@ def verify_files(model_dir: Path, checksums: dict[str, str], console: Console) -
            matches = False

            if exists:
-                actual_hash = calculate_md5(full_path)
+                actual_hash = calculate_sha256(full_path)
                matches = actual_hash == expected_hash

            results.append(
--- a/llama_stack/core/datatypes.py
+++ b/llama_stack/core/datatypes.py
@ -121,10 +121,6 @@ class AutoRoutedProviderSpec(ProviderSpec):
        default=None,
    )

-    @property
-    def pip_packages(self) -> list[str]:
-        raise AssertionError("Should not be called on AutoRoutedProviderSpec")
-

 # Example: /models, /shields
 class RoutingTableProviderSpec(ProviderSpec):
--- a/llama_stack/core/distribution.py
+++ b/llama_stack/core/distribution.py
@ -16,16 +16,18 @@ from llama_stack.core.datatypes import BuildConfig, DistributionSpec
 from llama_stack.core.external import load_external_apis
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import (
-    AdapterSpec,
    Api,
    InlineProviderSpec,
    ProviderSpec,
-    remote_provider_spec,
+    RemoteProviderSpec,
 )

 logger = get_logger(name=__name__, category="core")


+INTERNAL_APIS = {Api.inspect, Api.providers, Api.prompts}
+
+
 def stack_apis() -> list[Api]:
    return list(Api)

@ -70,31 +72,16 @@ def builtin_automatically_routed_apis() -> list[AutoRoutedApiInfo]:

 def providable_apis() -> list[Api]:
    routing_table_apis = {x.routing_table_api for x in builtin_automatically_routed_apis()}
-    return [api for api in Api if api not in routing_table_apis and api != Api.inspect and api != Api.providers]
+    return [api for api in Api if api not in routing_table_apis and api not in INTERNAL_APIS]


 def _load_remote_provider_spec(spec_data: dict[str, Any], api: Api) -> ProviderSpec:
-    adapter = AdapterSpec(**spec_data["adapter"])
-    spec = remote_provider_spec(
-        api=api,
-        adapter=adapter,
-        api_dependencies=[Api(dep) for dep in spec_data.get("api_dependencies", [])],
-    )
+    spec = RemoteProviderSpec(api=api, provider_type=f"remote::{spec_data['adapter_type']}", **spec_data)
    return spec


 def _load_inline_provider_spec(spec_data: dict[str, Any], api: Api, provider_name: str) -> ProviderSpec:
-    spec = InlineProviderSpec(
-        api=api,
-        provider_type=f"inline::{provider_name}",
-        pip_packages=spec_data.get("pip_packages", []),
-        module=spec_data["module"],
-        config_class=spec_data["config_class"],
-        api_dependencies=[Api(dep) for dep in spec_data.get("api_dependencies", [])],
-        optional_api_dependencies=[Api(dep) for dep in spec_data.get("optional_api_dependencies", [])],
-        provider_data_validator=spec_data.get("provider_data_validator"),
-        container_image=spec_data.get("container_image"),
-    )
+    spec = InlineProviderSpec(api=api, provider_type=f"inline::{provider_name}", **spec_data)
    return spec


--- a/llama_stack/core/library_client.py
+++ b/llama_stack/core/library_client.py
@ -40,7 +40,7 @@ from llama_stack.core.request_headers import (
 from llama_stack.core.resolver import ProviderRegistry
 from llama_stack.core.server.routes import RouteImpls, find_matching_route, initialize_route_impls
 from llama_stack.core.stack import (
-    construct_stack,
+    Stack,
    get_stack_run_config_from_distro,
    replace_env_vars,
 )
@ -252,7 +252,10 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):

        try:
            self.route_impls = None
-            self.impls = await construct_stack(self.config, self.custom_provider_registry)
+
+            stack = Stack(self.config, self.custom_provider_registry)
+            await stack.initialize()
+            self.impls = stack.impls
        except ModuleNotFoundError as _e:
            cprint(_e.msg, color="red", file=sys.stderr)
            cprint(
@ -289,6 +292,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
            )
            raise _e

+        assert self.impls is not None
        if Api.telemetry in self.impls:
            setup_logger(self.impls[Api.telemetry])

--- a/llama_stack/core/routing_tables/benchmarks.py
+++ b/llama_stack/core/routing_tables/benchmarks.py
@ -56,3 +56,7 @@ class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
            provider_resource_id=provider_benchmark_id,
        )
        await self.register_object(benchmark)
+
+    async def unregister_benchmark(self, benchmark_id: str) -> None:
+        """Unregister a benchmark.
+
+        Fetches the benchmark with ``get_benchmark`` and passes the result to
+        ``unregister_object``.
+
+        :param benchmark_id: The ID of the benchmark to unregister.
+        """
+        # NOTE(review): behavior for an unknown ID depends entirely on get_benchmark
+        # (raise vs. return None) — confirm it raises before unregister_object is reached.
+        existing_benchmark = await self.get_benchmark(benchmark_id)
+        await self.unregister_object(existing_benchmark)
--- a/llama_stack/core/routing_tables/common.py
+++ b/llama_stack/core/routing_tables/common.py
@ -64,6 +64,10 @@ async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None:
        return await p.unregister_shield(obj.identifier)
    elif api == Api.datasetio:
        return await p.unregister_dataset(obj.identifier)
+    elif api == Api.eval:
+        return await p.unregister_benchmark(obj.identifier)
+    elif api == Api.scoring:
+        return await p.unregister_scoring_function(obj.identifier)
    elif api == Api.tool_runtime:
        return await p.unregister_toolgroup(obj.identifier)
    else:
--- a/llama_stack/core/routing_tables/scoring_functions.py
+++ b/llama_stack/core/routing_tables/scoring_functions.py
@ -60,3 +60,7 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
        )
        scoring_fn.provider_id = provider_id
        await self.register_object(scoring_fn)
+
+    async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
+        """Unregister a scoring function.
+
+        Fetches the scoring function with ``get_scoring_function`` and passes the
+        result to ``unregister_object``.
+
+        :param scoring_fn_id: The ID of the scoring function to unregister.
+        """
+        # NOTE(review): behavior for an unknown ID depends entirely on get_scoring_function
+        # (raise vs. return None) — confirm it raises before unregister_object is reached.
+        existing_scoring_fn = await self.get_scoring_function(scoring_fn_id)
+        await self.unregister_object(existing_scoring_fn)
--- a/llama_stack/core/server/server.py
+++ b/llama_stack/core/server/server.py
@ -6,6 +6,7 @@

 import argparse
 import asyncio
+import concurrent.futures
 import functools
 import inspect
 import json
@ -50,17 +51,15 @@ from llama_stack.core.request_headers import (
    request_provider_data_context,
    user_from_scope,
 )
-from llama_stack.core.resolver import InvalidProviderError
 from llama_stack.core.server.routes import (
    find_matching_route,
    get_all_api_routes,
    initialize_route_impls,
 )
 from llama_stack.core.stack import (
+    Stack,
    cast_image_name_to_string,
-    construct_stack,
    replace_env_vars,
-    shutdown_stack,
    validate_env_pair,
 )
 from llama_stack.core.utils.config import redact_sensitive_fields
@ -156,21 +155,34 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationErro
        )


-async def shutdown(app):
-    """Initiate a graceful shutdown of the application.
-
-    Handled by the lifespan context manager. The shutdown process involves
-    shutting down all implementations registered in the application.
+class StackApp(FastAPI):
    """
-    await shutdown_stack(app.__llama_stack_impls__)
+    A wrapper around the FastAPI application to hold a reference to the Stack instance so that we can
+    start background tasks (e.g. refresh model registry periodically) from the lifespan context manager.
+    """
+
+    def __init__(self, config: StackRunConfig, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.stack: Stack = Stack(config)
+
+        # This code is called from a running event loop managed by uvicorn so we cannot simply call
+        # asyncio.run() to initialize the stack. We cannot await either since this is not an async
+        # function.
+        # As a workaround, we use a thread pool executor to run the initialize() method
+        # in a separate thread.
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            future = executor.submit(asyncio.run, self.stack.initialize())
+            future.result()


@asynccontextmanager
-async def lifespan(app: FastAPI):
+async def lifespan(app: StackApp):
    logger.info("Starting up")
+    assert app.stack is not None
+    app.stack.create_registry_refresh_task()
    yield
    logger.info("Shutting down")
-    await shutdown(app)
+    await app.stack.shutdown()


 def is_streaming_request(func_name: str, request: Request, **kwargs):
@ -386,73 +398,61 @@ class ClientVersionMiddleware:
        return await self.app(scope, receive, send)


-def main(args: argparse.Namespace | None = None):
-    """Start the LlamaStack server."""
-    parser = argparse.ArgumentParser(description="Start the LlamaStack server.")
+def create_app(
+    config_file: str | None = None,
+    env_vars: list[str] | None = None,
+) -> StackApp:
+    """Create and configure the FastAPI application.

-    add_config_distro_args(parser)
-    parser.add_argument(
-        "--port",
-        type=int,
-        default=int(os.getenv("LLAMA_STACK_PORT", 8321)),
-        help="Port to listen on",
-    )
-    parser.add_argument(
-        "--env",
-        action="append",
-        help="Environment variables in KEY=value format. Can be specified multiple times.",
-    )
+    Args:
+        config_file: Path to config file. If None, uses LLAMA_STACK_CONFIG env var or default resolution.
+        env_vars: List of environment variables in KEY=value format.
+        Note: version checking is not an argument; set the LLAMA_STACK_DISABLE_VERSION_CHECK env var to disable it.

-    # Determine whether the server args are being passed by the "run" command, if this is the case
-    # the args will be passed as a Namespace object to the main function, otherwise they will be
-    # parsed from the command line
-    if args is None:
-        args = parser.parse_args()
+    Returns:
+        Configured StackApp instance.
+    """
+    config_file = config_file or os.getenv("LLAMA_STACK_CONFIG")
+    if config_file is None:
+        raise ValueError("No config file provided and LLAMA_STACK_CONFIG env var is not set")

-    config_or_distro = get_config_from_args(args)
-    config_file = resolve_config_or_distro(config_or_distro, Mode.RUN)
+    config_file = resolve_config_or_distro(config_file, Mode.RUN)

+    # Load and process configuration
    logger_config = None
    with open(config_file) as fp:
        config_contents = yaml.safe_load(fp)
        if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")):
            logger_config = LoggingConfig(**cfg)
        logger = get_logger(name=__name__, category="core::server", config=logger_config)
-        if args.env:
-            for env_pair in args.env:
+
+        if env_vars:
+            for env_pair in env_vars:
                try:
                    key, value = validate_env_pair(env_pair)
-                    logger.info(f"Setting CLI environment variable {key} => {value}")
+                    logger.info(f"Setting environment variable {key} => {value}")
                    os.environ[key] = value
                except ValueError as e:
                    logger.error(f"Error: {str(e)}")
-                    sys.exit(1)
+                    raise ValueError(f"Invalid environment variable format: {env_pair}") from e
+
        config = replace_env_vars(config_contents)
        config = StackRunConfig(**cast_image_name_to_string(config))

    _log_run_config(run_config=config)

-    app = FastAPI(
+    app = StackApp(
        lifespan=lifespan,
        docs_url="/docs",
        redoc_url="/redoc",
        openapi_url="/openapi.json",
+        config=config,
    )

    if not os.environ.get("LLAMA_STACK_DISABLE_VERSION_CHECK"):
        app.add_middleware(ClientVersionMiddleware)

-    try:
-        # Create and set the event loop that will be used for both construction and server runtime
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-
-        # Construct the stack in the persistent event loop
-        impls = loop.run_until_complete(construct_stack(config))
-
-    except InvalidProviderError as e:
-        logger.error(f"Error: {str(e)}")
-        sys.exit(1)
+    impls = app.stack.impls

    if config.server.auth:
        logger.info(f"Enabling authentication with provider: {config.server.auth.provider_config.type.value}")
@ -553,9 +553,54 @@ def main(args: argparse.Namespace | None = None):
    app.exception_handler(RequestValidationError)(global_exception_handler)
    app.exception_handler(Exception)(global_exception_handler)

-    app.__llama_stack_impls__ = impls
    app.add_middleware(TracingMiddleware, impls=impls, external_apis=external_apis)

+    return app
+
+
+def main(args: argparse.Namespace | None = None):
+    """Start the LlamaStack server."""
+    parser = argparse.ArgumentParser(description="Start the LlamaStack server.")
+
+    add_config_distro_args(parser)
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=int(os.getenv("LLAMA_STACK_PORT", 8321)),
+        help="Port to listen on",
+    )
+    parser.add_argument(
+        "--env",
+        action="append",
+        help="Environment variables in KEY=value format. Can be specified multiple times.",
+    )
+
+    # Determine whether the server args are being passed by the "run" command, if this is the case
+    # the args will be passed as a Namespace object to the main function, otherwise they will be
+    # parsed from the command line
+    if args is None:
+        args = parser.parse_args()
+
+    config_or_distro = get_config_from_args(args)
+
+    try:
+        app = create_app(
+            config_file=config_or_distro,
+            env_vars=args.env,
+        )
+    except Exception as e:
+        logger.error(f"Error creating app: {str(e)}")
+        sys.exit(1)
+
+    config_file = resolve_config_or_distro(config_or_distro, Mode.RUN)
+    with open(config_file) as fp:
+        config_contents = yaml.safe_load(fp)
+        if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")):
+            logger_config = LoggingConfig(**cfg)
+        else:
+            logger_config = None
+        config = StackRunConfig(**cast_image_name_to_string(replace_env_vars(config_contents)))
+
    import uvicorn

    # Configure SSL if certificates are provided
@ -593,7 +638,6 @@ def main(args: argparse.Namespace | None = None):
    if ssl_config:
        uvicorn_config.update(ssl_config)

-    # Run uvicorn in the existing event loop to preserve background tasks
    # We need to catch KeyboardInterrupt because uvicorn's signal handling
    # re-raises SIGINT signals using signal.raise_signal(), which Python
    # converts to KeyboardInterrupt. Without this catch, we'd get a confusing
@ -604,13 +648,9 @@ def main(args: argparse.Namespace | None = None):
    # Another approach would be to ignore SIGINT entirely - let uvicorn handle it through its own
    # signal handling but this is quite intrusive and not worth the effort.
    try:
-        loop.run_until_complete(uvicorn.Server(uvicorn.Config(**uvicorn_config)).serve())
+        asyncio.run(uvicorn.Server(uvicorn.Config(**uvicorn_config)).serve())
    except (KeyboardInterrupt, SystemExit):
        logger.info("Received interrupt signal, shutting down gracefully...")
-    finally:
-        if not loop.is_closed():
-            logger.debug("Closing event loop")
-            loop.close()


 def _log_run_config(run_config: StackRunConfig):
--- a/llama_stack/core/stack.py
+++ b/llama_stack/core/stack.py
@ -315,11 +315,15 @@ def add_internal_implementations(impls: dict[Api, Any], run_config: StackRunConf
    impls[Api.prompts] = prompts_impl


+class Stack:
+    def __init__(self, run_config: StackRunConfig, provider_registry: ProviderRegistry | None = None):
+        self.run_config = run_config
+        self.provider_registry = provider_registry
+        self.impls = None
+
    # Produces a stack of providers for the given run config. Not all APIs may be
    # asked for in the run config.
-async def construct_stack(
-    run_config: StackRunConfig, provider_registry: ProviderRegistry | None = None
-) -> dict[Api, Any]:
+    async def initialize(self):
        if "LLAMA_STACK_TEST_INFERENCE_MODE" in os.environ:
            from llama_stack.testing.inference_recorder import setup_inference_recording

@ -329,24 +333,28 @@ async def construct_stack(
                TEST_RECORDING_CONTEXT.__enter__()
                logger.info(f"Inference recording enabled: mode={os.environ.get('LLAMA_STACK_TEST_INFERENCE_MODE')}")

-    dist_registry, _ = await create_dist_registry(run_config.metadata_store, run_config.image_name)
-    policy = run_config.server.auth.access_policy if run_config.server.auth else []
+        dist_registry, _ = await create_dist_registry(self.run_config.metadata_store, self.run_config.image_name)
+        policy = self.run_config.server.auth.access_policy if self.run_config.server.auth else []
        impls = await resolve_impls(
-        run_config, provider_registry or get_provider_registry(run_config), dist_registry, policy
+            self.run_config, self.provider_registry or get_provider_registry(self.run_config), dist_registry, policy
        )

        # Add internal implementations after all other providers are resolved
-    add_internal_implementations(impls, run_config)
+        add_internal_implementations(impls, self.run_config)

        if Api.prompts in impls:
            await impls[Api.prompts].initialize()

-    await register_resources(run_config, impls)
+        await register_resources(self.run_config, impls)

        await refresh_registry_once(impls)
+        self.impls = impls
+
+    def create_registry_refresh_task(self):
+        assert self.impls is not None, "Must call initialize() before starting"

        global REGISTRY_REFRESH_TASK
-    REGISTRY_REFRESH_TASK = asyncio.create_task(refresh_registry_task(impls))
+        REGISTRY_REFRESH_TASK = asyncio.create_task(refresh_registry_task(self.impls))

        def cb(task):
            import traceback
@ -360,11 +368,9 @@ async def construct_stack(
                logger.debug("Model refresh task completed")

        REGISTRY_REFRESH_TASK.add_done_callback(cb)
-    return impls

-
-async def shutdown_stack(impls: dict[Api, Any]):
-    for impl in impls.values():
+    async def shutdown(self):
+        for impl in self.impls.values():
            impl_name = impl.__class__.__name__
            logger.info(f"Shutting down {impl_name}")
            try:
--- a/llama_stack/core/start_stack.sh
+++ b/llama_stack/core/start_stack.sh
@ -123,6 +123,6 @@ if [[ "$env_type" == "venv" ]]; then
    $other_args
 elif [[ "$env_type" == "container" ]]; then
    echo -e "${RED}Warning: Llama Stack no longer supports running Containers via the 'llama stack run' command.${NC}"
-    echo -e "Please refer to the documentation for more information: https://llama-stack.readthedocs.io/en/latest/distributions/building_distro.html#llama-stack-build"
+    echo -e "Please refer to the documentation for more information: https://llamastack.github.io/latest/distributions/building_distro.html#llama-stack-build"
    exit 1
 fi
--- a/llama_stack/core/store/registry.py
+++ b/llama_stack/core/store/registry.py
@ -96,9 +96,11 @@ class DiskDistributionRegistry(DistributionRegistry):

    async def register(self, obj: RoutableObjectWithProvider) -> bool:
        existing_obj = await self.get(obj.type, obj.identifier)
-        # dont register if the object's providerid already exists
-        if existing_obj and existing_obj.provider_id == obj.provider_id:
-            return False
+        # warn if the object's providerid is different but proceed with registration
+        if existing_obj and existing_obj.provider_id != obj.provider_id:
+            logger.warning(
+                f"Object {existing_obj.type}:{existing_obj.identifier}'s {existing_obj.provider_id} provider is being replaced with {obj.provider_id}"
+            )

        await self.kvstore.set(
            KEY_FORMAT.format(type=obj.type, identifier=obj.identifier),
--- a/llama_stack/core/ui/README.md
+++ b/llama_stack/core/ui/README.md
@ -6,7 +6,7 @@

 ## Developer Setup

-1. Start up Llama Stack API server. More details [here](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).
+1. Start up Llama Stack API server. More details [here](https://llamastack.github.io/latest/getting_started/index.html).

 ```
 llama stack build --distro together --image-type venv
--- a/llama_stack/distributions/nvidia/build.yaml
+++ b/llama_stack/distributions/nvidia/build.yaml
@ -23,6 +23,8 @@ distribution_spec:
    - provider_type: inline::basic
    tool_runtime:
    - provider_type: inline::rag-runtime
+    files:
+    - provider_type: inline::localfs
 image_type: venv
 additional_pip_packages:
 - aiosqlite
--- a/llama_stack/distributions/nvidia/nvidia.py
+++ b/llama_stack/distributions/nvidia/nvidia.py
@ -8,6 +8,7 @@ from pathlib import Path

 from llama_stack.core.datatypes import BuildProvider, ModelInput, Provider, ShieldInput, ToolGroupInput
 from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings, get_model_registry
+from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
 from llama_stack.providers.remote.datasetio.nvidia import NvidiaDatasetIOConfig
 from llama_stack.providers.remote.eval.nvidia import NVIDIAEvalConfig
 from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
@ -15,7 +16,7 @@ from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES
 from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig


-def get_distribution_template() -> DistributionTemplate:
+def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
    providers = {
        "inference": [BuildProvider(provider_type="remote::nvidia")],
        "vector_io": [BuildProvider(provider_type="inline::faiss")],
@ -30,6 +31,7 @@ def get_distribution_template() -> DistributionTemplate:
        ],
        "scoring": [BuildProvider(provider_type="inline::basic")],
        "tool_runtime": [BuildProvider(provider_type="inline::rag-runtime")],
+        "files": [BuildProvider(provider_type="inline::localfs")],
    }

    inference_provider = Provider(
@ -52,6 +54,11 @@ def get_distribution_template() -> DistributionTemplate:
        provider_type="remote::nvidia",
        config=NVIDIAEvalConfig.sample_run_config(),
    )
+    files_provider = Provider(
+        provider_id="meta-reference-files",
+        provider_type="inline::localfs",
+        config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+    )
    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
        provider_id="nvidia",
@ -73,7 +80,7 @@ def get_distribution_template() -> DistributionTemplate:

    default_models, _ = get_model_registry(available_models)
    return DistributionTemplate(
-        name="nvidia",
+        name=name,
        distro_type="self_hosted",
        description="Use NVIDIA NIM for running LLM inference, evaluation and safety",
        container_image=None,
@ -86,6 +93,7 @@ def get_distribution_template() -> DistributionTemplate:
                    "inference": [inference_provider],
                    "datasetio": [datasetio_provider],
                    "eval": [eval_provider],
+                    "files": [files_provider],
                },
                default_models=default_models,
                default_tool_groups=default_tool_groups,
@ -97,6 +105,7 @@ def get_distribution_template() -> DistributionTemplate:
                        safety_provider,
                    ],
                    "eval": [eval_provider],
+                    "files": [files_provider],
                },
                default_models=[inference_model, safety_model],
                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}", provider_id="nvidia")],
--- a/llama_stack/distributions/nvidia/run-with-safety.yaml
+++ b/llama_stack/distributions/nvidia/run-with-safety.yaml
@ -4,6 +4,7 @@ apis:
 - agents
 - datasetio
 - eval
+- files
 - inference
 - post_training
 - safety
@ -88,6 +89,14 @@ providers:
  tool_runtime:
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/nvidia/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/files_metadata.db
 metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/registry.db
--- a/llama_stack/distributions/nvidia/run.yaml
+++ b/llama_stack/distributions/nvidia/run.yaml
@ -4,6 +4,7 @@ apis:
 - agents
 - datasetio
 - eval
+- files
 - inference
 - post_training
 - safety
@ -77,6 +78,14 @@ providers:
  tool_runtime:
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/nvidia/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/files_metadata.db
 metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/registry.db
--- a/llama_stack/distributions/starter/starter.py
+++ b/llama_stack/distributions/starter/starter.py
@ -78,12 +78,12 @@ def get_remote_inference_providers() -> list[Provider]:
    remote_providers = [
        provider
        for provider in available_providers()
-        if isinstance(provider, RemoteProviderSpec) and provider.adapter.adapter_type in ENABLED_INFERENCE_PROVIDERS
+        if isinstance(provider, RemoteProviderSpec) and provider.adapter_type in ENABLED_INFERENCE_PROVIDERS
    ]

    inference_providers = []
    for provider_spec in remote_providers:
-        provider_type = provider_spec.adapter.adapter_type
+        provider_type = provider_spec.adapter_type

        if provider_type in INFERENCE_PROVIDER_IDS:
            provider_id = INFERENCE_PROVIDER_IDS[provider_type]
--- a/llama_stack/distributions/watsonx/run.yaml
+++ b/llama_stack/distributions/watsonx/run.yaml
@ -10,6 +10,7 @@ apis:
 - telemetry
 - tool_runtime
 - vector_io
+- files
 providers:
  inference:
  - provider_id: watsonx
@ -94,6 +95,14 @@ providers:
    provider_type: inline::rag-runtime
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/watsonx/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/watsonx}/files_metadata.db
 metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/watsonx}/registry.db
--- a/llama_stack/distributions/watsonx/watsonx.py
+++ b/llama_stack/distributions/watsonx/watsonx.py
@ -9,6 +9,7 @@ from pathlib import Path
 from llama_stack.apis.models import ModelType
 from llama_stack.core.datatypes import BuildProvider, ModelInput, Provider, ToolGroupInput
 from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings, get_model_registry
+from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
 from llama_stack.providers.inline.inference.sentence_transformers import (
    SentenceTransformersInferenceConfig,
 )
@ -16,7 +17,7 @@ from llama_stack.providers.remote.inference.watsonx import WatsonXConfig
 from llama_stack.providers.remote.inference.watsonx.models import MODEL_ENTRIES


-def get_distribution_template() -> DistributionTemplate:
+def get_distribution_template(name: str = "watsonx") -> DistributionTemplate:
    providers = {
        "inference": [
            BuildProvider(provider_type="remote::watsonx"),
@ -42,6 +43,7 @@ def get_distribution_template() -> DistributionTemplate:
            BuildProvider(provider_type="inline::rag-runtime"),
            BuildProvider(provider_type="remote::model-context-protocol"),
        ],
+        "files": [BuildProvider(provider_type="inline::localfs")],
    }

    inference_provider = Provider(
@ -79,9 +81,14 @@ def get_distribution_template() -> DistributionTemplate:
        },
    )

+    files_provider = Provider(
+        provider_id="meta-reference-files",
+        provider_type="inline::localfs",
+        config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"),
+    )
    default_models, _ = get_model_registry(available_models)
    return DistributionTemplate(
-        name="watsonx",
+        name=name,
        distro_type="remote_hosted",
        description="Use watsonx for running LLM inference",
        container_image=None,
@ -92,6 +99,7 @@ def get_distribution_template() -> DistributionTemplate:
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider, embedding_provider],
+                    "files": [files_provider],
                },
                default_models=default_models + [embedding_model],
                default_tool_groups=default_tool_groups,
--- a/llama_stack/providers/datatypes.py
+++ b/llama_stack/providers/datatypes.py
@ -131,6 +131,15 @@ class ProviderSpec(BaseModel):
 """,
    )

+    pip_packages: list[str] = Field(
+        default_factory=list,
+        description="The pip dependencies needed for this implementation",
+    )
+
+    provider_data_validator: str | None = Field(
+        default=None,
+    )
+
    is_external: bool = Field(default=False, description="Notes whether this provider is an external provider.")

    # used internally by the resolver; this is a hack for now
@ -145,45 +154,8 @@ class RoutingTable(Protocol):
    async def get_provider_impl(self, routing_key: str) -> Any: ...


-# TODO: this can now be inlined into RemoteProviderSpec
-@json_schema_type
-class AdapterSpec(BaseModel):
-    adapter_type: str = Field(
-        ...,
-        description="Unique identifier for this adapter",
-    )
-    module: str = Field(
-        default_factory=str,
-        description="""
-Fully-qualified name of the module to import. The module is expected to have:
-
- - `get_adapter_impl(config, deps)`: returns the adapter implementation
-""",
-    )
-    pip_packages: list[str] = Field(
-        default_factory=list,
-        description="The pip dependencies needed for this implementation",
-    )
-    config_class: str = Field(
-        description="Fully-qualified classname of the config for this provider",
-    )
-    provider_data_validator: str | None = Field(
-        default=None,
-    )
-    description: str | None = Field(
-        default=None,
-        description="""
-A description of the provider. This is used to display in the documentation.
-""",
-    )
-
-
@json_schema_type
 class InlineProviderSpec(ProviderSpec):
-    pip_packages: list[str] = Field(
-        default_factory=list,
-        description="The pip dependencies needed for this implementation",
-    )
    container_image: str | None = Field(
        default=None,
        description="""
@ -191,10 +163,6 @@ The container image to use for this implementation. If one is provided, pip_pack
 If a provider depends on other providers, the dependencies MUST NOT specify a container image.
 """,
    )
-    # module field is inherited from ProviderSpec
-    provider_data_validator: str | None = Field(
-        default=None,
-    )
    description: str | None = Field(
        default=None,
        description="""
@ -223,10 +191,15 @@ class RemoteProviderConfig(BaseModel):

@json_schema_type
 class RemoteProviderSpec(ProviderSpec):
-    adapter: AdapterSpec = Field(
+    adapter_type: str = Field(
+        ...,
+        description="Unique identifier for this adapter",
+    )
+
+    description: str | None = Field(
+        default=None,
        description="""
-If some code is needed to convert the remote responses into Llama Stack compatible
-API responses, specify the adapter here.
+A description of the provider. This is used to display in the documentation.
 """,
    )

@ -234,33 +207,6 @@ API responses, specify the adapter here.
    def container_image(self) -> str | None:
        return None

-    # module field is inherited from ProviderSpec
-
-    @property
-    def pip_packages(self) -> list[str]:
-        return self.adapter.pip_packages
-
-    @property
-    def provider_data_validator(self) -> str | None:
-        return self.adapter.provider_data_validator
-
-
-def remote_provider_spec(
-    api: Api,
-    adapter: AdapterSpec,
-    api_dependencies: list[Api] | None = None,
-    optional_api_dependencies: list[Api] | None = None,
-) -> RemoteProviderSpec:
-    return RemoteProviderSpec(
-        api=api,
-        provider_type=f"remote::{adapter.adapter_type}",
-        config_class=adapter.config_class,
-        module=adapter.module,
-        adapter=adapter,
-        api_dependencies=api_dependencies or [],
-        optional_api_dependencies=optional_api_dependencies or [],
-    )
-

 class HealthStatus(StrEnum):
    OK = "OK"
--- a/llama_stack/providers/inline/eval/meta_reference/eval.py
+++ b/llama_stack/providers/inline/eval/meta_reference/eval.py
@ -75,6 +75,13 @@ class MetaReferenceEvalImpl(
        )
        self.benchmarks[task_def.identifier] = task_def

+    async def unregister_benchmark(self, benchmark_id: str) -> None:
+        if benchmark_id in self.benchmarks:
+            del self.benchmarks[benchmark_id]
+
+        key = f"{EVAL_TASKS_PREFIX}{benchmark_id}"
+        await self.kvstore.delete(key)
+
    async def run_eval(
        self,
        benchmark_id: str,
--- a/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
+++ b/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
@ -63,6 +63,9 @@ class LlmAsJudgeScoringImpl(
    async def register_scoring_function(self, function_def: ScoringFn) -> None:
        self.llm_as_judge_fn.register_scoring_fn_def(function_def)

+    async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
+        self.llm_as_judge_fn.unregister_scoring_fn_def(scoring_fn_id)
+
    async def score_batch(
        self,
        dataset_id: str,
--- a/llama_stack/providers/registry/datasetio.py
+++ b/llama_stack/providers/registry/datasetio.py
@ -6,11 +6,10 @@


 from llama_stack.providers.datatypes import (
-    AdapterSpec,
    Api,
    InlineProviderSpec,
    ProviderSpec,
-    remote_provider_spec,
+    RemoteProviderSpec,
 )


@ -25,10 +24,10 @@ def available_providers() -> list[ProviderSpec]:
            api_dependencies=[],
            description="Local filesystem-based dataset I/O provider for reading and writing datasets to local storage.",
        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.datasetio,
-            adapter=AdapterSpec(
            adapter_type="huggingface",
+            provider_type="remote::huggingface",
            pip_packages=[
                "datasets>=4.0.0",
            ],
@ -36,17 +35,15 @@ def available_providers() -> list[ProviderSpec]:
            config_class="llama_stack.providers.remote.datasetio.huggingface.HuggingfaceDatasetIOConfig",
            description="HuggingFace datasets provider for accessing and managing datasets from the HuggingFace Hub.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.datasetio,
-            adapter=AdapterSpec(
            adapter_type="nvidia",
+            provider_type="remote::nvidia",
+            module="llama_stack.providers.remote.datasetio.nvidia",
+            config_class="llama_stack.providers.remote.datasetio.nvidia.NvidiaDatasetIOConfig",
            pip_packages=[
                "datasets>=4.0.0",
            ],
-                module="llama_stack.providers.remote.datasetio.nvidia",
-                config_class="llama_stack.providers.remote.datasetio.nvidia.NvidiaDatasetIOConfig",
            description="NVIDIA's dataset I/O provider for accessing datasets from NVIDIA's data platform.",
        ),
-        ),
    ]
--- a/llama_stack/providers/registry/eval.py
+++ b/llama_stack/providers/registry/eval.py
@ -5,7 +5,7 @@
 # the root directory of this source tree.


-from llama_stack.providers.datatypes import AdapterSpec, Api, InlineProviderSpec, ProviderSpec, remote_provider_spec
+from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec, RemoteProviderSpec


 def available_providers() -> list[ProviderSpec]:
@ -25,17 +25,16 @@ def available_providers() -> list[ProviderSpec]:
            ],
            description="Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics.",
        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.eval,
-            adapter=AdapterSpec(
            adapter_type="nvidia",
            pip_packages=[
                "requests",
            ],
+            provider_type="remote::nvidia",
            module="llama_stack.providers.remote.eval.nvidia",
            config_class="llama_stack.providers.remote.eval.nvidia.NVIDIAEvalConfig",
            description="NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform.",
-            ),
            api_dependencies=[
                Api.datasetio,
                Api.datasets,
--- a/llama_stack/providers/registry/files.py
+++ b/llama_stack/providers/registry/files.py
@ -4,13 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.providers.datatypes import (
-    AdapterSpec,
-    Api,
-    InlineProviderSpec,
-    ProviderSpec,
-    remote_provider_spec,
-)
+from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec, RemoteProviderSpec
 from llama_stack.providers.utils.sqlstore.sqlstore import sql_store_pip_packages


@ -25,14 +19,13 @@ def available_providers() -> list[ProviderSpec]:
            config_class="llama_stack.providers.inline.files.localfs.config.LocalfsFilesImplConfig",
            description="Local filesystem-based file storage provider for managing files and documents locally.",
        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.files,
-            adapter=AdapterSpec(
+            provider_type="remote::s3",
            adapter_type="s3",
            pip_packages=["boto3"] + sql_store_pip_packages,
            module="llama_stack.providers.remote.files.s3",
            config_class="llama_stack.providers.remote.files.s3.config.S3FilesImplConfig",
            description="AWS S3-based file storage provider for scalable cloud file management with metadata persistence.",
        ),
-        ),
    ]
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@ -6,11 +6,10 @@


 from llama_stack.providers.datatypes import (
-    AdapterSpec,
    Api,
    InlineProviderSpec,
    ProviderSpec,
-    remote_provider_spec,
+    RemoteProviderSpec,
 )

 META_REFERENCE_DEPS = [
@ -49,10 +48,10 @@ def available_providers() -> list[ProviderSpec]:
            config_class="llama_stack.providers.inline.inference.sentence_transformers.config.SentenceTransformersInferenceConfig",
            description="Sentence Transformers inference provider for text embeddings and similarity search.",
        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="cerebras",
+            provider_type="remote::cerebras",
            pip_packages=[
                "cerebras_cloud_sdk",
            ],
@ -60,61 +59,56 @@ def available_providers() -> list[ProviderSpec]:
            config_class="llama_stack.providers.remote.inference.cerebras.CerebrasImplConfig",
            description="Cerebras inference provider for running models on Cerebras Cloud platform.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="ollama",
+            provider_type="remote::ollama",
            pip_packages=["ollama", "aiohttp", "h11>=0.16.0"],
            config_class="llama_stack.providers.remote.inference.ollama.OllamaImplConfig",
            module="llama_stack.providers.remote.inference.ollama",
            description="Ollama inference provider for running local models through the Ollama runtime.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="vllm",
+            provider_type="remote::vllm",
            pip_packages=[],
            module="llama_stack.providers.remote.inference.vllm",
            config_class="llama_stack.providers.remote.inference.vllm.VLLMInferenceAdapterConfig",
+            provider_data_validator="llama_stack.providers.remote.inference.vllm.VLLMProviderDataValidator",
            description="Remote vLLM inference provider for connecting to vLLM servers.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="tgi",
+            provider_type="remote::tgi",
            pip_packages=["huggingface_hub", "aiohttp"],
            module="llama_stack.providers.remote.inference.tgi",
            config_class="llama_stack.providers.remote.inference.tgi.TGIImplConfig",
            description="Text Generation Inference (TGI) provider for HuggingFace model serving.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="hf::serverless",
+            provider_type="remote::hf::serverless",
            pip_packages=["huggingface_hub", "aiohttp"],
            module="llama_stack.providers.remote.inference.tgi",
            config_class="llama_stack.providers.remote.inference.tgi.InferenceAPIImplConfig",
            description="HuggingFace Inference API serverless provider for on-demand model inference.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
+            provider_type="remote::hf::endpoint",
            adapter_type="hf::endpoint",
            pip_packages=["huggingface_hub", "aiohttp"],
            module="llama_stack.providers.remote.inference.tgi",
            config_class="llama_stack.providers.remote.inference.tgi.InferenceEndpointImplConfig",
            description="HuggingFace Inference Endpoints provider for dedicated model serving.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="fireworks",
+            provider_type="remote::fireworks",
            pip_packages=[
                "fireworks-ai<=0.17.16",
            ],
@ -123,11 +117,10 @@ def available_providers() -> list[ProviderSpec]:
            provider_data_validator="llama_stack.providers.remote.inference.fireworks.FireworksProviderDataValidator",
            description="Fireworks AI inference provider for Llama models and other AI models on the Fireworks platform.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="together",
+            provider_type="remote::together",
            pip_packages=[
                "together",
            ],
@ -136,85 +129,82 @@ def available_providers() -> list[ProviderSpec]:
            provider_data_validator="llama_stack.providers.remote.inference.together.TogetherProviderDataValidator",
            description="Together AI inference provider for open-source models and collaborative AI development.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="bedrock",
+            provider_type="remote::bedrock",
            pip_packages=["boto3"],
            module="llama_stack.providers.remote.inference.bedrock",
            config_class="llama_stack.providers.remote.inference.bedrock.BedrockConfig",
            description="AWS Bedrock inference provider for accessing various AI models through AWS's managed service.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="databricks",
+            provider_type="remote::databricks",
            pip_packages=[],
            module="llama_stack.providers.remote.inference.databricks",
            config_class="llama_stack.providers.remote.inference.databricks.DatabricksImplConfig",
            description="Databricks inference provider for running models on Databricks' unified analytics platform.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="nvidia",
+            provider_type="remote::nvidia",
            pip_packages=[],
            module="llama_stack.providers.remote.inference.nvidia",
            config_class="llama_stack.providers.remote.inference.nvidia.NVIDIAConfig",
            description="NVIDIA inference provider for accessing NVIDIA NIM models and AI services.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="runpod",
+            provider_type="remote::runpod",
            pip_packages=[],
            module="llama_stack.providers.remote.inference.runpod",
            config_class="llama_stack.providers.remote.inference.runpod.RunpodImplConfig",
            description="RunPod inference provider for running models on RunPod's cloud GPU platform.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="openai",
+            provider_type="remote::openai",
            pip_packages=["litellm"],
            module="llama_stack.providers.remote.inference.openai",
            config_class="llama_stack.providers.remote.inference.openai.OpenAIConfig",
            provider_data_validator="llama_stack.providers.remote.inference.openai.config.OpenAIProviderDataValidator",
            description="OpenAI inference provider for accessing GPT models and other OpenAI services.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="anthropic",
+            provider_type="remote::anthropic",
            pip_packages=["litellm"],
            module="llama_stack.providers.remote.inference.anthropic",
            config_class="llama_stack.providers.remote.inference.anthropic.AnthropicConfig",
            provider_data_validator="llama_stack.providers.remote.inference.anthropic.config.AnthropicProviderDataValidator",
            description="Anthropic inference provider for accessing Claude models and Anthropic's AI services.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="gemini",
-                pip_packages=["litellm"],
+            provider_type="remote::gemini",
+            pip_packages=[
+                "litellm",
+            ],
            module="llama_stack.providers.remote.inference.gemini",
            config_class="llama_stack.providers.remote.inference.gemini.GeminiConfig",
            provider_data_validator="llama_stack.providers.remote.inference.gemini.config.GeminiProviderDataValidator",
            description="Google Gemini inference provider for accessing Gemini models and Google's AI services.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="vertexai",
-                pip_packages=["litellm", "google-cloud-aiplatform"],
+            provider_type="remote::vertexai",
+            pip_packages=[
+                "litellm",
+                "google-cloud-aiplatform",
+            ],
            module="llama_stack.providers.remote.inference.vertexai",
            config_class="llama_stack.providers.remote.inference.vertexai.VertexAIConfig",
            provider_data_validator="llama_stack.providers.remote.inference.vertexai.config.VertexAIProviderDataValidator",
@ -239,65 +229,63 @@ Available Models:
 - vertex_ai/gemini-2.5-flash
 - vertex_ai/gemini-2.5-pro""",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="groq",
-                pip_packages=["litellm"],
+            provider_type="remote::groq",
+            pip_packages=[
+                "litellm",
+            ],
            module="llama_stack.providers.remote.inference.groq",
            config_class="llama_stack.providers.remote.inference.groq.GroqConfig",
            provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator",
            description="Groq inference provider for ultra-fast inference using Groq's LPU technology.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="llama-openai-compat",
+            provider_type="remote::llama-openai-compat",
            pip_packages=["litellm"],
            module="llama_stack.providers.remote.inference.llama_openai_compat",
            config_class="llama_stack.providers.remote.inference.llama_openai_compat.config.LlamaCompatConfig",
            provider_data_validator="llama_stack.providers.remote.inference.llama_openai_compat.config.LlamaProviderDataValidator",
            description="Llama OpenAI-compatible provider for using Llama models with OpenAI API format.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="sambanova",
-                pip_packages=["litellm"],
+            provider_type="remote::sambanova",
+            pip_packages=[
+                "litellm",
+            ],
            module="llama_stack.providers.remote.inference.sambanova",
            config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig",
            provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator",
            description="SambaNova inference provider for running models on SambaNova's dataflow architecture.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="passthrough",
+            provider_type="remote::passthrough",
            pip_packages=[],
            module="llama_stack.providers.remote.inference.passthrough",
            config_class="llama_stack.providers.remote.inference.passthrough.PassthroughImplConfig",
            provider_data_validator="llama_stack.providers.remote.inference.passthrough.PassthroughProviderDataValidator",
            description="Passthrough inference provider for connecting to any external inference service not directly supported.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
            adapter_type="watsonx",
+            provider_type="remote::watsonx",
            pip_packages=["ibm_watsonx_ai"],
            module="llama_stack.providers.remote.inference.watsonx",
            config_class="llama_stack.providers.remote.inference.watsonx.WatsonXConfig",
            provider_data_validator="llama_stack.providers.remote.inference.watsonx.WatsonXProviderDataValidator",
            description="IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.inference,
-            adapter=AdapterSpec(
+            provider_type="remote::azure",
            adapter_type="azure",
            pip_packages=["litellm"],
            module="llama_stack.providers.remote.inference.azure",
@ -309,5 +297,4 @@ Provider documentation
 https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
 """,
        ),
-        ),
    ]
--- a/llama_stack/providers/registry/post_training.py
+++ b/llama_stack/providers/registry/post_training.py
@ -7,7 +7,7 @@

 from typing import cast

-from llama_stack.providers.datatypes import AdapterSpec, Api, InlineProviderSpec, ProviderSpec, remote_provider_spec
+from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec, RemoteProviderSpec

 # We provide two versions of these providers so that distributions can package the appropriate version of torch.
 # The CPU version is used for distributions that don't have GPU support -- they result in smaller container images.
@ -57,14 +57,13 @@ def available_providers() -> list[ProviderSpec]:
            ],
            description="HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.",
        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.post_training,
-            adapter=AdapterSpec(
            adapter_type="nvidia",
+            provider_type="remote::nvidia",
            pip_packages=["requests", "aiohttp"],
            module="llama_stack.providers.remote.post_training.nvidia",
            config_class="llama_stack.providers.remote.post_training.nvidia.NvidiaPostTrainingConfig",
            description="NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform.",
        ),
-        ),
    ]
--- a/llama_stack/providers/registry/safety.py
+++ b/llama_stack/providers/registry/safety.py
@ -6,11 +6,10 @@


 from llama_stack.providers.datatypes import (
-    AdapterSpec,
    Api,
    InlineProviderSpec,
    ProviderSpec,
-    remote_provider_spec,
+    RemoteProviderSpec,
 )


@ -48,35 +47,32 @@ def available_providers() -> list[ProviderSpec]:
            config_class="llama_stack.providers.inline.safety.code_scanner.CodeScannerConfig",
            description="Code Scanner safety provider for detecting security vulnerabilities and unsafe code patterns.",
        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.safety,
-            adapter=AdapterSpec(
            adapter_type="bedrock",
+            provider_type="remote::bedrock",
            pip_packages=["boto3"],
            module="llama_stack.providers.remote.safety.bedrock",
            config_class="llama_stack.providers.remote.safety.bedrock.BedrockSafetyConfig",
            description="AWS Bedrock safety provider for content moderation using AWS's safety services.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.safety,
-            adapter=AdapterSpec(
            adapter_type="nvidia",
+            provider_type="remote::nvidia",
            pip_packages=["requests"],
            module="llama_stack.providers.remote.safety.nvidia",
            config_class="llama_stack.providers.remote.safety.nvidia.NVIDIASafetyConfig",
            description="NVIDIA's safety provider for content moderation and safety filtering.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.safety,
-            adapter=AdapterSpec(
            adapter_type="sambanova",
+            provider_type="remote::sambanova",
            pip_packages=["litellm", "requests"],
            module="llama_stack.providers.remote.safety.sambanova",
            config_class="llama_stack.providers.remote.safety.sambanova.SambaNovaSafetyConfig",
            provider_data_validator="llama_stack.providers.remote.safety.sambanova.config.SambaNovaProviderDataValidator",
            description="SambaNova's safety provider for content moderation and safety filtering.",
        ),
-        ),
    ]
--- a/llama_stack/providers/registry/tool_runtime.py
+++ b/llama_stack/providers/registry/tool_runtime.py
@ -6,11 +6,10 @@


 from llama_stack.providers.datatypes import (
-    AdapterSpec,
    Api,
    InlineProviderSpec,
    ProviderSpec,
-    remote_provider_spec,
+    RemoteProviderSpec,
 )


@ -35,59 +34,54 @@ def available_providers() -> list[ProviderSpec]:
            api_dependencies=[Api.vector_io, Api.inference, Api.files],
            description="RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search.",
        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.tool_runtime,
-            adapter=AdapterSpec(
            adapter_type="brave-search",
+            provider_type="remote::brave-search",
            module="llama_stack.providers.remote.tool_runtime.brave_search",
            config_class="llama_stack.providers.remote.tool_runtime.brave_search.config.BraveSearchToolConfig",
            pip_packages=["requests"],
            provider_data_validator="llama_stack.providers.remote.tool_runtime.brave_search.BraveSearchToolProviderDataValidator",
            description="Brave Search tool for web search capabilities with privacy-focused results.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.tool_runtime,
-            adapter=AdapterSpec(
            adapter_type="bing-search",
+            provider_type="remote::bing-search",
            module="llama_stack.providers.remote.tool_runtime.bing_search",
            config_class="llama_stack.providers.remote.tool_runtime.bing_search.config.BingSearchToolConfig",
            pip_packages=["requests"],
            provider_data_validator="llama_stack.providers.remote.tool_runtime.bing_search.BingSearchToolProviderDataValidator",
            description="Bing Search tool for web search capabilities using Microsoft's search engine.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.tool_runtime,
-            adapter=AdapterSpec(
            adapter_type="tavily-search",
+            provider_type="remote::tavily-search",
            module="llama_stack.providers.remote.tool_runtime.tavily_search",
            config_class="llama_stack.providers.remote.tool_runtime.tavily_search.config.TavilySearchToolConfig",
            pip_packages=["requests"],
            provider_data_validator="llama_stack.providers.remote.tool_runtime.tavily_search.TavilySearchToolProviderDataValidator",
            description="Tavily Search tool for AI-optimized web search with structured results.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.tool_runtime,
-            adapter=AdapterSpec(
            adapter_type="wolfram-alpha",
+            provider_type="remote::wolfram-alpha",
            module="llama_stack.providers.remote.tool_runtime.wolfram_alpha",
            config_class="llama_stack.providers.remote.tool_runtime.wolfram_alpha.config.WolframAlphaToolConfig",
            pip_packages=["requests"],
            provider_data_validator="llama_stack.providers.remote.tool_runtime.wolfram_alpha.WolframAlphaToolProviderDataValidator",
            description="Wolfram Alpha tool for computational knowledge and mathematical calculations.",
        ),
-        ),
-        remote_provider_spec(
+        RemoteProviderSpec(
            api=Api.tool_runtime,
-            adapter=AdapterSpec(
            adapter_type="model-context-protocol",
+            provider_type="remote::model-context-protocol",
            module="llama_stack.providers.remote.tool_runtime.model_context_protocol",
            config_class="llama_stack.providers.remote.tool_runtime.model_context_protocol.config.MCPProviderConfig",
            pip_packages=["mcp>=1.8.1"],
            provider_data_validator="llama_stack.providers.remote.tool_runtime.model_context_protocol.config.MCPProviderDataValidator",
            description="Model Context Protocol (MCP) tool for standardized tool calling and context management.",
        ),
-        ),
    ]
--- a/llama_stack/providers/registry/vector_io.py
+++ b/llama_stack/providers/registry/vector_io.py
@ -6,11 +6,10 @@


 from llama_stack.providers.datatypes import (
-    AdapterSpec,
    Api,
    InlineProviderSpec,
    ProviderSpec,
-    remote_provider_spec,
+    RemoteProviderSpec,
 )


@ -300,13 +299,15 @@ See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) f
 Please refer to the sqlite-vec provider documentation.
 """,
        ),
-        remote_provider_spec(
-            Api.vector_io,
-            AdapterSpec(
+        RemoteProviderSpec(
+            api=Api.vector_io,
            adapter_type="chromadb",
+            provider_type="remote::chromadb",
            pip_packages=["chromadb-client"],
            module="llama_stack.providers.remote.vector_io.chroma",
            config_class="llama_stack.providers.remote.vector_io.chroma.ChromaVectorIOConfig",
+            api_dependencies=[Api.inference],
+            optional_api_dependencies=[Api.files],
            description="""
 [Chroma](https://www.trychroma.com/) is an inline and remote vector
 database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database.
@ -341,9 +342,6 @@ pip install chromadb
 See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introduction) for more details about Chroma in general.
 """,
        ),
-            api_dependencies=[Api.inference],
-            optional_api_dependencies=[Api.files],
-        ),
        InlineProviderSpec(
            api=Api.vector_io,
            provider_type="inline::chromadb",
@ -387,13 +385,15 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti

 """,
        ),
-        remote_provider_spec(
-            Api.vector_io,
-            AdapterSpec(
+        RemoteProviderSpec(
+            api=Api.vector_io,
            adapter_type="pgvector",
+            provider_type="remote::pgvector",
            pip_packages=["psycopg2-binary"],
            module="llama_stack.providers.remote.vector_io.pgvector",
            config_class="llama_stack.providers.remote.vector_io.pgvector.PGVectorVectorIOConfig",
+            api_dependencies=[Api.inference],
+            optional_api_dependencies=[Api.files],
            description="""
 [PGVector](https://github.com/pgvector/pgvector) is a remote vector database provider for Llama Stack. It
 allows you to store and query vectors directly in memory.
@ -496,17 +496,16 @@ docker pull pgvector/pgvector:pg17
 See [PGVector's documentation](https://github.com/pgvector/pgvector) for more details about PGVector in general.
 """,
        ),
-            api_dependencies=[Api.inference],
-            optional_api_dependencies=[Api.files],
-        ),
-        remote_provider_spec(
-            Api.vector_io,
-            AdapterSpec(
+        RemoteProviderSpec(
+            api=Api.vector_io,
            adapter_type="weaviate",
+            provider_type="remote::weaviate",
            pip_packages=["weaviate-client"],
            module="llama_stack.providers.remote.vector_io.weaviate",
            config_class="llama_stack.providers.remote.vector_io.weaviate.WeaviateVectorIOConfig",
            provider_data_validator="llama_stack.providers.remote.vector_io.weaviate.WeaviateRequestProviderData",
+            api_dependencies=[Api.inference],
+            optional_api_dependencies=[Api.files],
            description="""
 [Weaviate](https://weaviate.io/) is a vector database provider for Llama Stack.
 It allows you to store and query vectors directly within a Weaviate database.
@ -539,9 +538,6 @@ To install Weaviate see the [Weaviate quickstart documentation](https://weaviate
 See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more details about Weaviate in general.
 """,
        ),
-            api_dependencies=[Api.inference],
-            optional_api_dependencies=[Api.files],
-        ),
        InlineProviderSpec(
            api=Api.vector_io,
            provider_type="inline::qdrant",
@ -594,27 +590,28 @@ docker pull qdrant/qdrant
 See the [Qdrant documentation](https://qdrant.tech/documentation/) for more details about Qdrant in general.
 """,
        ),
-        remote_provider_spec(
-            Api.vector_io,
-            AdapterSpec(
+        RemoteProviderSpec(
+            api=Api.vector_io,
            adapter_type="qdrant",
+            provider_type="remote::qdrant",
            pip_packages=["qdrant-client"],
            module="llama_stack.providers.remote.vector_io.qdrant",
            config_class="llama_stack.providers.remote.vector_io.qdrant.QdrantVectorIOConfig",
+            api_dependencies=[Api.inference],
+            optional_api_dependencies=[Api.files],
            description="""
 Please refer to the inline provider documentation.
 """,
        ),
-            api_dependencies=[Api.inference],
-            optional_api_dependencies=[Api.files],
-        ),
-        remote_provider_spec(
-            Api.vector_io,
-            AdapterSpec(
+        RemoteProviderSpec(
+            api=Api.vector_io,
            adapter_type="milvus",
+            provider_type="remote::milvus",
            pip_packages=["pymilvus>=2.4.10"],
            module="llama_stack.providers.remote.vector_io.milvus",
            config_class="llama_stack.providers.remote.vector_io.milvus.MilvusVectorIOConfig",
+            api_dependencies=[Api.inference],
+            optional_api_dependencies=[Api.files],
            description="""
 [Milvus](https://milvus.io/) is an inline and remote vector database provider for Llama Stack. It
 allows you to store and query vectors directly within a Milvus database.
@ -807,9 +804,6 @@ See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for m
 For more details on TLS configuration, refer to the [TLS setup guide](https://milvus.io/docs/tls.md).
 """,
        ),
-            api_dependencies=[Api.inference],
-            optional_api_dependencies=[Api.files],
-        ),
        InlineProviderSpec(
            api=Api.vector_io,
            provider_type="inline::milvus",
--- a/llama_stack/providers/remote/eval/nvidia/eval.py
+++ b/llama_stack/providers/remote/eval/nvidia/eval.py
@ -51,18 +51,23 @@ class NVIDIAEvalImpl(

    async def shutdown(self) -> None: ...

-    async def _evaluator_get(self, path):
+    async def _evaluator_get(self, path: str):
        """Helper for making GET requests to the evaluator service."""
        response = requests.get(url=f"{self.config.evaluator_url}{path}")
        response.raise_for_status()
        return response.json()

-    async def _evaluator_post(self, path, data):
+    async def _evaluator_post(self, path: str, data: dict[str, Any]):
        """Helper for making POST requests to the evaluator service."""
        response = requests.post(url=f"{self.config.evaluator_url}{path}", json=data)
        response.raise_for_status()
        return response.json()

+    async def _evaluator_delete(self, path: str) -> None:
+        """Helper for making DELETE requests to the evaluator service."""
+        response = requests.delete(url=f"{self.config.evaluator_url}{path}")
+        response.raise_for_status()
+
    async def register_benchmark(self, task_def: Benchmark) -> None:
        """Register a benchmark as an evaluation configuration."""
        await self._evaluator_post(
@ -75,6 +80,10 @@ class NVIDIAEvalImpl(
            },
        )

+    async def unregister_benchmark(self, benchmark_id: str) -> None:
+        """Unregister a benchmark evaluation configuration from NeMo Evaluator."""
+        await self._evaluator_delete(f"/v1/evaluation/configs/{DEFAULT_NAMESPACE}/{benchmark_id}")
+
    async def run_eval(
        self,
        benchmark_id: str,
--- a/llama_stack/providers/remote/inference/ollama/ollama.py
+++ b/llama_stack/providers/remote/inference/ollama/ollama.py
@ -7,12 +7,10 @@

 import asyncio
 import base64
-import uuid
 from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any

-from ollama import AsyncClient  # type: ignore[attr-defined]
-from openai import AsyncOpenAI
+from ollama import AsyncClient as AsyncOllamaClient

 from llama_stack.apis.common.content_types import (
    ImageContentItem,
@ -37,9 +35,6 @@ from llama_stack.apis.inference import (
    Message,
    OpenAIChatCompletion,
    OpenAIChatCompletionChunk,
-    OpenAICompletion,
-    OpenAIEmbeddingsResponse,
-    OpenAIEmbeddingUsage,
    OpenAIMessageParam,
    OpenAIResponseFormatParam,
    ResponseFormat,
@ -64,15 +59,14 @@ from llama_stack.providers.utils.inference.model_registry import (
 from llama_stack.providers.utils.inference.openai_compat import (
    OpenAICompatCompletionChoice,
    OpenAICompatCompletionResponse,
-    b64_encode_openai_embeddings_response,
    get_sampling_options,
    prepare_openai_completion_params,
-    prepare_openai_embeddings_params,
    process_chat_completion_response,
    process_chat_completion_stream_response,
    process_completion_response,
    process_completion_stream_response,
 )
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 from llama_stack.providers.utils.inference.prompt_adapter import (
    chat_completion_request_to_prompt,
    completion_request_to_prompt,
@ -89,6 +83,7 @@ logger = get_logger(name=__name__, category="inference::ollama")


 class OllamaInferenceAdapter(
+    OpenAIMixin,
    InferenceProvider,
    ModelsProtocolPrivate,
 ):
@ -98,23 +93,21 @@ class OllamaInferenceAdapter(
    def __init__(self, config: OllamaImplConfig) -> None:
        self.register_helper = ModelRegistryHelper(MODEL_ENTRIES)
        self.config = config
-        self._clients: dict[asyncio.AbstractEventLoop, AsyncClient] = {}
-        self._openai_client = None
+        self._clients: dict[asyncio.AbstractEventLoop, AsyncOllamaClient] = {}

    @property
-    def client(self) -> AsyncClient:
+    def ollama_client(self) -> AsyncOllamaClient:
        # ollama client attaches itself to the current event loop (sadly?)
        loop = asyncio.get_running_loop()
        if loop not in self._clients:
-            self._clients[loop] = AsyncClient(host=self.config.url)
+            self._clients[loop] = AsyncOllamaClient(host=self.config.url)
        return self._clients[loop]

-    @property
-    def openai_client(self) -> AsyncOpenAI:
-        if self._openai_client is None:
-            url = self.config.url.rstrip("/")
-            self._openai_client = AsyncOpenAI(base_url=f"{url}/v1", api_key="ollama")
-        return self._openai_client
+    def get_api_key(self):
+        return "NO_KEY"
+
+    def get_base_url(self):
+        return self.config.url.rstrip("/") + "/v1"

    async def initialize(self) -> None:
        logger.info(f"checking connectivity to Ollama at `{self.config.url}`...")
@ -129,7 +122,7 @@ class OllamaInferenceAdapter(

    async def list_models(self) -> list[Model] | None:
        provider_id = self.__provider_id__
-        response = await self.client.list()
+        response = await self.ollama_client.list()

        # always add the two embedding models which can be pulled on demand
        models = [
@ -189,7 +182,7 @@ class OllamaInferenceAdapter(
            HealthResponse: A dictionary containing the health status.
        """
        try:
-            await self.client.ps()
+            await self.ollama_client.ps()
            return HealthResponse(status=HealthStatus.OK)
        except Exception as e:
            return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}")
@ -238,7 +231,7 @@ class OllamaInferenceAdapter(
        params = await self._get_params(request)

        async def _generate_and_convert_to_openai_compat():
-            s = await self.client.generate(**params)
+            s = await self.ollama_client.generate(**params)
            async for chunk in s:
                choice = OpenAICompatCompletionChoice(
                    finish_reason=chunk["done_reason"] if chunk["done"] else None,
@ -254,7 +247,7 @@ class OllamaInferenceAdapter(

    async def _nonstream_completion(self, request: CompletionRequest) -> CompletionResponse:
        params = await self._get_params(request)
-        r = await self.client.generate(**params)
+        r = await self.ollama_client.generate(**params)

        choice = OpenAICompatCompletionChoice(
            finish_reason=r["done_reason"] if r["done"] else None,
@ -346,9 +339,9 @@ class OllamaInferenceAdapter(
    async def _nonstream_chat_completion(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
        params = await self._get_params(request)
        if "messages" in params:
-            r = await self.client.chat(**params)
+            r = await self.ollama_client.chat(**params)
        else:
-            r = await self.client.generate(**params)
+            r = await self.ollama_client.generate(**params)

        if "message" in r:
            choice = OpenAICompatCompletionChoice(
@ -372,9 +365,9 @@ class OllamaInferenceAdapter(

        async def _generate_and_convert_to_openai_compat():
            if "messages" in params:
-                s = await self.client.chat(**params)
+                s = await self.ollama_client.chat(**params)
            else:
-                s = await self.client.generate(**params)
+                s = await self.ollama_client.generate(**params)
            async for chunk in s:
                if "message" in chunk:
                    choice = OpenAICompatCompletionChoice(
@ -407,7 +400,7 @@ class OllamaInferenceAdapter(
        assert all(not content_has_media(content) for content in contents), (
            "Ollama does not support media for embeddings"
        )
-        response = await self.client.embed(
+        response = await self.ollama_client.embed(
            model=model.provider_resource_id,
            input=[interleaved_content_as_str(content) for content in contents],
        )
@ -422,14 +415,14 @@ class OllamaInferenceAdapter(
            pass  # Ignore statically unknown model, will check live listing

        if model.model_type == ModelType.embedding:
-            response = await self.client.list()
+            response = await self.ollama_client.list()
            if model.provider_resource_id not in [m.model for m in response.models]:
-                await self.client.pull(model.provider_resource_id)
+                await self.ollama_client.pull(model.provider_resource_id)

        # we use list() here instead of ps() -
        #  - ps() only lists running models, not available models
        #  - models not currently running are run by the ollama server as needed
-        response = await self.client.list()
+        response = await self.ollama_client.list()
        available_models = [m.model for m in response.models]

        provider_resource_id = model.provider_resource_id
@ -448,90 +441,6 @@ class OllamaInferenceAdapter(

        return model

-    async def openai_embeddings(
-        self,
-        model: str,
-        input: str | list[str],
-        encoding_format: str | None = "float",
-        dimensions: int | None = None,
-        user: str | None = None,
-    ) -> OpenAIEmbeddingsResponse:
-        model_obj = await self._get_model(model)
-        if model_obj.provider_resource_id is None:
-            raise ValueError(f"Model {model} has no provider_resource_id set")
-
-        # Note, at the moment Ollama does not support encoding_format, dimensions, and user parameters
-        params = prepare_openai_embeddings_params(
-            model=model_obj.provider_resource_id,
-            input=input,
-            encoding_format=encoding_format,
-            dimensions=dimensions,
-            user=user,
-        )
-
-        response = await self.openai_client.embeddings.create(**params)
-        data = b64_encode_openai_embeddings_response(response.data, encoding_format)
-
-        usage = OpenAIEmbeddingUsage(
-            prompt_tokens=response.usage.prompt_tokens,
-            total_tokens=response.usage.total_tokens,
-        )
-        # TODO: Investigate why model_obj.identifier is used instead of response.model
-        return OpenAIEmbeddingsResponse(
-            data=data,
-            model=model_obj.identifier,
-            usage=usage,
-        )
-
-    async def openai_completion(
-        self,
-        model: str,
-        prompt: str | list[str] | list[int] | list[list[int]],
-        best_of: int | None = None,
-        echo: bool | None = None,
-        frequency_penalty: float | None = None,
-        logit_bias: dict[str, float] | None = None,
-        logprobs: bool | None = None,
-        max_tokens: int | None = None,
-        n: int | None = None,
-        presence_penalty: float | None = None,
-        seed: int | None = None,
-        stop: str | list[str] | None = None,
-        stream: bool | None = None,
-        stream_options: dict[str, Any] | None = None,
-        temperature: float | None = None,
-        top_p: float | None = None,
-        user: str | None = None,
-        guided_choice: list[str] | None = None,
-        prompt_logprobs: int | None = None,
-        suffix: str | None = None,
-    ) -> OpenAICompletion:
-        if not isinstance(prompt, str):
-            raise ValueError("Ollama does not support non-string prompts for completion")
-
-        model_obj = await self._get_model(model)
-        params = await prepare_openai_completion_params(
-            model=model_obj.provider_resource_id,
-            prompt=prompt,
-            best_of=best_of,
-            echo=echo,
-            frequency_penalty=frequency_penalty,
-            logit_bias=logit_bias,
-            logprobs=logprobs,
-            max_tokens=max_tokens,
-            n=n,
-            presence_penalty=presence_penalty,
-            seed=seed,
-            stop=stop,
-            stream=stream,
-            stream_options=stream_options,
-            temperature=temperature,
-            top_p=top_p,
-            user=user,
-            suffix=suffix,
-        )
-        return await self.openai_client.completions.create(**params)  # type: ignore
-
    async def openai_chat_completion(
        self,
        model: str,
@ -599,25 +508,7 @@ class OllamaInferenceAdapter(
            top_p=top_p,
            user=user,
        )
-        response = await self.openai_client.chat.completions.create(**params)
-        return await self._adjust_ollama_chat_completion_response_ids(response)
-
-    async def _adjust_ollama_chat_completion_response_ids(
-        self,
-        response: OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk],
-    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        id = f"chatcmpl-{uuid.uuid4()}"
-        if isinstance(response, AsyncIterator):
-
-            async def stream_with_chunk_ids() -> AsyncIterator[OpenAIChatCompletionChunk]:
-                async for chunk in response:
-                    chunk.id = id
-                    yield chunk
-
-            return stream_with_chunk_ids()
-        else:
-            response.id = id
-            return response
+        return await OpenAIMixin.openai_chat_completion(self, **params)


 async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]:
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@ -8,6 +8,7 @@
 from collections.abc import AsyncGenerator

 from huggingface_hub import AsyncInferenceClient, HfApi
+from pydantic import SecretStr

 from llama_stack.apis.common.content_types import (
    InterleavedContent,
@ -33,6 +34,7 @@ from llama_stack.apis.inference import (
    ToolPromptFormat,
 )
 from llama_stack.apis.models import Model
+from llama_stack.apis.models.models import ModelType
 from llama_stack.log import get_logger
 from llama_stack.models.llama.sku_list import all_registered_models
 from llama_stack.providers.datatypes import ModelsProtocolPrivate
@ -41,16 +43,15 @@ from llama_stack.providers.utils.inference.model_registry import (
    build_hf_repo_model_entry,
 )
 from llama_stack.providers.utils.inference.openai_compat import (
-    OpenAIChatCompletionToLlamaStackMixin,
    OpenAICompatCompletionChoice,
    OpenAICompatCompletionResponse,
-    OpenAICompletionToLlamaStackMixin,
    get_sampling_options,
    process_chat_completion_response,
    process_chat_completion_stream_response,
    process_completion_response,
    process_completion_stream_response,
 )
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 from llama_stack.providers.utils.inference.prompt_adapter import (
    chat_completion_request_to_model_input_info,
    completion_request_to_prompt_model_input_info,
@ -73,26 +74,49 @@ def build_hf_repo_model_entries():


 class _HfAdapter(
+    OpenAIMixin,
    Inference,
-    OpenAIChatCompletionToLlamaStackMixin,
-    OpenAICompletionToLlamaStackMixin,
    ModelsProtocolPrivate,
 ):
-    client: AsyncInferenceClient
+    url: str
+    api_key: SecretStr
+
+    hf_client: AsyncInferenceClient
    max_tokens: int
    model_id: str

+    overwrite_completion_id = True  # TGI always returns id=""
+
    def __init__(self) -> None:
        self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
        self.huggingface_repo_to_llama_model_id = {
            model.huggingface_repo: model.descriptor() for model in all_registered_models() if model.huggingface_repo
        }

+    def get_api_key(self):
+        return self.api_key.get_secret_value()
+
+    def get_base_url(self):
+        return self.url
+
    async def shutdown(self) -> None:
        pass

+    async def list_models(self) -> list[Model] | None:
+        models = []
+        async for model in self.client.models.list():
+            models.append(
+                Model(
+                    identifier=model.id,
+                    provider_resource_id=model.id,
+                    provider_id=self.__provider_id__,
+                    metadata={},
+                    model_type=ModelType.llm,
+                )
+            )
+        return models
+
    async def register_model(self, model: Model) -> Model:
-        model = await self.register_helper.register_model(model)
        if model.provider_resource_id != self.model_id:
            raise ValueError(
                f"Model {model.provider_resource_id} does not match the model {self.model_id} served by TGI."
@ -176,7 +200,7 @@ class _HfAdapter(
        params = await self._get_params_for_completion(request)

        async def _generate_and_convert_to_openai_compat():
-            s = await self.client.text_generation(**params)
+            s = await self.hf_client.text_generation(**params)
            async for chunk in s:
                token_result = chunk.token
                finish_reason = None
@ -194,7 +218,7 @@ class _HfAdapter(

    async def _nonstream_completion(self, request: CompletionRequest) -> AsyncGenerator:
        params = await self._get_params_for_completion(request)
-        r = await self.client.text_generation(**params)
+        r = await self.hf_client.text_generation(**params)

        choice = OpenAICompatCompletionChoice(
            finish_reason=r.details.finish_reason,
@ -241,7 +265,7 @@ class _HfAdapter(

    async def _nonstream_chat_completion(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
        params = await self._get_params(request)
-        r = await self.client.text_generation(**params)
+        r = await self.hf_client.text_generation(**params)

        choice = OpenAICompatCompletionChoice(
            finish_reason=r.details.finish_reason,
@ -256,7 +280,7 @@ class _HfAdapter(
        params = await self._get_params(request)

        async def _generate_and_convert_to_openai_compat():
-            s = await self.client.text_generation(**params)
+            s = await self.hf_client.text_generation(**params)
            async for chunk in s:
                token_result = chunk.token

@ -308,18 +332,21 @@ class TGIAdapter(_HfAdapter):
        if not config.url:
            raise ValueError("You must provide a URL in run.yaml (or via the TGI_URL environment variable) to use TGI.")
        log.info(f"Initializing TGI client with url={config.url}")
-        self.client = AsyncInferenceClient(model=config.url, provider="hf-inference")
-        endpoint_info = await self.client.get_endpoint_info()
+        self.hf_client = AsyncInferenceClient(model=config.url, provider="hf-inference")
+        endpoint_info = await self.hf_client.get_endpoint_info()
        self.max_tokens = endpoint_info["max_total_tokens"]
        self.model_id = endpoint_info["model_id"]
+        self.url = f"{config.url.rstrip('/')}/v1"
+        self.api_key = SecretStr("NO_KEY")


 class InferenceAPIAdapter(_HfAdapter):
    async def initialize(self, config: InferenceAPIImplConfig) -> None:
-        self.client = AsyncInferenceClient(model=config.huggingface_repo, token=config.api_token.get_secret_value())
-        endpoint_info = await self.client.get_endpoint_info()
+        self.hf_client = AsyncInferenceClient(model=config.huggingface_repo, token=config.api_token.get_secret_value())
+        endpoint_info = await self.hf_client.get_endpoint_info()
        self.max_tokens = endpoint_info["max_total_tokens"]
        self.model_id = endpoint_info["model_id"]
+        # TODO: how do we set url for this?


 class InferenceEndpointAdapter(_HfAdapter):
@ -331,6 +358,7 @@ class InferenceEndpointAdapter(_HfAdapter):
        endpoint.wait(timeout=60)

        # Initialize the adapter
-        self.client = endpoint.async_client
+        self.hf_client = endpoint.async_client
        self.model_id = endpoint.repository
        self.max_tokens = int(endpoint.raw["model"]["image"]["custom"]["env"]["MAX_TOTAL_TOKENS"])
+        # TODO: how do we set url for this?
--- a/llama_stack/providers/remote/inference/together/models.py
+++ b/llama_stack/providers/remote/inference/together/models.py
@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_stack.apis.models import ModelType
 from llama_stack.models.llama.sku_types import CoreModelId
 from llama_stack.providers.utils.inference.model_registry import (
    ProviderModelEntry,
@ -21,7 +20,47 @@ SAFETY_MODELS_ENTRIES = [
        CoreModelId.llama_guard_3_11b_vision.value,
    ),
 ]
-MODEL_ENTRIES = [
+
+# source: https://docs.together.ai/docs/serverless-models#embedding-models
+EMBEDDING_MODEL_ENTRIES = {
+    "togethercomputer/m2-bert-80M-32k-retrieval": ProviderModelEntry(
+        provider_model_id="togethercomputer/m2-bert-80M-32k-retrieval",
+        metadata={
+            "embedding_dimension": 768,
+            "context_length": 32768,
+        },
+    ),
+    "BAAI/bge-large-en-v1.5": ProviderModelEntry(
+        provider_model_id="BAAI/bge-large-en-v1.5",
+        metadata={
+            "embedding_dimension": 1024,
+            "context_length": 512,
+        },
+    ),
+    "BAAI/bge-base-en-v1.5": ProviderModelEntry(
+        provider_model_id="BAAI/bge-base-en-v1.5",
+        metadata={
+            "embedding_dimension": 768,
+            "context_length": 512,
+        },
+    ),
+    "Alibaba-NLP/gte-modernbert-base": ProviderModelEntry(
+        provider_model_id="Alibaba-NLP/gte-modernbert-base",
+        metadata={
+            "embedding_dimension": 768,
+            "context_length": 8192,
+        },
+    ),
+    "intfloat/multilingual-e5-large-instruct": ProviderModelEntry(
+        provider_model_id="intfloat/multilingual-e5-large-instruct",
+        metadata={
+            "embedding_dimension": 1024,
+            "context_length": 512,
+        },
+    ),
+}
+MODEL_ENTRIES = (
+    [
        build_hf_repo_model_entry(
            "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
            CoreModelId.llama3_1_8b_instruct.value,
@ -50,22 +89,6 @@ MODEL_ENTRIES = [
            "meta-llama/Llama-3.3-70B-Instruct-Turbo",
            CoreModelId.llama3_3_70b_instruct.value,
        ),
-    ProviderModelEntry(
-        provider_model_id="togethercomputer/m2-bert-80M-8k-retrieval",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 768,
-            "context_length": 8192,
-        },
-    ),
-    ProviderModelEntry(
-        provider_model_id="togethercomputer/m2-bert-80M-32k-retrieval",
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 768,
-            "context_length": 32768,
-        },
-    ),
        build_hf_repo_model_entry(
            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            CoreModelId.llama4_scout_17b_16e_instruct.value,
@ -74,4 +97,7 @@ MODEL_ENTRIES = [
            "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
            CoreModelId.llama4_maverick_17b_128e_instruct.value,
        ),
-] + SAFETY_MODELS_ENTRIES
+    ]
+    + SAFETY_MODELS_ENTRIES
+    + list(EMBEDDING_MODEL_ENTRIES.values())
+)
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@ -4,11 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from collections.abc import AsyncGenerator, AsyncIterator
-from typing import Any
+from collections.abc import AsyncGenerator

-from openai import AsyncOpenAI
+from openai import NOT_GIVEN, AsyncOpenAI
 from together import AsyncTogether
+from together.constants import BASE_URL

 from llama_stack.apis.common.content_types import (
    InterleavedContent,
@ -23,12 +23,7 @@ from llama_stack.apis.inference import (
    Inference,
    LogProbConfig,
    Message,
-    OpenAIChatCompletion,
-    OpenAIChatCompletionChunk,
-    OpenAICompletion,
    OpenAIEmbeddingsResponse,
-    OpenAIMessageParam,
-    OpenAIResponseFormatParam,
    ResponseFormat,
    ResponseFormatType,
    SamplingParams,
@ -38,18 +33,20 @@ from llama_stack.apis.inference import (
    ToolDefinition,
    ToolPromptFormat,
 )
+from llama_stack.apis.inference.inference import OpenAIEmbeddingUsage
+from llama_stack.apis.models import Model, ModelType
 from llama_stack.core.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 from llama_stack.providers.utils.inference.openai_compat import (
    convert_message_to_openai_dict,
    get_sampling_options,
-    prepare_openai_completion_params,
    process_chat_completion_response,
    process_chat_completion_stream_response,
    process_completion_response,
    process_completion_stream_response,
 )
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 from llama_stack.providers.utils.inference.prompt_adapter import (
    chat_completion_request_to_prompt,
    completion_request_to_prompt,
@ -59,15 +56,22 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
 )

 from .config import TogetherImplConfig
-from .models import MODEL_ENTRIES
+from .models import EMBEDDING_MODEL_ENTRIES, MODEL_ENTRIES

 logger = get_logger(name=__name__, category="inference::together")


-class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProviderData):
+class TogetherInferenceAdapter(OpenAIMixin, ModelRegistryHelper, Inference, NeedsRequestProviderData):
    def __init__(self, config: TogetherImplConfig) -> None:
        ModelRegistryHelper.__init__(self, MODEL_ENTRIES, config.allowed_models)
        self.config = config
+        self._model_cache: dict[str, Model] = {}
+
+    def get_api_key(self):
+        return self.config.api_key.get_secret_value()
+
+    def get_base_url(self):
+        return BASE_URL

    async def initialize(self) -> None:
        pass
@ -255,6 +259,37 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
        embeddings = [item.embedding for item in r.data]
        return EmbeddingsResponse(embeddings=embeddings)

+    async def list_models(self) -> list[Model] | None:
+        """
+        List the models served by Together and rebuild self._model_cache from them.
+
+        Embedding models that are missing from EMBEDDING_MODEL_ENTRIES are skipped with a
+        warning; every other model returned by Together is cached as ModelType.llm.
+
+        :return: A list of Model instances, one per cached model.
+        """
+        self._model_cache = {}
+        # Together's /v1/models is not compatible with OpenAI's /v1/models. Together support ticket #13355 -> will not fix, use Together's own client
+        for m in await self._get_client().models.list():
+            if m.type == "embedding":
+                if m.id not in EMBEDDING_MODEL_ENTRIES:
+                    logger.warning(f"Unknown embedding dimension for model {m.id}, skipping.")
+                    continue
+                self._model_cache[m.id] = Model(
+                    provider_id=self.__provider_id__,
+                    provider_resource_id=EMBEDDING_MODEL_ENTRIES[m.id].provider_model_id,
+                    identifier=m.id,
+                    model_type=ModelType.embedding,
+                    metadata=EMBEDDING_MODEL_ENTRIES[m.id].metadata,
+                )
+            else:
+                self._model_cache[m.id] = Model(
+                    provider_id=self.__provider_id__,
+                    provider_resource_id=m.id,
+                    identifier=m.id,
+                    model_type=ModelType.llm,
+                )
+
+        # Return a real list (not a live dict view) to honor the declared return type
+        # and to match OpenAIMixin.list_models().
+        return list(self._model_cache.values())
+
+    async def should_refresh_models(self) -> bool:
+        return True
+
+    async def check_model_availability(self, model):
+        """
+        Check whether a model id is present in the cache built by list_models().
+
+        :param model: The model identifier to check.
+        :return: True if the model id is in the cache, False otherwise.
+        """
+        # Populate the cache lazily, as OpenAIMixin.check_model_availability does, so a
+        # lookup made before the first list_models() call does not report every model
+        # as unavailable.
+        if not self._model_cache:
+            await self.list_models()
+
+        return model in self._model_cache
+
    async def openai_embeddings(
        self,
        model: str,
@ -263,125 +298,39 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
        dimensions: int | None = None,
        user: str | None = None,
    ) -> OpenAIEmbeddingsResponse:
-        raise NotImplementedError()
+        """
+        Together's OpenAI-compatible embeddings endpoint is not compatible with
+        the standard OpenAI embeddings endpoint.

-    async def openai_completion(
-        self,
-        model: str,
-        prompt: str | list[str] | list[int] | list[list[int]],
-        best_of: int | None = None,
-        echo: bool | None = None,
-        frequency_penalty: float | None = None,
-        logit_bias: dict[str, float] | None = None,
-        logprobs: bool | None = None,
-        max_tokens: int | None = None,
-        n: int | None = None,
-        presence_penalty: float | None = None,
-        seed: int | None = None,
-        stop: str | list[str] | None = None,
-        stream: bool | None = None,
-        stream_options: dict[str, Any] | None = None,
-        temperature: float | None = None,
-        top_p: float | None = None,
-        user: str | None = None,
-        guided_choice: list[str] | None = None,
-        prompt_logprobs: int | None = None,
-        suffix: str | None = None,
-    ) -> OpenAICompletion:
-        model_obj = await self.model_store.get_model(model)
-        params = await prepare_openai_completion_params(
-            model=model_obj.provider_resource_id,
-            prompt=prompt,
-            best_of=best_of,
-            echo=echo,
-            frequency_penalty=frequency_penalty,
-            logit_bias=logit_bias,
-            logprobs=logprobs,
-            max_tokens=max_tokens,
-            n=n,
-            presence_penalty=presence_penalty,
-            seed=seed,
-            stop=stop,
-            stream=stream,
-            stream_options=stream_options,
-            temperature=temperature,
-            top_p=top_p,
-            user=user,
+        The endpoint -
+         - does not return usage information
+         - does not support user param, returns 400 Unrecognized request arguments supplied: user
+         - does not support dimensions param, returns 400 Unrecognized request arguments supplied: dimensions
+         - does not support encoding_format param, always returns floats, never base64
+        """
+        # Together support ticket #13332 -> will not fix
+        if user is not None:
+            raise ValueError("Together's embeddings endpoint does not support user param.")
+        # Together support ticket #13333 -> escalated
+        if dimensions is not None:
+            raise ValueError("Together's embeddings endpoint does not support dimensions param.")
+        # Together support ticket #13331 -> will not fix, compute client side
+        if encoding_format not in (None, NOT_GIVEN, "float"):
+            raise ValueError("Together's embeddings endpoint only supports encoding_format='float'.")
+
+        response = await self.client.embeddings.create(
+            model=await self._get_provider_model_id(model),
+            input=input,
        )
-        return await self._get_openai_client().completions.create(**params)  # type: ignore

-    async def openai_chat_completion(
-        self,
-        model: str,
-        messages: list[OpenAIMessageParam],
-        frequency_penalty: float | None = None,
-        function_call: str | dict[str, Any] | None = None,
-        functions: list[dict[str, Any]] | None = None,
-        logit_bias: dict[str, float] | None = None,
-        logprobs: bool | None = None,
-        max_completion_tokens: int | None = None,
-        max_tokens: int | None = None,
-        n: int | None = None,
-        parallel_tool_calls: bool | None = None,
-        presence_penalty: float | None = None,
-        response_format: OpenAIResponseFormatParam | None = None,
-        seed: int | None = None,
-        stop: str | list[str] | None = None,
-        stream: bool | None = None,
-        stream_options: dict[str, Any] | None = None,
-        temperature: float | None = None,
-        tool_choice: str | dict[str, Any] | None = None,
-        tools: list[dict[str, Any]] | None = None,
-        top_logprobs: int | None = None,
-        top_p: float | None = None,
-        user: str | None = None,
-    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        model_obj = await self.model_store.get_model(model)
-        params = await prepare_openai_completion_params(
-            model=model_obj.provider_resource_id,
-            messages=messages,
-            frequency_penalty=frequency_penalty,
-            function_call=function_call,
-            functions=functions,
-            logit_bias=logit_bias,
-            logprobs=logprobs,
-            max_completion_tokens=max_completion_tokens,
-            max_tokens=max_tokens,
-            n=n,
-            parallel_tool_calls=parallel_tool_calls,
-            presence_penalty=presence_penalty,
-            response_format=response_format,
-            seed=seed,
-            stop=stop,
-            stream=stream,
-            stream_options=stream_options,
-            temperature=temperature,
-            tool_choice=tool_choice,
-            tools=tools,
-            top_logprobs=top_logprobs,
-            top_p=top_p,
-            user=user,
+        response.model = model  # return the user the same model id they provided, avoid exposing the provider model id
+
+        # Together support ticket #13330 -> escalated
+        #  - togethercomputer/m2-bert-80M-32k-retrieval *does not* return usage information
+        if not hasattr(response, "usage") or response.usage is None:
+            logger.warning(
+                f"Together's embedding endpoint for {model} did not return usage information, substituting -1s."
            )
-        if params.get("stream", False):
-            return self._stream_openai_chat_completion(params)
-        return await self._get_openai_client().chat.completions.create(**params)  # type: ignore
+            response.usage = OpenAIEmbeddingUsage(prompt_tokens=-1, total_tokens=-1)

-    async def _stream_openai_chat_completion(self, params: dict) -> AsyncGenerator:
-        # together.ai sometimes adds usage data to the stream, even if include_usage is False
-        # This causes an unexpected final chunk with empty choices array to be sent
-        # to clients that may not handle it gracefully.
-        include_usage = False
-        if params.get("stream_options", None):
-            include_usage = params["stream_options"].get("include_usage", False)
-        stream = await self._get_openai_client().chat.completions.create(**params)
-
-        seen_finish_reason = False
-        async for chunk in stream:
-            # Final usage chunk with no choices that the user didn't request, so discard
-            if not include_usage and seen_finish_reason and len(chunk.choices) == 0:
-                break
-            yield chunk
-            for choice in chunk.choices:
-                if choice.finish_reason:
-                    seen_finish_reason = True
-                    break
+        return response
--- a/llama_stack/providers/remote/inference/vllm/init.py
+++ b/llama_stack/providers/remote/inference/vllm/init.py
@ -4,9 +4,15 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+from pydantic import BaseModel
+
 from .config import VLLMInferenceAdapterConfig


+class VLLMProviderDataValidator(BaseModel):
+    vllm_api_token: str | None = None
+
+
 async def get_adapter_impl(config: VLLMInferenceAdapterConfig, _deps):
    from .vllm import VLLMInferenceAdapter

--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@ -4,8 +4,9 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import json
-from collections.abc import AsyncGenerator
+from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any
+from urllib.parse import urljoin

 import httpx
 from openai import APIConnectionError, AsyncOpenAI
@ -55,6 +56,7 @@ from llama_stack.providers.datatypes import (
    HealthStatus,
    ModelsProtocolPrivate,
 )
+from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
 from llama_stack.providers.utils.inference.model_registry import (
    ModelRegistryHelper,
    build_hf_repo_model_entry,
@ -62,6 +64,7 @@ from llama_stack.providers.utils.inference.model_registry import (
 from llama_stack.providers.utils.inference.openai_compat import (
    UnparseableToolCall,
    convert_message_to_openai_dict,
+    convert_openai_chat_completion_stream,
    convert_tool_call,
    get_sampling_options,
    process_chat_completion_stream_response,
@ -281,15 +284,31 @@ async def _process_vllm_chat_completion_stream_response(
        yield c


-class VLLMInferenceAdapter(OpenAIMixin, Inference, ModelsProtocolPrivate):
+class VLLMInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin, Inference, ModelsProtocolPrivate):
    # automatically set by the resolver when instantiating the provider
    __provider_id__: str
    model_store: ModelStore | None = None

    def __init__(self, config: VLLMInferenceAdapterConfig) -> None:
+        LiteLLMOpenAIMixin.__init__(
+            self,
+            build_hf_repo_model_entries(),
+            litellm_provider_name="vllm",
+            api_key_from_config=config.api_token,
+            provider_data_api_key_field="vllm_api_token",
+            openai_compat_api_base=config.url,
+        )
        self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
        self.config = config

+    get_api_key = LiteLLMOpenAIMixin.get_api_key
+
+    def get_base_url(self) -> str:
+        """Get the base URL from config."""
+        if not self.config.url:
+            raise ValueError("No base URL configured")
+        return self.config.url
+
    async def initialize(self) -> None:
        if not self.config.url:
            raise ValueError(
@ -297,6 +316,7 @@ class VLLMInferenceAdapter(OpenAIMixin, Inference, ModelsProtocolPrivate):
            )

    async def should_refresh_models(self) -> bool:
+        # Strictly respecting the refresh_models directive
        return self.config.refresh_models

    async def list_models(self) -> list[Model] | None:
@ -325,12 +345,18 @@ class VLLMInferenceAdapter(OpenAIMixin, Inference, ModelsProtocolPrivate):
        Performs a health check by verifying connectivity to the remote vLLM server.
        This method is used by the Provider API to verify
        that the service is running correctly.
+        Uses the unauthenticated /health endpoint.
        Returns:

            HealthResponse: A dictionary containing the health status.
        """
        try:
-            _ = [m async for m in self.client.models.list()]  # Ensure the client is initialized
+            base_url = self.get_base_url()
+            health_url = urljoin(base_url, "health")
+
+            async with httpx.AsyncClient() as client:
+                response = await client.get(health_url)
+                response.raise_for_status()
                return HealthResponse(status=HealthStatus.OK)
        except Exception as e:
            return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}")
@ -340,16 +366,10 @@ class VLLMInferenceAdapter(OpenAIMixin, Inference, ModelsProtocolPrivate):
            raise ValueError("Model store not set")
        return await self.model_store.get_model(model_id)

-    def get_api_key(self):
-        return self.config.api_token
-
-    def get_base_url(self):
-        return self.config.url
-
    def get_extra_client_params(self):
        return {"http_client": httpx.AsyncClient(verify=self.config.tls_verify)}

-    async def completion(
+    async def completion(  # type: ignore[override]  # Return type is more specific than the base class, which allows for both streaming and non-streaming responses.
        self,
        model_id: str,
        content: InterleavedContent,
@ -411,13 +431,14 @@ class VLLMInferenceAdapter(OpenAIMixin, Inference, ModelsProtocolPrivate):
            tool_config=tool_config,
        )
        if stream:
-            return self._stream_chat_completion(request, self.client)
+            return self._stream_chat_completion_with_client(request, self.client)
        else:
            return await self._nonstream_chat_completion(request, self.client)

    async def _nonstream_chat_completion(
        self, request: ChatCompletionRequest, client: AsyncOpenAI
    ) -> ChatCompletionResponse:
+        assert self.client is not None
        params = await self._get_params(request)
        r = await client.chat.completions.create(**params)
        choice = r.choices[0]
@ -431,9 +452,24 @@ class VLLMInferenceAdapter(OpenAIMixin, Inference, ModelsProtocolPrivate):
        )
        return result

-    async def _stream_chat_completion(
+    async def _stream_chat_completion(self, response: Any) -> AsyncIterator[ChatCompletionResponseStreamChunk]:
+        # This method is called from LiteLLMOpenAIMixin.chat_completion
+        # The response parameter contains the litellm response
+        # We need to convert it to our format
+        async def _stream_generator():
+            async for chunk in response:
+                yield chunk
+
+        async for chunk in convert_openai_chat_completion_stream(
+            _stream_generator(), enable_incremental_tool_calls=True
+        ):
+            yield chunk
+
+    async def _stream_chat_completion_with_client(
        self, request: ChatCompletionRequest, client: AsyncOpenAI
    ) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
+        """Helper method for streaming with explicit client parameter."""
+        assert self.client is not None
        params = await self._get_params(request)

        stream = await client.chat.completions.create(**params)
@ -445,7 +481,8 @@ class VLLMInferenceAdapter(OpenAIMixin, Inference, ModelsProtocolPrivate):
            yield chunk

    async def _nonstream_completion(self, request: CompletionRequest) -> CompletionResponse:
-        assert self.client is not None
+        if self.client is None:
+            raise RuntimeError("Client is not initialized")
        params = await self._get_params(request)
        r = await self.client.completions.create(**params)
        return process_completion_response(r)
@ -453,7 +490,8 @@ class VLLMInferenceAdapter(OpenAIMixin, Inference, ModelsProtocolPrivate):
    async def _stream_completion(
        self, request: CompletionRequest
    ) -> AsyncGenerator[CompletionResponseStreamChunk, None]:
-        assert self.client is not None
+        if self.client is None:
+            raise RuntimeError("Client is not initialized")
        params = await self._get_params(request)

        stream = await self.client.completions.create(**params)
--- a/llama_stack/providers/remote/inference/watsonx/config.py
+++ b/llama_stack/providers/remote/inference/watsonx/config.py
@ -26,11 +26,11 @@ class WatsonXConfig(BaseModel):
    )
    api_key: SecretStr | None = Field(
        default_factory=lambda: os.getenv("WATSONX_API_KEY"),
-        description="The watsonx API key, only needed of using the hosted service",
+        description="The watsonx API key",
    )
    project_id: str | None = Field(
        default_factory=lambda: os.getenv("WATSONX_PROJECT_ID"),
-        description="The Project ID key, only needed of using the hosted service",
+        description="The Project ID key",
    )
    timeout: int = Field(
        default=60,
--- a/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py
@ -38,6 +38,7 @@ from llama_stack.apis.inference import (
    TopKSamplingStrategy,
    TopPSamplingStrategy,
 )
+from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 from llama_stack.providers.utils.inference.openai_compat import (
    OpenAICompatCompletionChoice,
@ -57,14 +58,29 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
 from . import WatsonXConfig
 from .models import MODEL_ENTRIES

+logger = get_logger(name=__name__, category="inference::watsonx")
+
+
+# Note on structured output
+# WatsonX returns responses with a json embedded into a string.
+# Examples:
+
+# ChatCompletionResponse(completion_message=CompletionMessage(content='```json\n{\n
+# "first_name": "Michael",\n  "last_name": "Jordan",\n'...)
+# Not even a valid JSON, but we can still extract the JSON from the content
+
+# CompletionResponse(content=' \nThe best answer is $\\boxed{\\{"name": "Michael Jordan",
+# "year_born": "1963", "year_retired": "2003"\\}}$')
+# Find the start of the boxed content
+

 class WatsonXInferenceAdapter(Inference, ModelRegistryHelper):
    def __init__(self, config: WatsonXConfig) -> None:
        ModelRegistryHelper.__init__(self, MODEL_ENTRIES)

-        print(f"Initializing watsonx InferenceAdapter({config.url})...")
-
+        logger.info(f"Initializing watsonx InferenceAdapter({config.url})...")
        self._config = config
+        self._openai_client: AsyncOpenAI | None = None

        self._project_id = self._config.project_id

--- a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
+++ b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
@ -5,6 +5,7 @@
 # the root directory of this source tree.

 import asyncio
+import hashlib
 import uuid
 from typing import Any

@ -49,10 +50,13 @@ def convert_id(_id: str) -> str:
    Converts any string into a UUID string based on a seed.

    Qdrant accepts UUID strings and unsigned integers as point ID.
-    We use a seed to convert each string into a UUID string deterministically.
+    We use a SHA-256 hash to convert each string into a UUID string deterministically.
    This allows us to overwrite the same point with the original ID.
    """
-    return str(uuid.uuid5(uuid.NAMESPACE_DNS, _id))
+    hash_input = f"qdrant_id:{_id}".encode()
+    sha256_hash = hashlib.sha256(hash_input).hexdigest()
+    # Use the first 32 characters to create a valid UUID
+    return str(uuid.UUID(sha256_hash[:32]))


 class QdrantIndex(EmbeddingIndex):
--- a/llama_stack/providers/utils/inference/openai_mixin.py
+++ b/llama_stack/providers/utils/inference/openai_mixin.py
@ -4,11 +4,11 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import uuid
 from abc import ABC, abstractmethod
 from collections.abc import AsyncIterator
 from typing import Any

-import openai
 from openai import NOT_GIVEN, AsyncOpenAI

 from llama_stack.apis.inference import (
@ -22,6 +22,7 @@ from llama_stack.apis.inference import (
    OpenAIMessageParam,
    OpenAIResponseFormatParam,
 )
+from llama_stack.apis.models import ModelType
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params

@ -43,6 +44,16 @@ class OpenAIMixin(ABC):
      The model_store is set in routing_tables/common.py during provider initialization.
    """

+    # Allow subclasses to control whether the 'id' field in OpenAI responses
+    # is overwritten with a client-side generated id.
+    #
+    # This is useful for providers that do not return a unique id in the response.
+    overwrite_completion_id: bool = False
+
+    # Cache of available models keyed by model ID
+    # This is set in list_models() and used in check_model_availability()
+    _model_cache: dict[str, Model] = {}
+
    @abstractmethod
    def get_api_key(self) -> str:
        """
@ -110,6 +121,23 @@ class OpenAIMixin(ABC):
            raise ValueError(f"Model {model} has no provider_resource_id")
        return model_obj.provider_resource_id

+    async def _maybe_overwrite_id(self, resp: Any, stream: bool | None) -> Any:
+        """
+        Optionally replace the 'id' of a completion response with a client-side generated one.
+
+        :param resp: The response object, or an async iterable of chunks when stream is truthy.
+        :param stream: Whether resp is a stream of chunks.
+        :return: resp unchanged when overwrite_completion_id is False; otherwise resp with its
+                 id replaced, or an async generator yielding the chunks with their ids replaced.
+        """
+        if not self.overwrite_completion_id:
+            return resp
+
+        # A single id is generated per call, so every chunk of one stream shares the same id.
+        new_id = f"cltsd-{uuid.uuid4()}"
+        if stream:
+
+            async def _gen():
+                async for chunk in resp:
+                    chunk.id = new_id
+                    yield chunk
+
+            return _gen()
+        else:
+            resp.id = new_id
+            return resp
+
    async def openai_completion(
        self,
        model: str,
@ -147,7 +175,7 @@ class OpenAIMixin(ABC):
            extra_body["guided_choice"] = guided_choice

        # TODO: fix openai_completion to return type compatible with OpenAI's API response
-        return await self.client.completions.create(  # type: ignore[no-any-return]
+        resp = await self.client.completions.create(
            **await prepare_openai_completion_params(
                model=await self._get_provider_model_id(model),
                prompt=prompt,
@ -171,6 +199,8 @@ class OpenAIMixin(ABC):
            extra_body=extra_body,
        )

+        return await self._maybe_overwrite_id(resp, stream)  # type: ignore[no-any-return]
+
    async def openai_chat_completion(
        self,
        model: str,
@ -200,8 +230,7 @@ class OpenAIMixin(ABC):
        """
        Direct OpenAI chat completion API call.
        """
-        # Type ignore because return types are compatible
-        return await self.client.chat.completions.create(  # type: ignore[no-any-return]
+        resp = await self.client.chat.completions.create(
            **await prepare_openai_completion_params(
                model=await self._get_provider_model_id(model),
                messages=messages,
@ -229,6 +258,8 @@ class OpenAIMixin(ABC):
            )
        )

+        return await self._maybe_overwrite_id(resp, stream)  # type: ignore[no-any-return]
+
    async def openai_embeddings(
        self,
        model: str,
@ -269,22 +300,35 @@ class OpenAIMixin(ABC):
            usage=usage,
        )

+    async def list_models(self) -> list[Model] | None:
+        """
+        List available models from the provider's /v1/models endpoint.
+
+        Also, caches the models in self._model_cache for use in check_model_availability().
+
+        :return: A list of Model instances representing available models.
+        """
+        self._model_cache = {
+            m.id: Model(
+                # __provider_id__ is dynamically added by instantiate_provider in resolver.py
+                provider_id=self.__provider_id__,  # type: ignore[attr-defined]
+                provider_resource_id=m.id,
+                identifier=m.id,
+                model_type=ModelType.llm,
+            )
+            async for m in self.client.models.list()
+        }
+
+        return list(self._model_cache.values())
+
    async def check_model_availability(self, model: str) -> bool:
        """
-        Check if a specific model is available from OpenAI.
+        Check if a specific model is available from the provider's /v1/models.

        :param model: The model identifier to check.
        :return: True if the model is available dynamically, False otherwise.
        """
-        try:
-            # Direct model lookup - returns model or raises NotFoundError
-            await self.client.models.retrieve(model)
-            return True
-        except openai.NotFoundError:
-            # Model doesn't exist - this is expected for unavailable models
-            pass
-        except Exception as e:
-            # All other errors (auth, rate limit, network, etc.)
-            logger.warning(f"Failed to check model availability for {model}: {e}")
+        if not self._model_cache:
+            await self.list_models()

-        return False
+        return model in self._model_cache
--- a/llama_stack/providers/utils/vector_io/vector_utils.py
+++ b/llama_stack/providers/utils/vector_io/vector_utils.py
@ -12,14 +12,12 @@ import uuid
 def generate_chunk_id(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str:
    """
    Generate a unique chunk ID using a hash of the document ID and chunk text.
-
-    Note: MD5 is used only to calculate an identifier, not for security purposes.
-    Adding usedforsecurity=False for compatibility with FIPS environments.
+    Then use the first 32 characters of the hash to create a UUID.
    """
    hash_input = f"{document_id}:{chunk_text}".encode()
    if chunk_window:
        hash_input += f":{chunk_window}".encode()
-    return str(uuid.UUID(hashlib.md5(hash_input, usedforsecurity=False).hexdigest()))
+    return str(uuid.UUID(hashlib.sha256(hash_input).hexdigest()[:32]))


 def proper_case(s: str) -> str:
--- a/llama_stack/testing/inference_recorder.py
+++ b/llama_stack/testing/inference_recorder.py
@ -15,6 +15,8 @@ from enum import StrEnum
 from pathlib import Path
 from typing import Any, Literal, cast

+from openai import NOT_GIVEN
+
 from llama_stack.log import get_logger

 logger = get_logger(__name__, category="testing")
@ -198,20 +200,15 @@ def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str:

        Supported endpoints:
        - '/api/tags' (Ollama): response body has 'models': [ { name/model/digest/id/... }, ... ]
-        - '/v1/models' (OpenAI): response body has 'data': [ { id: ... }, ... ]
+        - '/v1/models' (OpenAI): response body is: [ { id: ... }, ... ]
        Returns a list of unique identifiers or None if structure doesn't match.
        """
-        body = response["body"]
-        if endpoint == "/api/tags":
-            items = body.get("models")
-            idents = [m.model for m in items]
-        else:
-            items = body.get("data")
-            idents = [m.id for m in items]
+        items = response["body"]
+        idents = [m.model if endpoint == "/api/tags" else m.id for m in items]
        return sorted(set(idents))

    identifiers = _extract_model_identifiers()
-    return hashlib.sha1(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8]
+    return hashlib.sha256(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8]


 def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None:
@ -219,17 +216,12 @@ def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]])
    seen: dict[str, dict[str, Any]] = {}
    for rec in records:
        body = rec["response"]["body"]
-        if endpoint == "/api/tags":
-            items = body.models
-        elif endpoint == "/v1/models":
-            items = body.data
-        else:
-            items = []
-
-        for m in items:
        if endpoint == "/v1/models":
+            for m in body:
                key = m.id
-            else:
+                seen[key] = m
+        elif endpoint == "/api/tags":
+            for m in body.models:
                key = m.model
                seen[key] = m

@ -238,9 +230,8 @@ def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]])
    canonical_req = canonical.get("request", {})
    if isinstance(canonical_req, dict):
        canonical_req["endpoint"] = endpoint
-    if endpoint == "/v1/models":
-        body = {"data": ordered, "object": "list"}
-    else:
+    body = ordered
+    if endpoint == "/api/tags":
        from ollama import ListResponse

        body = ListResponse(models=ordered)
@ -251,12 +242,17 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
    global _current_mode, _current_storage

    if _current_mode == InferenceMode.LIVE or _current_storage is None:
-        # Normal operation
+        if endpoint == "/v1/models":
+            return original_method(self, *args, **kwargs)
+        else:
            return await original_method(self, *args, **kwargs)

    # Get base URL based on client type
    if client_type == "openai":
        base_url = str(self._client.base_url)
+
+        # the OpenAI client methods may pass NOT_GIVEN for unset parameters; filter these out
+        kwargs = {k: v for k, v in kwargs.items() if v is not NOT_GIVEN}
    elif client_type == "ollama":
        # Get base URL from the client (Ollama client uses host attribute)
        base_url = getattr(self, "host", "http://localhost:11434")
@ -300,8 +296,15 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
            )

    elif _current_mode == InferenceMode.RECORD:
+        if endpoint == "/v1/models":
+            response = original_method(self, *args, **kwargs)
+        else:
            response = await original_method(self, *args, **kwargs)

+        # we want to store the result of the iterator, not the iterator itself
+        if endpoint == "/v1/models":
+            response = [m async for m in response]
+
        request_data = {
            "method": method,
            "url": url,
@ -380,10 +383,14 @@ def patch_inference_clients():
            _original_methods["embeddings_create"], self, "openai", "/v1/embeddings", *args, **kwargs
        )

-    async def patched_models_list(self, *args, **kwargs):
-        return await _patched_inference_method(
+    def patched_models_list(self, *args, **kwargs):
+        async def _iter():
+            for item in await _patched_inference_method(
                _original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs
-        )
+            ):
+                yield item
+
+        return _iter()

    # Apply OpenAI patches
    AsyncChatCompletions.create = patched_chat_completions_create
--- a/llama_stack/ui/package-lock.json
+++ b/llama_stack/ui/package-lock.json
@ -11,16 +11,16 @@
        "@radix-ui/react-collapsible": "^1.1.12",
        "@radix-ui/react-dialog": "^1.1.13",
        "@radix-ui/react-dropdown-menu": "^2.1.16",
-        "@radix-ui/react-select": "^2.2.5",
+        "@radix-ui/react-select": "^2.2.6",
        "@radix-ui/react-separator": "^1.1.7",
        "@radix-ui/react-slot": "^1.2.3",
        "@radix-ui/react-tooltip": "^1.2.8",
        "class-variance-authority": "^0.7.1",
        "clsx": "^2.1.1",
        "framer-motion": "^12.23.12",
-        "llama-stack-client": "^0.2.21",
+        "llama-stack-client": "^0.2.22",
        "lucide-react": "^0.542.0",
-        "next": "15.3.3",
+        "next": "15.5.3",
        "next-auth": "^4.24.11",
        "next-themes": "^0.4.6",
        "react": "^19.0.0",
@ -664,9 +664,9 @@
      }
    },
    "node_modules/@emnapi/runtime": {
-      "version": "1.4.3",
-      "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.4.3.tgz",
-      "integrity": "sha512-pBPWdu6MLKROBX05wSNKcNb++m5Er+KQ9QkB+WVM+pW2Kx9hoSrVTnu3BdkI5eBLZoKu/J6mW/B6i6bJB2ytXQ==",
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.5.0.tgz",
+      "integrity": "sha512-97/BJ3iXHww3djw6hYIfErCZFee7qCtrneuLa20UXFCOTCfBM2cvQHjWJ2EG0s0MtdNwInarqCTz35i4wWXHsQ==",
      "license": "MIT",
      "optional": true,
      "dependencies": {
@ -927,9 +927,9 @@
      }
    },
    "node_modules/@img/sharp-darwin-arm64": {
-      "version": "0.34.1",
-      "resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.1.tgz",
-      "integrity": "sha512-pn44xgBtgpEbZsu+lWf2KNb6OAf70X68k+yk69Ic2Xz11zHR/w24/U49XT7AeRwJ0Px+mhALhU5LPci1Aymk7A==",
+      "version": "0.34.3",
+      "resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.3.tgz",
+      "integrity": "sha512-ryFMfvxxpQRsgZJqBd4wsttYQbCxsJksrv9Lw/v798JcQ8+w84mBWuXwl+TT0WJ/WrYOLaYpwQXi3sA9nTIaIg==",
      "cpu": [
        "arm64"
      ],
@ -945,13 +945,13 @@
        "url": "https://opencollective.com/libvips"
      },
      "optionalDependencies": {
-        "@img/sharp-libvips-darwin-arm64": "1.1.0"
+        "@img/sharp-libvips-darwin-arm64": "1.2.0"
      }
    },
    "node_modules/@img/sharp-darwin-x64": {
-      "version": "0.34.1",
-      "resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.1.tgz",
-      "integrity": "sha512-VfuYgG2r8BpYiOUN+BfYeFo69nP/MIwAtSJ7/Zpxc5QF3KS22z8Pvg3FkrSFJBPNQ7mmcUcYQFBmEQp7eu1F8Q==",
+      "version": "0.34.3",
+      "resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.3.tgz",
+      "integrity": "sha512-yHpJYynROAj12TA6qil58hmPmAwxKKC7reUqtGLzsOHfP7/rniNGTL8tjWX6L3CTV4+5P4ypcS7Pp+7OB+8ihA==",
      "cpu": [
        "x64"
      ],
@ -967,13 +967,13 @@
        "url": "https://opencollective.com/libvips"
      },
      "optionalDependencies": {
-        "@img/sharp-libvips-darwin-x64": "1.1.0"
+        "@img/sharp-libvips-darwin-x64": "1.2.0"
      }
    },
    "node_modules/@img/sharp-libvips-darwin-arm64": {
-      "version": "1.1.0",
-      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.1.0.tgz",
-      "integrity": "sha512-HZ/JUmPwrJSoM4DIQPv/BfNh9yrOA8tlBbqbLz4JZ5uew2+o22Ik+tHQJcih7QJuSa0zo5coHTfD5J8inqj9DA==",
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.2.0.tgz",
+      "integrity": "sha512-sBZmpwmxqwlqG9ueWFXtockhsxefaV6O84BMOrhtg/YqbTaRdqDE7hxraVE3y6gVM4eExmfzW4a8el9ArLeEiQ==",
      "cpu": [
        "arm64"
      ],
@ -987,9 +987,9 @@
      }
    },
    "node_modules/@img/sharp-libvips-darwin-x64": {
-      "version": "1.1.0",
-      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.1.0.tgz",
-      "integrity": "sha512-Xzc2ToEmHN+hfvsl9wja0RlnXEgpKNmftriQp6XzY/RaSfwD9th+MSh0WQKzUreLKKINb3afirxW7A0fz2YWuQ==",
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.2.0.tgz",
+      "integrity": "sha512-M64XVuL94OgiNHa5/m2YvEQI5q2cl9d/wk0qFTDVXcYzi43lxuiFTftMR1tOnFQovVXNZJ5TURSDK2pNe9Yzqg==",
      "cpu": [
        "x64"
      ],
@ -1003,9 +1003,9 @@
      }
    },
    "node_modules/@img/sharp-libvips-linux-arm": {
-      "version": "1.1.0",
-      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.1.0.tgz",
-      "integrity": "sha512-s8BAd0lwUIvYCJyRdFqvsj+BJIpDBSxs6ivrOPm/R7piTs5UIwY5OjXrP2bqXC9/moGsyRa37eYWYCOGVXxVrA==",
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.2.0.tgz",
+      "integrity": "sha512-mWd2uWvDtL/nvIzThLq3fr2nnGfyr/XMXlq8ZJ9WMR6PXijHlC3ksp0IpuhK6bougvQrchUAfzRLnbsen0Cqvw==",
      "cpu": [
        "arm"
      ],
@ -1019,9 +1019,9 @@
      }
    },
    "node_modules/@img/sharp-libvips-linux-arm64": {
-      "version": "1.1.0",
-      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.1.0.tgz",
-      "integrity": "sha512-IVfGJa7gjChDET1dK9SekxFFdflarnUB8PwW8aGwEoF3oAsSDuNUTYS+SKDOyOJxQyDC1aPFMuRYLoDInyV9Ew==",
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.2.0.tgz",
+      "integrity": "sha512-RXwd0CgG+uPRX5YYrkzKyalt2OJYRiJQ8ED/fi1tq9WQW2jsQIn0tqrlR5l5dr/rjqq6AHAxURhj2DVjyQWSOA==",
      "cpu": [
        "arm64"
      ],
@ -1035,9 +1035,9 @@
      }
    },
    "node_modules/@img/sharp-libvips-linux-ppc64": {
-      "version": "1.1.0",
-      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-ppc64/-/sharp-libvips-linux-ppc64-1.1.0.tgz",
-      "integrity": "sha512-tiXxFZFbhnkWE2LA8oQj7KYR+bWBkiV2nilRldT7bqoEZ4HiDOcePr9wVDAZPi/Id5fT1oY9iGnDq20cwUz8lQ==",
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-ppc64/-/sharp-libvips-linux-ppc64-1.2.0.tgz",
+      "integrity": "sha512-Xod/7KaDDHkYu2phxxfeEPXfVXFKx70EAFZ0qyUdOjCcxbjqyJOEUpDe6RIyaunGxT34Anf9ue/wuWOqBW2WcQ==",
      "cpu": [
        "ppc64"
      ],
@ -1051,9 +1051,9 @@
      }
    },
    "node_modules/@img/sharp-libvips-linux-s390x": {
-      "version": "1.1.0",
-      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.1.0.tgz",
-      "integrity": "sha512-xukSwvhguw7COyzvmjydRb3x/09+21HykyapcZchiCUkTThEQEOMtBj9UhkaBRLuBrgLFzQ2wbxdeCCJW/jgJA==",
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.2.0.tgz",
+      "integrity": "sha512-eMKfzDxLGT8mnmPJTNMcjfO33fLiTDsrMlUVcp6b96ETbnJmd4uvZxVJSKPQfS+odwfVaGifhsB07J1LynFehw==",
      "cpu": [
        "s390x"
      ],
@ -1067,9 +1067,9 @@
      }
    },
    "node_modules/@img/sharp-libvips-linux-x64": {
-      "version": "1.1.0",
-      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.1.0.tgz",
-      "integrity": "sha512-yRj2+reB8iMg9W5sULM3S74jVS7zqSzHG3Ol/twnAAkAhnGQnpjj6e4ayUz7V+FpKypwgs82xbRdYtchTTUB+Q==",
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.2.0.tgz",
+      "integrity": "sha512-ZW3FPWIc7K1sH9E3nxIGB3y3dZkpJlMnkk7z5tu1nSkBoCgw2nSRTFHI5pB/3CQaJM0pdzMF3paf9ckKMSE9Tg==",
      "cpu": [
        "x64"
      ],
@ -1083,9 +1083,9 @@
      }
    },
    "node_modules/@img/sharp-libvips-linuxmusl-arm64": {
-      "version": "1.1.0",
-      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.1.0.tgz",
-      "integrity": "sha512-jYZdG+whg0MDK+q2COKbYidaqW/WTz0cc1E+tMAusiDygrM4ypmSCjOJPmFTvHHJ8j/6cAGyeDWZOsK06tP33w==",
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.2.0.tgz",
+      "integrity": "sha512-UG+LqQJbf5VJ8NWJ5Z3tdIe/HXjuIdo4JeVNADXBFuG7z9zjoegpzzGIyV5zQKi4zaJjnAd2+g2nna8TZvuW9Q==",
      "cpu": [
        "arm64"
      ],
@ -1099,9 +1099,9 @@
      }
    },
    "node_modules/@img/sharp-libvips-linuxmusl-x64": {
-      "version": "1.1.0",
-      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.1.0.tgz",
-      "integrity": "sha512-wK7SBdwrAiycjXdkPnGCPLjYb9lD4l6Ze2gSdAGVZrEL05AOUJESWU2lhlC+Ffn5/G+VKuSm6zzbQSzFX/P65A==",
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.2.0.tgz",
+      "integrity": "sha512-SRYOLR7CXPgNze8akZwjoGBoN1ThNZoqpOgfnOxmWsklTGVfJiGJoC/Lod7aNMGA1jSsKWM1+HRX43OP6p9+6Q==",
      "cpu": [
        "x64"
      ],
@ -1115,9 +1115,9 @@
      }
    },
    "node_modules/@img/sharp-linux-arm": {
-      "version": "0.34.1",
-      "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.1.tgz",
-      "integrity": "sha512-anKiszvACti2sGy9CirTlNyk7BjjZPiML1jt2ZkTdcvpLU1YH6CXwRAZCA2UmRXnhiIftXQ7+Oh62Ji25W72jA==",
+      "version": "0.34.3",
+      "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.3.tgz",
+      "integrity": "sha512-oBK9l+h6KBN0i3dC8rYntLiVfW8D8wH+NPNT3O/WBHeW0OQWCjfWksLUaPidsrDKpJgXp3G3/hkmhptAW0I3+A==",
      "cpu": [
        "arm"
      ],
@ -1133,13 +1133,13 @@
        "url": "https://opencollective.com/libvips"
      },
      "optionalDependencies": {
-        "@img/sharp-libvips-linux-arm": "1.1.0"
+        "@img/sharp-libvips-linux-arm": "1.2.0"
      }
    },
    "node_modules/@img/sharp-linux-arm64": {
-      "version": "0.34.1",
-      "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.1.tgz",
-      "integrity": "sha512-kX2c+vbvaXC6vly1RDf/IWNXxrlxLNpBVWkdpRq5Ka7OOKj6nr66etKy2IENf6FtOgklkg9ZdGpEu9kwdlcwOQ==",
+      "version": "0.34.3",
+      "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.3.tgz",
+      "integrity": "sha512-QdrKe3EvQrqwkDrtuTIjI0bu6YEJHTgEeqdzI3uWJOH6G1O8Nl1iEeVYRGdj1h5I21CqxSvQp1Yv7xeU3ZewbA==",
      "cpu": [
        "arm64"
      ],
@ -1155,13 +1155,35 @@
        "url": "https://opencollective.com/libvips"
      },
      "optionalDependencies": {
-        "@img/sharp-libvips-linux-arm64": "1.1.0"
+        "@img/sharp-libvips-linux-arm64": "1.2.0"
+      }
+    },
+    "node_modules/@img/sharp-linux-ppc64": {
+      "version": "0.34.3",
+      "resolved": "https://registry.npmjs.org/@img/sharp-linux-ppc64/-/sharp-linux-ppc64-0.34.3.tgz",
+      "integrity": "sha512-GLtbLQMCNC5nxuImPR2+RgrviwKwVql28FWZIW1zWruy6zLgA5/x2ZXk3mxj58X/tszVF69KK0Is83V8YgWhLA==",
+      "cpu": [
+        "ppc64"
+      ],
+      "license": "Apache-2.0",
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": "^18.17.0 || ^20.3.0 || >=21.0.0"
+      },
+      "funding": {
+        "url": "https://opencollective.com/libvips"
+      },
+      "optionalDependencies": {
+        "@img/sharp-libvips-linux-ppc64": "1.2.0"
      }
    },
    "node_modules/@img/sharp-linux-s390x": {
-      "version": "0.34.1",
-      "resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.34.1.tgz",
-      "integrity": "sha512-7s0KX2tI9mZI2buRipKIw2X1ufdTeaRgwmRabt5bi9chYfhur+/C1OXg3TKg/eag1W+6CCWLVmSauV1owmRPxA==",
+      "version": "0.34.3",
+      "resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.34.3.tgz",
+      "integrity": "sha512-3gahT+A6c4cdc2edhsLHmIOXMb17ltffJlxR0aC2VPZfwKoTGZec6u5GrFgdR7ciJSsHT27BD3TIuGcuRT0KmQ==",
      "cpu": [
        "s390x"
      ],
@ -1177,13 +1199,13 @@
        "url": "https://opencollective.com/libvips"
      },
      "optionalDependencies": {
-        "@img/sharp-libvips-linux-s390x": "1.1.0"
+        "@img/sharp-libvips-linux-s390x": "1.2.0"
      }
    },
    "node_modules/@img/sharp-linux-x64": {
-      "version": "0.34.1",
-      "resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.1.tgz",
-      "integrity": "sha512-wExv7SH9nmoBW3Wr2gvQopX1k8q2g5V5Iag8Zk6AVENsjwd+3adjwxtp3Dcu2QhOXr8W9NusBU6XcQUohBZ5MA==",
+      "version": "0.34.3",
+      "resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.3.tgz",
+      "integrity": "sha512-8kYso8d806ypnSq3/Ly0QEw90V5ZoHh10yH0HnrzOCr6DKAPI6QVHvwleqMkVQ0m+fc7EH8ah0BB0QPuWY6zJQ==",
      "cpu": [
        "x64"
      ],
@ -1199,13 +1221,13 @@
        "url": "https://opencollective.com/libvips"
      },
      "optionalDependencies": {
-        "@img/sharp-libvips-linux-x64": "1.1.0"
+        "@img/sharp-libvips-linux-x64": "1.2.0"
      }
    },
    "node_modules/@img/sharp-linuxmusl-arm64": {
-      "version": "0.34.1",
-      "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.1.tgz",
-      "integrity": "sha512-DfvyxzHxw4WGdPiTF0SOHnm11Xv4aQexvqhRDAoD00MzHekAj9a/jADXeXYCDFH/DzYruwHbXU7uz+H+nWmSOQ==",
+      "version": "0.34.3",
+      "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.3.tgz",
+      "integrity": "sha512-vAjbHDlr4izEiXM1OTggpCcPg9tn4YriK5vAjowJsHwdBIdx0fYRsURkxLG2RLm9gyBq66gwtWI8Gx0/ov+JKQ==",
      "cpu": [
        "arm64"
      ],
@ -1221,13 +1243,13 @@
        "url": "https://opencollective.com/libvips"
      },
      "optionalDependencies": {
-        "@img/sharp-libvips-linuxmusl-arm64": "1.1.0"
+        "@img/sharp-libvips-linuxmusl-arm64": "1.2.0"
      }
    },
    "node_modules/@img/sharp-linuxmusl-x64": {
-      "version": "0.34.1",
-      "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.1.tgz",
-      "integrity": "sha512-pax/kTR407vNb9qaSIiWVnQplPcGU8LRIJpDT5o8PdAx5aAA7AS3X9PS8Isw1/WfqgQorPotjrZL3Pqh6C5EBg==",
+      "version": "0.34.3",
+      "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.3.tgz",
+      "integrity": "sha512-gCWUn9547K5bwvOn9l5XGAEjVTTRji4aPTqLzGXHvIr6bIDZKNTA34seMPgM0WmSf+RYBH411VavCejp3PkOeQ==",
      "cpu": [
        "x64"
      ],
@ -1243,20 +1265,20 @@
        "url": "https://opencollective.com/libvips"
      },
      "optionalDependencies": {
-        "@img/sharp-libvips-linuxmusl-x64": "1.1.0"
+        "@img/sharp-libvips-linuxmusl-x64": "1.2.0"
      }
    },
    "node_modules/@img/sharp-wasm32": {
-      "version": "0.34.1",
-      "resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.34.1.tgz",
-      "integrity": "sha512-YDybQnYrLQfEpzGOQe7OKcyLUCML4YOXl428gOOzBgN6Gw0rv8dpsJ7PqTHxBnXnwXr8S1mYFSLSa727tpz0xg==",
+      "version": "0.34.3",
+      "resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.34.3.tgz",
+      "integrity": "sha512-+CyRcpagHMGteySaWos8IbnXcHgfDn7pO2fiC2slJxvNq9gDipYBN42/RagzctVRKgxATmfqOSulgZv5e1RdMg==",
      "cpu": [
        "wasm32"
      ],
      "license": "Apache-2.0 AND LGPL-3.0-or-later AND MIT",
      "optional": true,
      "dependencies": {
-        "@emnapi/runtime": "^1.4.0"
+        "@emnapi/runtime": "^1.4.4"
      },
      "engines": {
        "node": "^18.17.0 || ^20.3.0 || >=21.0.0"
@ -1265,10 +1287,29 @@
        "url": "https://opencollective.com/libvips"
      }
    },
+    "node_modules/@img/sharp-win32-arm64": {
+      "version": "0.34.3",
+      "resolved": "https://registry.npmjs.org/@img/sharp-win32-arm64/-/sharp-win32-arm64-0.34.3.tgz",
+      "integrity": "sha512-MjnHPnbqMXNC2UgeLJtX4XqoVHHlZNd+nPt1kRPmj63wURegwBhZlApELdtxM2OIZDRv/DFtLcNhVbd1z8GYXQ==",
+      "cpu": [
+        "arm64"
+      ],
+      "license": "Apache-2.0 AND LGPL-3.0-or-later",
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": "^18.17.0 || ^20.3.0 || >=21.0.0"
+      },
+      "funding": {
+        "url": "https://opencollective.com/libvips"
+      }
+    },
    "node_modules/@img/sharp-win32-ia32": {
-      "version": "0.34.1",
-      "resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.34.1.tgz",
-      "integrity": "sha512-WKf/NAZITnonBf3U1LfdjoMgNO5JYRSlhovhRhMxXVdvWYveM4kM3L8m35onYIdh75cOMCo1BexgVQcCDzyoWw==",
+      "version": "0.34.3",
+      "resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.34.3.tgz",
+      "integrity": "sha512-xuCdhH44WxuXgOM714hn4amodJMZl3OEvf0GVTm0BEyMeA2to+8HEdRPShH0SLYptJY1uBw+SCFP9WVQi1Q/cw==",
      "cpu": [
        "ia32"
      ],
@ -1285,9 +1326,9 @@
      }
    },
    "node_modules/@img/sharp-win32-x64": {
-      "version": "0.34.1",
-      "resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.1.tgz",
-      "integrity": "sha512-hw1iIAHpNE8q3uMIRCgGOeDoz9KtFNarFLQclLxr/LK1VBkj8nby18RjFvr6aP7USRYAjTZW6yisnBWMX571Tw==",
+      "version": "0.34.3",
+      "resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.3.tgz",
+      "integrity": "sha512-OWwz05d++TxzLEv4VnsTz5CmZ6mI6S05sfQGEMrNrQcOEERbX46332IvE7pO/EUiw7jUrrS40z/M7kPyjfl04g==",
      "cpu": [
        "x64"
      ],
@ -1849,9 +1890,10 @@
      }
    },
    "node_modules/@next/env": {
-      "version": "15.3.3",
-      "resolved": "https://registry.npmjs.org/@next/env/-/env-15.3.3.tgz",
-      "integrity": "sha512-OdiMrzCl2Xi0VTjiQQUK0Xh7bJHnOuET2s+3V+Y40WJBAXrJeGA3f+I8MZJ/YQ3mVGi5XGR1L66oFlgqXhQ4Vw=="
+      "version": "15.5.3",
+      "resolved": "https://registry.npmjs.org/@next/env/-/env-15.5.3.tgz",
+      "integrity": "sha512-RSEDTRqyihYXygx/OJXwvVupfr9m04+0vH8vyy0HfZ7keRto6VX9BbEk0J2PUk0VGy6YhklJUSrgForov5F9pw==",
+      "license": "MIT"
    },
    "node_modules/@next/eslint-plugin-next": {
      "version": "15.5.2",
@ -1864,12 +1906,13 @@
      }
    },
    "node_modules/@next/swc-darwin-arm64": {
-      "version": "15.3.3",
-      "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-15.3.3.tgz",
-      "integrity": "sha512-WRJERLuH+O3oYB4yZNVahSVFmtxRNjNF1I1c34tYMoJb0Pve+7/RaLAJJizyYiFhjYNGHRAE1Ri2Fd23zgDqhg==",
+      "version": "15.5.3",
+      "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-15.5.3.tgz",
+      "integrity": "sha512-nzbHQo69+au9wJkGKTU9lP7PXv0d1J5ljFpvb+LnEomLtSbJkbZyEs6sbF3plQmiOB2l9OBtN2tNSvCH1nQ9Jg==",
      "cpu": [
        "arm64"
      ],
+      "license": "MIT",
      "optional": true,
      "os": [
        "darwin"
@ -1879,12 +1922,13 @@
      }
    },
    "node_modules/@next/swc-darwin-x64": {
-      "version": "15.3.3",
-      "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-15.3.3.tgz",
-      "integrity": "sha512-XHdzH/yBc55lu78k/XwtuFR/ZXUTcflpRXcsu0nKmF45U96jt1tsOZhVrn5YH+paw66zOANpOnFQ9i6/j+UYvw==",
+      "version": "15.5.3",
+      "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-15.5.3.tgz",
+      "integrity": "sha512-w83w4SkOOhekJOcA5HBvHyGzgV1W/XvOfpkrxIse4uPWhYTTRwtGEM4v/jiXwNSJvfRvah0H8/uTLBKRXlef8g==",
      "cpu": [
        "x64"
      ],
+      "license": "MIT",
      "optional": true,
      "os": [
        "darwin"
@ -1894,12 +1938,13 @@
      }
    },
    "node_modules/@next/swc-linux-arm64-gnu": {
-      "version": "15.3.3",
-      "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-15.3.3.tgz",
-      "integrity": "sha512-VZ3sYL2LXB8znNGcjhocikEkag/8xiLgnvQts41tq6i+wql63SMS1Q6N8RVXHw5pEUjiof+II3HkDd7GFcgkzw==",
+      "version": "15.5.3",
+      "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-15.5.3.tgz",
+      "integrity": "sha512-+m7pfIs0/yvgVu26ieaKrifV8C8yiLe7jVp9SpcIzg7XmyyNE7toC1fy5IOQozmr6kWl/JONC51osih2RyoXRw==",
      "cpu": [
        "arm64"
      ],
+      "license": "MIT",
      "optional": true,
      "os": [
        "linux"
@ -1909,12 +1954,13 @@
      }
    },
    "node_modules/@next/swc-linux-arm64-musl": {
-      "version": "15.3.3",
-      "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-15.3.3.tgz",
-      "integrity": "sha512-h6Y1fLU4RWAp1HPNJWDYBQ+e3G7sLckyBXhmH9ajn8l/RSMnhbuPBV/fXmy3muMcVwoJdHL+UtzRzs0nXOf9SA==",
+      "version": "15.5.3",
+      "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-15.5.3.tgz",
+      "integrity": "sha512-u3PEIzuguSenoZviZJahNLgCexGFhso5mxWCrrIMdvpZn6lkME5vc/ADZG8UUk5K1uWRy4hqSFECrON6UKQBbQ==",
      "cpu": [
        "arm64"
      ],
+      "license": "MIT",
      "optional": true,
      "os": [
        "linux"
@ -1924,12 +1970,13 @@
      }
    },
    "node_modules/@next/swc-linux-x64-gnu": {
-      "version": "15.3.3",
-      "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-15.3.3.tgz",
-      "integrity": "sha512-jJ8HRiF3N8Zw6hGlytCj5BiHyG/K+fnTKVDEKvUCyiQ/0r5tgwO7OgaRiOjjRoIx2vwLR+Rz8hQoPrnmFbJdfw==",
+      "version": "15.5.3",
+      "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-15.5.3.tgz",
+      "integrity": "sha512-lDtOOScYDZxI2BENN9m0pfVPJDSuUkAD1YXSvlJF0DKwZt0WlA7T7o3wrcEr4Q+iHYGzEaVuZcsIbCps4K27sA==",
      "cpu": [
        "x64"
      ],
+      "license": "MIT",
      "optional": true,
      "os": [
        "linux"
@ -1939,12 +1986,13 @@
      }
    },
    "node_modules/@next/swc-linux-x64-musl": {
-      "version": "15.3.3",
-      "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-15.3.3.tgz",
-      "integrity": "sha512-HrUcTr4N+RgiiGn3jjeT6Oo208UT/7BuTr7K0mdKRBtTbT4v9zJqCDKO97DUqqoBK1qyzP1RwvrWTvU6EPh/Cw==",
+      "version": "15.5.3",
+      "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-15.5.3.tgz",
+      "integrity": "sha512-9vWVUnsx9PrY2NwdVRJ4dUURAQ8Su0sLRPqcCCxtX5zIQUBES12eRVHq6b70bbfaVaxIDGJN2afHui0eDm+cLg==",
      "cpu": [
        "x64"
      ],
+      "license": "MIT",
      "optional": true,
      "os": [
        "linux"
@ -1954,12 +2002,13 @@
      }
    },
    "node_modules/@next/swc-win32-arm64-msvc": {
-      "version": "15.3.3",
-      "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-15.3.3.tgz",
-      "integrity": "sha512-SxorONgi6K7ZUysMtRF3mIeHC5aA3IQLmKFQzU0OuhuUYwpOBc1ypaLJLP5Bf3M9k53KUUUj4vTPwzGvl/NwlQ==",
+      "version": "15.5.3",
+      "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-15.5.3.tgz",
+      "integrity": "sha512-1CU20FZzY9LFQigRi6jM45oJMU3KziA5/sSG+dXeVaTm661snQP6xu3ykGxxwU5sLG3sh14teO/IOEPVsQMRfA==",
      "cpu": [
        "arm64"
      ],
+      "license": "MIT",
      "optional": true,
      "os": [
        "win32"
@ -1969,12 +2018,13 @@
      }
    },
    "node_modules/@next/swc-win32-x64-msvc": {
-      "version": "15.3.3",
-      "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-15.3.3.tgz",
-      "integrity": "sha512-4QZG6F8enl9/S2+yIiOiju0iCTFd93d8VC1q9LZS4p/Xuk81W2QDjCFeoogmrWWkAD59z8ZxepBQap2dKS5ruw==",
+      "version": "15.5.3",
+      "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-15.5.3.tgz",
+      "integrity": "sha512-JMoLAq3n3y5tKXPQwCK5c+6tmwkuFDa2XAxz8Wm4+IVthdBZdZGh+lmiLUHg9f9IDwIQpUjp+ysd6OkYTyZRZw==",
      "cpu": [
        "x64"
      ],
+      "license": "MIT",
      "optional": true,
      "os": [
        "win32"
@ -2874,22 +2924,22 @@
      }
    },
    "node_modules/@radix-ui/react-select": {
-      "version": "2.2.5",
-      "resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.5.tgz",
-      "integrity": "sha512-HnMTdXEVuuyzx63ME0ut4+sEMYW6oouHWNGUZc7ddvUWIcfCva/AMoqEW/3wnEllriMWBa0RHspCYnfCWJQYmA==",
+      "version": "2.2.6",
+      "resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.6.tgz",
+      "integrity": "sha512-I30RydO+bnn2PQztvo25tswPH+wFBjehVGtmagkU78yMdwTwVf12wnAOF+AeP8S2N8xD+5UPbGhkUfPyvT+mwQ==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/number": "1.1.1",
-        "@radix-ui/primitive": "1.1.2",
+        "@radix-ui/primitive": "1.1.3",
        "@radix-ui/react-collection": "1.1.7",
        "@radix-ui/react-compose-refs": "1.1.2",
        "@radix-ui/react-context": "1.1.2",
        "@radix-ui/react-direction": "1.1.1",
-        "@radix-ui/react-dismissable-layer": "1.1.10",
-        "@radix-ui/react-focus-guards": "1.1.2",
+        "@radix-ui/react-dismissable-layer": "1.1.11",
+        "@radix-ui/react-focus-guards": "1.1.3",
        "@radix-ui/react-focus-scope": "1.1.7",
        "@radix-ui/react-id": "1.1.1",
-        "@radix-ui/react-popper": "1.2.7",
+        "@radix-ui/react-popper": "1.2.8",
        "@radix-ui/react-portal": "1.1.9",
        "@radix-ui/react-primitive": "2.1.3",
        "@radix-ui/react-slot": "1.2.3",
@ -2916,13 +2966,19 @@
        }
      }
    },
+    "node_modules/@radix-ui/react-select/node_modules/@radix-ui/primitive": {
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz",
+      "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
+      "license": "MIT"
+    },
    "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-dismissable-layer": {
-      "version": "1.1.10",
-      "resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.10.tgz",
-      "integrity": "sha512-IM1zzRV4W3HtVgftdQiiOmA0AdJlCtMLe00FXaHwgt3rAnNsIyDqshvkIW3hj/iu5hu8ERP7KIYki6NkqDxAwQ==",
+      "version": "1.1.11",
+      "resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz",
+      "integrity": "sha512-Nqcp+t5cTB8BinFkZgXiMJniQH0PsUt2k51FUhbdfeKvc4ACcG2uQniY/8+h1Yv6Kza4Q7lD7PQV0z0oicE0Mg==",
      "license": "MIT",
      "dependencies": {
-        "@radix-ui/primitive": "1.1.2",
+        "@radix-ui/primitive": "1.1.3",
        "@radix-ui/react-compose-refs": "1.1.2",
        "@radix-ui/react-primitive": "2.1.3",
        "@radix-ui/react-use-callback-ref": "1.1.1",
@ -2943,6 +2999,21 @@
        }
      }
    },
+    "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-focus-guards": {
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.3.tgz",
+      "integrity": "sha512-0rFg/Rj2Q62NCm62jZw0QX7a3sz6QCQU0LpZdNrJX8byRGaGVTqbrW9jAoIAHyMQqsNpeZ81YgSizOt5WXq0Pw==",
+      "license": "MIT",
+      "peerDependencies": {
+        "@types/react": "*",
+        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
+      },
+      "peerDependenciesMeta": {
+        "@types/react": {
+          "optional": true
+        }
+      }
+    },
    "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-focus-scope": {
      "version": "1.1.7",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-focus-scope/-/react-focus-scope-1.1.7.tgz",
@ -2968,38 +3039,6 @@
        }
      }
    },
-    "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-popper": {
-      "version": "1.2.7",
-      "resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.7.tgz",
-      "integrity": "sha512-IUFAccz1JyKcf/RjB552PlWwxjeCJB8/4KxT7EhBHOJM+mN7LdW+B3kacJXILm32xawcMMjb2i0cIZpo+f9kiQ==",
-      "license": "MIT",
-      "dependencies": {
-        "@floating-ui/react-dom": "^2.0.0",
-        "@radix-ui/react-arrow": "1.1.7",
-        "@radix-ui/react-compose-refs": "1.1.2",
-        "@radix-ui/react-context": "1.1.2",
-        "@radix-ui/react-primitive": "2.1.3",
-        "@radix-ui/react-use-callback-ref": "1.1.1",
-        "@radix-ui/react-use-layout-effect": "1.1.1",
-        "@radix-ui/react-use-rect": "1.1.1",
-        "@radix-ui/react-use-size": "1.1.1",
-        "@radix-ui/rect": "1.1.1"
-      },
-      "peerDependencies": {
-        "@types/react": "*",
-        "@types/react-dom": "*",
-        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
-        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
-      },
-      "peerDependenciesMeta": {
-        "@types/react": {
-          "optional": true
-        },
-        "@types/react-dom": {
-          "optional": true
-        }
-      }
-    },
    "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-portal": {
      "version": "1.1.9",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz",
@ -3547,12 +3586,6 @@
        "@sinonjs/commons": "^3.0.0"
      }
    },
-    "node_modules/@swc/counter": {
-      "version": "0.1.3",
-      "resolved": "https://registry.npmjs.org/@swc/counter/-/counter-0.1.3.tgz",
-      "integrity": "sha512-e2BR4lsJkkRlKZ/qCHPw9ZaSxc0MVUd7gtbtaB7aMvHeJVYe8sOB8DBZkP2DtISHGSku9sCK6T6cnY0CtXrOCQ==",
-      "license": "Apache-2.0"
-    },
    "node_modules/@swc/helpers": {
      "version": "0.5.15",
      "resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.15.tgz",
@ -5475,17 +5508,6 @@
      "dev": true,
      "license": "MIT"
    },
-    "node_modules/busboy": {
-      "version": "1.6.0",
-      "resolved": "https://registry.npmjs.org/busboy/-/busboy-1.6.0.tgz",
-      "integrity": "sha512-8SFQbg/0hQ9xy3UNTB0YEnsNBbWfhf7RtnzpL7TkBiTBRfrQ9Fxcnz7VJsleJpyp6rVLvXiuORqjlHi5q+PYuA==",
-      "dependencies": {
-        "streamsearch": "^1.1.0"
-      },
-      "engines": {
-        "node": ">=10.16.0"
-      }
-    },
    "node_modules/bytes": {
      "version": "3.1.2",
      "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz",
@ -8295,9 +8317,9 @@
      }
    },
    "node_modules/is-arrayish": {
-      "version": "0.3.2",
-      "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz",
-      "integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==",
+      "version": "0.3.4",
+      "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.4.tgz",
+      "integrity": "sha512-m6UrgzFVUYawGBh1dUsWR5M2Clqic9RVXC/9f8ceNlv2IcO9j9J/z8UoCLPqtsPBFNzEpfR3xftohbfqDx8EQA==",
      "license": "MIT",
      "optional": true
    },
@ -10292,9 +10314,9 @@
      "license": "MIT"
    },
    "node_modules/llama-stack-client": {
-      "version": "0.2.21",
-      "resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.21.tgz",
-      "integrity": "sha512-rjU2Vx5xStxDYavU8K1An/SYXiQQjroLcK98B+p0Paz/a7OgRao2S0YwvThJjPUyChY4fO03UIXP9LpmHqlXWQ==",
+      "version": "0.2.22",
+      "resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.22.tgz",
+      "integrity": "sha512-7aW3UQj5MwjV73Brd+yQ1e4W1W33nhozyeHM5tzOgbsVZ88tL78JNiNvyFqDR5w6V9XO4/uSGGiQVG6v83yR4w==",
      "license": "MIT",
      "dependencies": {
        "@types/node": "^18.11.18",
@ -11542,14 +11564,13 @@
      }
    },
    "node_modules/next": {
-      "version": "15.3.3",
-      "resolved": "https://registry.npmjs.org/next/-/next-15.3.3.tgz",
-      "integrity": "sha512-JqNj29hHNmCLtNvd090SyRbXJiivQ+58XjCcrC50Crb5g5u2zi7Y2YivbsEfzk6AtVI80akdOQbaMZwWB1Hthw==",
+      "version": "15.5.3",
+      "resolved": "https://registry.npmjs.org/next/-/next-15.5.3.tgz",
+      "integrity": "sha512-r/liNAx16SQj4D+XH/oI1dlpv9tdKJ6cONYPwwcCC46f2NjpaRWY+EKCzULfgQYV6YKXjHBchff2IZBSlZmJNw==",
+      "license": "MIT",
      "dependencies": {
-        "@next/env": "15.3.3",
-        "@swc/counter": "0.1.3",
+        "@next/env": "15.5.3",
        "@swc/helpers": "0.5.15",
-        "busboy": "1.6.0",
        "caniuse-lite": "^1.0.30001579",
        "postcss": "8.4.31",
        "styled-jsx": "5.1.6"
@ -11561,19 +11582,19 @@
        "node": "^18.18.0 || ^19.8.0 || >= 20.0.0"
      },
      "optionalDependencies": {
-        "@next/swc-darwin-arm64": "15.3.3",
-        "@next/swc-darwin-x64": "15.3.3",
-        "@next/swc-linux-arm64-gnu": "15.3.3",
-        "@next/swc-linux-arm64-musl": "15.3.3",
-        "@next/swc-linux-x64-gnu": "15.3.3",
-        "@next/swc-linux-x64-musl": "15.3.3",
-        "@next/swc-win32-arm64-msvc": "15.3.3",
-        "@next/swc-win32-x64-msvc": "15.3.3",
-        "sharp": "^0.34.1"
+        "@next/swc-darwin-arm64": "15.5.3",
+        "@next/swc-darwin-x64": "15.5.3",
+        "@next/swc-linux-arm64-gnu": "15.5.3",
+        "@next/swc-linux-arm64-musl": "15.5.3",
+        "@next/swc-linux-x64-gnu": "15.5.3",
+        "@next/swc-linux-x64-musl": "15.5.3",
+        "@next/swc-win32-arm64-msvc": "15.5.3",
+        "@next/swc-win32-x64-msvc": "15.5.3",
+        "sharp": "^0.34.3"
      },
      "peerDependencies": {
        "@opentelemetry/api": "^1.1.0",
-        "@playwright/test": "^1.41.2",
+        "@playwright/test": "^1.51.1",
        "babel-plugin-react-compiler": "*",
        "react": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0",
        "react-dom": "^18.2.0 || 19.0.0-rc-de68d2f4-20241204 || ^19.0.0",
@ -13240,16 +13261,16 @@
      "license": "ISC"
    },
    "node_modules/sharp": {
-      "version": "0.34.1",
-      "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.1.tgz",
-      "integrity": "sha512-1j0w61+eVxu7DawFJtnfYcvSv6qPFvfTaqzTQ2BLknVhHTwGS8sc63ZBF4rzkWMBVKybo4S5OBtDdZahh2A1xg==",
+      "version": "0.34.3",
+      "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.3.tgz",
+      "integrity": "sha512-eX2IQ6nFohW4DbvHIOLRB3MHFpYqaqvXd3Tp5e/T/dSH83fxaNJQRvDMhASmkNTsNTVF2/OOopzRCt7xokgPfg==",
      "hasInstallScript": true,
      "license": "Apache-2.0",
      "optional": true,
      "dependencies": {
        "color": "^4.2.3",
-        "detect-libc": "^2.0.3",
-        "semver": "^7.7.1"
+        "detect-libc": "^2.0.4",
+        "semver": "^7.7.2"
      },
      "engines": {
        "node": "^18.17.0 || ^20.3.0 || >=21.0.0"
@ -13258,26 +13279,28 @@
        "url": "https://opencollective.com/libvips"
      },
      "optionalDependencies": {
-        "@img/sharp-darwin-arm64": "0.34.1",
-        "@img/sharp-darwin-x64": "0.34.1",
-        "@img/sharp-libvips-darwin-arm64": "1.1.0",
-        "@img/sharp-libvips-darwin-x64": "1.1.0",
-        "@img/sharp-libvips-linux-arm": "1.1.0",
-        "@img/sharp-libvips-linux-arm64": "1.1.0",
-        "@img/sharp-libvips-linux-ppc64": "1.1.0",
-        "@img/sharp-libvips-linux-s390x": "1.1.0",
-        "@img/sharp-libvips-linux-x64": "1.1.0",
-        "@img/sharp-libvips-linuxmusl-arm64": "1.1.0",
-        "@img/sharp-libvips-linuxmusl-x64": "1.1.0",
-        "@img/sharp-linux-arm": "0.34.1",
-        "@img/sharp-linux-arm64": "0.34.1",
-        "@img/sharp-linux-s390x": "0.34.1",
-        "@img/sharp-linux-x64": "0.34.1",
-        "@img/sharp-linuxmusl-arm64": "0.34.1",
-        "@img/sharp-linuxmusl-x64": "0.34.1",
-        "@img/sharp-wasm32": "0.34.1",
-        "@img/sharp-win32-ia32": "0.34.1",
-        "@img/sharp-win32-x64": "0.34.1"
+        "@img/sharp-darwin-arm64": "0.34.3",
+        "@img/sharp-darwin-x64": "0.34.3",
+        "@img/sharp-libvips-darwin-arm64": "1.2.0",
+        "@img/sharp-libvips-darwin-x64": "1.2.0",
+        "@img/sharp-libvips-linux-arm": "1.2.0",
+        "@img/sharp-libvips-linux-arm64": "1.2.0",
+        "@img/sharp-libvips-linux-ppc64": "1.2.0",
+        "@img/sharp-libvips-linux-s390x": "1.2.0",
+        "@img/sharp-libvips-linux-x64": "1.2.0",
+        "@img/sharp-libvips-linuxmusl-arm64": "1.2.0",
+        "@img/sharp-libvips-linuxmusl-x64": "1.2.0",
+        "@img/sharp-linux-arm": "0.34.3",
+        "@img/sharp-linux-arm64": "0.34.3",
+        "@img/sharp-linux-ppc64": "0.34.3",
+        "@img/sharp-linux-s390x": "0.34.3",
+        "@img/sharp-linux-x64": "0.34.3",
+        "@img/sharp-linuxmusl-arm64": "0.34.3",
+        "@img/sharp-linuxmusl-x64": "0.34.3",
+        "@img/sharp-wasm32": "0.34.3",
+        "@img/sharp-win32-arm64": "0.34.3",
+        "@img/sharp-win32-ia32": "0.34.3",
+        "@img/sharp-win32-x64": "0.34.3"
      }
    },
    "node_modules/shebang-command": {
@ -13403,9 +13426,9 @@
      "license": "ISC"
    },
    "node_modules/simple-swizzle": {
-      "version": "0.2.2",
-      "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz",
-      "integrity": "sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==",
+      "version": "0.2.4",
+      "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.4.tgz",
+      "integrity": "sha512-nAu1WFPQSMNr2Zn9PGSZK9AGn4t/y97lEm+MXTtUDwfP0ksAIX4nO+6ruD9Jwut4C49SB1Ws+fbXsm/yScWOHw==",
      "license": "MIT",
      "optional": true,
      "dependencies": {
@ -13526,14 +13549,6 @@
        "node": ">= 0.8"
      }
    },
-    "node_modules/streamsearch": {
-      "version": "1.1.0",
-      "resolved": "https://registry.npmjs.org/streamsearch/-/streamsearch-1.1.0.tgz",
-      "integrity": "sha512-Mcc5wHehp9aXz1ax6bZUyY5afg9u2rv5cqQI3mRrYkGC8rW2hM02jWuwjtL++LS5qinSyhj2QfLyNsuc+VsExg==",
-      "engines": {
-        "node": ">=10.0.0"
-      }
-    },
    "node_modules/string-length": {
      "version": "4.0.2",
      "resolved": "https://registry.npmjs.org/string-length/-/string-length-4.0.2.tgz",
--- a/llama_stack/ui/package.json
+++ b/llama_stack/ui/package.json
@ -16,16 +16,16 @@
    "@radix-ui/react-collapsible": "^1.1.12",
    "@radix-ui/react-dialog": "^1.1.13",
    "@radix-ui/react-dropdown-menu": "^2.1.16",
-    "@radix-ui/react-select": "^2.2.5",
+    "@radix-ui/react-select": "^2.2.6",
    "@radix-ui/react-separator": "^1.1.7",
    "@radix-ui/react-slot": "^1.2.3",
    "@radix-ui/react-tooltip": "^1.2.8",
    "class-variance-authority": "^0.7.1",
    "clsx": "^2.1.1",
    "framer-motion": "^12.23.12",
-    "llama-stack-client": "^0.2.21",
+    "llama-stack-client": "^0.2.22",
    "lucide-react": "^0.542.0",
-    "next": "15.3.3",
+    "next": "15.5.3",
    "next-auth": "^4.24.11",
    "next-themes": "^0.4.6",
    "react": "^19.0.0",
--- a/pyproject.toml
+++ b/pyproject.toml
@ -7,7 +7,7 @@ required-version = ">=0.7.0"

 [project]
 name = "llama_stack"
-version = "0.2.21"
+version = "0.2.22"
 authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
 description = "Llama Stack"
 readme = "README.md"
@ -31,12 +31,12 @@ dependencies = [
    "huggingface-hub>=0.34.0,<1.0",
    "jinja2>=3.1.6",
    "jsonschema",
-    "llama-stack-client>=0.2.21",
+    "llama-stack-client>=0.2.22",
    "openai>=1.100.0",                                # for expires_after support
    "prompt-toolkit",
    "python-dotenv",
    "python-jose[cryptography]",
-    "pydantic>=2",
+    "pydantic>=2.11.9",
    "rich",
    "starlette",
    "termcolor",
@ -55,7 +55,7 @@ dependencies = [
 ui = [
    "streamlit",
    "pandas",
-    "llama-stack-client>=0.2.21",
+    "llama-stack-client>=0.2.22",
    "streamlit-option-menu",
 ]

@ -141,7 +141,7 @@ docs = [
    "sphinxcontrib.openapi",
    "requests",
 ]
-codegen = ["rich", "pydantic", "jinja2>=3.1.6"]
+codegen = ["rich", "pydantic>=2.11.9", "jinja2>=3.1.6"]
 benchmark = [
    "locust>=2.39.1",
 ]
@ -354,6 +354,7 @@ warn_required_dynamic_aliases = true
 classmethod-decorators = ["classmethod", "pydantic.field_validator"]

 [tool.pytest.ini_options]
+addopts = ["--durations=10"]
 asyncio_mode = "auto"
 markers = [
    "allow_network: Allow network access for specific unit tests",
--- a/scripts/github/schedule-record-workflow.sh
+++ b/scripts/github/schedule-record-workflow.sh
@ -13,6 +13,7 @@ set -euo pipefail

 # Default values
 BRANCH=""
+PR_HEAD_REPO=""
 TEST_SUBDIRS=""
 TEST_SETUP="ollama"
 TEST_SUITE="base"
--- a/scripts/install.sh
+++ b/scripts/install.sh
@ -92,11 +92,11 @@ Options:
    -h, --help                 Show this help message

 For more information:
-    Documentation: https://llama-stack.readthedocs.io/
-    GitHub: https://github.com/meta-llama/llama-stack
+    Documentation: https://llamastack.github.io/latest/
+    GitHub: https://github.com/llamastack/llama-stack

 Report issues:
-    https://github.com/meta-llama/llama-stack/issues
+    https://github.com/llamastack/llama-stack/issues
 EOF
 }

@ -241,8 +241,8 @@ fi
 log ""
 log "🎉 Llama Stack is ready!"
 log "👉  API endpoint: http://localhost:${PORT}"
-log "📖 Documentation: https://llama-stack.readthedocs.io/en/latest/references/index.html"
+log "📖 Documentation: https://llamastack.github.io/latest/references/api_reference/index.html"
 log "💻 To access the llama stack CLI, exec into the container:"
 log "   $ENGINE exec -ti llama-stack bash"
-log "🐛 Report an issue @ https://github.com/meta-llama/llama-stack/issues if you think it's a bug"
+log "🐛 Report an issue @ https://github.com/llamastack/llama-stack/issues if you think it's a bug"
 log ""
--- a/tests/external/kaze.yaml
+++ b/tests/external/kaze.yaml
@ -1,4 +1,3 @@
-adapter:
 adapter_type: kaze
 pip_packages: ["tests/external/llama-stack-provider-kaze"]
 config_class: llama_stack_provider_kaze.config.KazeProviderConfig
--- a/tests/external/llama-stack-api-weather/src/llama_stack_api_weather/weather.py
+++ b/tests/external/llama-stack-api-weather/src/llama_stack_api_weather/weather.py
@ -6,7 +6,7 @@

 from typing import Protocol

-from llama_stack.providers.datatypes import AdapterSpec, Api, ProviderSpec, RemoteProviderSpec
+from llama_stack.providers.datatypes import Api, ProviderSpec, RemoteProviderSpec
 from llama_stack.schema_utils import webmethod


@ -16,12 +16,9 @@ def available_providers() -> list[ProviderSpec]:
            api=Api.weather,
            provider_type="remote::kaze",
            config_class="llama_stack_provider_kaze.KazeProviderConfig",
-            adapter=AdapterSpec(
            adapter_type="kaze",
            module="llama_stack_provider_kaze",
            pip_packages=["llama_stack_provider_kaze"],
-                config_class="llama_stack_provider_kaze.KazeProviderConfig",
-            ),
        ),
    ]

--- a/tests/integration/inference/test_openai_completion.py
+++ b/tests/integration/inference/test_openai_completion.py
@ -48,7 +48,6 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
        "remote::nvidia",
        "remote::runpod",
        "remote::sambanova",
-        "remote::tgi",
        "remote::vertexai",
        # {"error":{"message":"Unknown request URL: GET /openai/v1/completions. Please check the URL for typos,
        # or see the docs at https://console.groq.com/docs/","type":"invalid_request_error","code":"unknown_url"}}
@ -59,6 +58,7 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
        #  does not work with the specified model, gpt-5-mini. Please choose different model and try
        #  again. You can learn more about which models can be used with each operation here:
        #  https://go.microsoft.com/fwlink/?linkid=2197993.'}}"}
+        "remote::watsonx",  # return 404 when hitting the /openai/v1 endpoint
    ):
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")

@ -96,6 +96,8 @@ def skip_if_doesnt_support_n(client_with_models, model_id):
        "remote::vertexai",
        #  Error code: 400 - [{'error': {'code': 400, 'message': 'Unable to submit request because candidateCount must be 1 but
        #  the entered value was 2. Update the candidateCount value and try again.', 'status': 'INVALID_ARGUMENT'}
+        "remote::tgi",  # TGI ignores n param silently
+        "remote::together",  # `n` > 1 is not supported when streaming tokens. Please disable `stream`
    ):
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support n param.")

@ -110,7 +112,7 @@ def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, mode
        "remote::cerebras",
        "remote::databricks",
        "remote::runpod",
-        "remote::tgi",
+        "remote::watsonx",  # watsonx returns 404 when hitting the /openai/v1 endpoint
    ):
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI chat completions.")

--- a/tests/integration/inference/test_openai_embeddings.py
+++ b/tests/integration/inference/test_openai_embeddings.py
@ -29,9 +29,35 @@ def provider_from_model(client_with_models, model_id):
    return providers[provider_id]


-def skip_if_model_doesnt_support_variable_dimensions(model_id):
-    if "text-embedding-3" not in model_id:
-        pytest.skip("{model_id} does not support variable output embedding dimensions")
+def skip_if_model_doesnt_support_user_param(client, model_id):
+    provider = provider_from_model(client, model_id)
+    if provider.provider_type in (
+        "remote::together",  # service returns 400
+    ):
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} does not support user param.")
+
+
+def skip_if_model_doesnt_support_encoding_format_base64(client, model_id):
+    provider = provider_from_model(client, model_id)
+    if provider.provider_type in (
+        "remote::together",  # param silently ignored, always returns floats
+    ):
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} does not support encoding_format='base64'.")
+
+
+def skip_if_model_doesnt_support_variable_dimensions(client_with_models, model_id):
+    provider = provider_from_model(client_with_models, model_id)
+    if provider.provider_type in (
+        "remote::together",  # returns 400
+        "inline::sentence-transformers",
+    ):
+        pytest.skip(
+            f"Model {model_id} hosted by {provider.provider_type} does not support variable output embedding dimensions."
+        )
+    if provider.provider_type == "remote::openai" and "text-embedding-3" not in model_id:
+        pytest.skip(
+            f"Model {model_id} hosted by {provider.provider_type} does not support variable output embedding dimensions."
+        )


@pytest.fixture(params=["openai_client", "llama_stack_client"])
@ -92,6 +118,7 @@ def test_openai_embeddings_multiple_strings(compat_client, client_with_models, e
    response = compat_client.embeddings.create(
        model=embedding_model_id,
        input=input_texts,
+        encoding_format="float",
    )

    assert response.object == "list"
@ -127,7 +154,7 @@ def test_openai_embeddings_with_encoding_format_float(compat_client, client_with
 def test_openai_embeddings_with_dimensions(compat_client, client_with_models, embedding_model_id):
    """Test OpenAI embeddings endpoint with custom dimensions parameter."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
-    skip_if_model_doesnt_support_variable_dimensions(embedding_model_id)
+    skip_if_model_doesnt_support_variable_dimensions(client_with_models, embedding_model_id)

    input_text = "Test dimensions parameter"
    dimensions = 16
@ -148,6 +175,7 @@ def test_openai_embeddings_with_dimensions(compat_client, client_with_models, em
 def test_openai_embeddings_with_user_parameter(compat_client, client_with_models, embedding_model_id):
    """Test OpenAI embeddings endpoint with user parameter."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
+    skip_if_model_doesnt_support_user_param(client_with_models, embedding_model_id)

    input_text = "Test user parameter"
    user_id = "test-user-123"
@ -196,11 +224,13 @@ def test_openai_embeddings_different_inputs_different_outputs(compat_client, cli
    response1 = compat_client.embeddings.create(
        model=embedding_model_id,
        input=input_text1,
+        encoding_format="float",
    )

    response2 = compat_client.embeddings.create(
        model=embedding_model_id,
        input=input_text2,
+        encoding_format="float",
    )

    embedding1 = response1.data[0].embedding
@ -214,7 +244,8 @@ def test_openai_embeddings_different_inputs_different_outputs(compat_client, cli
 def test_openai_embeddings_with_encoding_format_base64(compat_client, client_with_models, embedding_model_id):
    """Test OpenAI embeddings endpoint with base64 encoding format."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
-    skip_if_model_doesnt_support_variable_dimensions(embedding_model_id)
+    skip_if_model_doesnt_support_encoding_format_base64(client_with_models, embedding_model_id)
+    skip_if_model_doesnt_support_variable_dimensions(client_with_models, embedding_model_id)

    input_text = "Test base64 encoding format"
    dimensions = 12
@ -247,6 +278,7 @@ def test_openai_embeddings_with_encoding_format_base64(compat_client, client_wit
 def test_openai_embeddings_base64_batch_processing(compat_client, client_with_models, embedding_model_id):
    """Test OpenAI embeddings endpoint with base64 encoding for batch processing."""
    skip_if_model_doesnt_support_openai_embeddings(client_with_models, embedding_model_id)
+    skip_if_model_doesnt_support_encoding_format_base64(client_with_models, embedding_model_id)

    input_texts = ["First text for base64", "Second text for base64", "Third text for base64"]

--- a/tests/integration/inference/test_text_inference.py
+++ b/tests/integration/inference/test_text_inference.py
@ -45,7 +45,7 @@ def skip_if_model_doesnt_support_json_schema_structured_output(client_with_model
    provider_id = models[model_id].provider_id
    providers = {p.provider_id: p for p in client_with_models.providers.list()}
    provider = providers[provider_id]
-    if provider.provider_type in ("remote::sambanova", "remote::azure"):
+    if provider.provider_type in ("remote::sambanova", "remote::azure", "remote::watsonx"):
        pytest.skip(
            f"Model {model_id} hosted by {provider.provider_type} doesn't support json_schema structured output"
        )
@ -211,6 +211,7 @@ def test_text_completion_log_probs_streaming(client_with_models, text_model_id,
 )
 def test_text_completion_structured_output(client_with_models, text_model_id, test_case):
    skip_if_model_doesnt_support_completion(client_with_models, text_model_id)
+    skip_if_model_doesnt_support_json_schema_structured_output(client_with_models, text_model_id)

    class AnswerFormat(BaseModel):
        name: str
--- a/tests/integration/recordings/responses/07c5fa34d9ca.json
+++ b/tests/integration/recordings/responses/07c5fa34d9ca.json
@ -0,0 +1,800 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://api.together.xyz/v1/v1/embeddings",
+    "headers": {},
+    "body": {
+      "model": "togethercomputer/m2-bert-80M-32k-retrieval",
+      "input": "Test encoding format"
+    },
+    "endpoint": "/v1/embeddings",
+    "model": "togethercomputer/m2-bert-80M-32k-retrieval"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.create_embedding_response.CreateEmbeddingResponse",
+      "__data__": {
+        "data": [
+          {
+            "embedding": [
+              -0.011256923,
+              0.0037174695,
+              0.047607094,
+              -0.03605117,
+              0.022678856,
+              0.0022196341,
+              0.008172763,
+              -0.07876377,
+              -0.012652523,
+              -0.124776885,
+              -0.07201225,
+              0.011470616,
+              0.020233244,
+              -0.03953407,
+              0.017867543,
+              -0.07615726,
+              0.015161683,
+              0.01493531,
+              0.0021282644,
+              0.02805457,
+              0.0008320583,
+              0.022922216,
+              0.049158294,
+              -0.03197842,
+              0.020910429,
+              0.03798574,
+              0.032469492,
+              0.009267314,
+              0.0883011,
+              0.0032435523,
+              0.013633923,
+              0.0457091,
+              -0.022143621,
+              -0.0007423012,
+              -0.03613117,
+              0.052107,
+              0.02962152,
+              0.045084383,
+              0.044733327,
+              0.11753868,
+              0.05730107,
+              0.026509244,
+              -0.056454167,
+              -0.017637681,
+              0.030301955,
+              0.04790331,
+              -0.025398305,
+              -0.019705286,
+              0.11366949,
+              0.05800383,
+              -0.0072742635,
+              0.100181706,
+              0.1609472,
+              0.0053162435,
+              0.01714287,
+              -0.023215268,
+              0.042824704,
+              0.04082185,
+              0.030668061,
+              -0.06529372,
+              0.008288249,
+              0.0325246,
+              0.009664108,
+              -0.031153189,
+              0.044064675,
+              0.10059426,
+              0.036557477,
+              0.009674479,
+              0.016028037,
+              0.02236809,
+              0.056538712,
+              -0.12828006,
+              0.016760435,
+              0.015355689,
+              -0.00070172164,
+              -0.0076741586,
+              -0.02880062,
+              -0.011680436,
+              -0.036522433,
+              -0.030315956,
+              0.023295958,
+              0.031333964,
+              0.042397793,
+              -0.063102156,
+              0.0669075,
+              -0.07678097,
+              0.0616129,
+              -0.0071245604,
+              -0.021313114,
+              0.0040440215,
+              0.04436404,
+              0.05289292,
+              0.05803014,
+              0.032691576,
+              0.037537806,
+              -0.09712317,
+              -0.0061692744,
+              0.008186577,
+              -0.0151672475,
+              -0.05499382,
+              -0.11011894,
+              -0.017255861,
+              0.061501417,
+              0.03551128,
+              0.056205165,
+              0.07500363,
+              0.023062926,
+              0.10787879,
+              0.063290246,
+              -0.021196125,
+              -0.005724647,
+              0.019805718,
+              -0.0063712946,
+              -0.049270064,
+              -0.024442751,
+              0.018587058,
+              -0.082689136,
+              -0.019034613,
+              0.005483609,
+              0.03418548,
+              -0.008317338,
+              0.06888298,
+              -0.037655607,
+              -0.05362105,
+              -0.010807861,
+              0.069666155,
+              -0.01777964,
+              -0.015136251,
+              -0.026567455,
+              -0.08084807,
+              -0.078372054,
+              0.039493512,
+              0.013156698,
+              0.07340631,
+              0.12035369,
+              -0.05765069,
+              0.025966862,
+              -0.0045753582,
+              -0.030865112,
+              0.039448086,
+              -0.037273232,
+              0.047059145,
+              -0.029127738,
+              -0.024217308,
+              0.02748501,
+              -0.048555836,
+              0.017913114,
+              -0.055981673,
+              -0.005601368,
+              -0.04045025,
+              -0.017308103,
+              0.06272273,
+              0.012256746,
+              0.01575095,
+              -0.026737463,
+              0.04115108,
+              0.07562276,
+              -0.01140116,
+              0.022552952,
+              0.0443809,
+              -0.030472409,
+              -0.021670958,
+              -0.037897367,
+              0.017250286,
+              -0.033001736,
+              -0.048738975,
+              -0.06429833,
+              -0.015412785,
+              0.0036735258,
+              0.023700202,
+              0.035861194,
+              -0.05393875,
+              0.048050668,
+              0.032297045,
+              0.021352977,
+              -0.05701748,
+              0.0008330949,
+              -0.006661303,
+              -0.0070953164,
+              -0.043984424,
+              0.052504774,
+              0.027689766,
+              0.031661708,
+              -0.050054867,
+              -0.015419155,
+              -0.013700429,
+              -0.03579233,
+              -0.08926211,
+              -0.034341693,
+              -0.01738188,
+              -0.0065487004,
+              -0.051955026,
+              0.0019674778,
+              0.0015172043,
+              0.024915336,
+              0.010987228,
+              0.061529815,
+              0.09077649,
+              0.04394813,
+              -0.07503514,
+              0.043345768,
+              -0.028357483,
+              0.06312762,
+              0.025069924,
+              0.028561853,
+              0.043048594,
+              0.017411513,
+              -0.025240859,
+              -0.0056393985,
+              0.054039005,
+              0.008721963,
+              -0.039967448,
+              0.0012871448,
+              0.0052062417,
+              0.005563228,
+              0.042596456,
+              -0.008794862,
+              -0.044669237,
+              0.04184779,
+              0.008726271,
+              0.10136058,
+              0.040724736,
+              0.14168875,
+              -0.017516509,
+              -0.11203568,
+              0.0010548063,
+              -0.058536656,
+              0.01673066,
+              0.007502946,
+              -0.035662595,
+              0.034719367,
+              -0.0060368567,
+              0.13295838,
+              0.026423598,
+              0.056147255,
+              0.04473965,
+              0.045232397,
+              0.07171366,
+              0.009358642,
+              -0.021109166,
+              0.033915937,
+              0.0380073,
+              -0.01451498,
+              -0.021589639,
+              0.062518574,
+              -0.017531183,
+              -0.030811403,
+              0.024500312,
+              0.05383414,
+              -0.1335839,
+              0.01834579,
+              -0.051048376,
+              0.07460228,
+              0.03231806,
+              0.00962887,
+              0.05156732,
+              0.016169788,
+              0.0062234807,
+              -0.09062714,
+              -0.08959952,
+              0.025153147,
+              -0.030351512,
+              -0.04339584,
+              0.007234872,
+              0.014588551,
+              0.022614833,
+              -0.08844599,
+              -0.009002514,
+              -0.114522785,
+              0.08118862,
+              -0.03023919,
+              0.007820294,
+              0.043863248,
+              -0.043678157,
+              -0.036323708,
+              0.006777855,
+              -0.019326974,
+              -0.0664114,
+              -0.019019991,
+              0.073445216,
+              -0.039277073,
+              -0.0157583,
+              -0.01931436,
+              -0.027121417,
+              -0.028259363,
+              -0.107222356,
+              0.11150329,
+              -0.012612926,
+              -0.025338905,
+              0.029330198,
+              0.011753977,
+              0.009784897,
+              0.042475123,
+              -0.004051051,
+              -0.014803267,
+              -0.04530689,
+              -0.01848677,
+              -0.050840423,
+              0.01814009,
+              0.0051442874,
+              -0.033988528,
+              0.0033705293,
+              -0.05515113,
+              -0.023601055,
+              -0.06183089,
+              0.012501645,
+              -0.08027637,
+              0.022573682,
+              0.079796925,
+              -0.00926268,
+              -0.02180816,
+              0.0059841494,
+              -0.018863965,
+              -0.011257763,
+              0.055679787,
+              -0.018714463,
+              -0.04081558,
+              -0.017017504,
+              0.026006198,
+              -0.03687599,
+              -0.05399378,
+              0.042955294,
+              0.00079697353,
+              -0.0015601065,
+              0.026138263,
+              -0.01198548,
+              0.07594801,
+              -0.0049053924,
+              -0.001241132,
+              0.022863775,
+              0.025632044,
+              -0.023908222,
+              -0.02252925,
+              0.042020634,
+              -0.060588334,
+              0.05498828,
+              -0.03466166,
+              0.003202133,
+              -0.015508297,
+              -0.021138275,
+              0.007791096,
+              0.052594397,
+              -0.08649948,
+              0.038542755,
+              0.011088168,
+              0.049710445,
+              -0.015898548,
+              0.013559725,
+              -0.0012927915,
+              -0.078937665,
+              -0.0470789,
+              0.02421941,
+              0.0050838543,
+              -0.051634457,
+              0.014016644,
+              0.059073824,
+              -0.01279741,
+              0.006315097,
+              0.028651753,
+              -0.023221422,
+              -0.049021006,
+              -0.08123552,
+              -0.027243393,
+              -0.026543872,
+              0.040068373,
+              0.01465917,
+              0.01366034,
+              -0.07191417,
+              -0.007906117,
+              -0.06743931,
+              -0.040284913,
+              0.046346053,
+              -0.015108051,
+              -0.067285545,
+              0.020757562,
+              -0.03144588,
+              -0.02684228,
+              -0.030008601,
+              0.0008360872,
+              -0.012667347,
+              -0.0782403,
+              0.02436115,
+              -0.054881096,
+              -0.010856299,
+              -0.07653927,
+              -0.044655506,
+              -0.02075821,
+              0.023765713,
+              0.0083463555,
+              0.026002545,
+              -0.003060633,
+              0.060491852,
+              0.032562606,
+              0.029937308,
+              -0.022013078,
+              0.07388013,
+              0.017152807,
+              -0.07095613,
+              -0.03923808,
+              0.0017680842,
+              0.0038672008,
+              -0.053012144,
+              -0.016951663,
+              0.027642388,
+              0.016483316,
+              -0.015618807,
+              -0.11136081,
+              0.006826955,
+              -0.010586094,
+              -0.05052998,
+              -0.04226535,
+              -0.031801827,
+              -0.020531418,
+              -0.06278464,
+              -0.062224947,
+              0.0769673,
+              -0.0706861,
+              0.026174366,
+              -0.041260213,
+              0.058052614,
+              -0.046227556,
+              -0.05443509,
+              0.007650712,
+              -0.061986744,
+              -0.00546975,
+              -0.042977307,
+              -0.0147894155,
+              0.045748055,
+              -0.01602859,
+              0.018538997,
+              0.073324144,
+              -0.105757244,
+              -0.010215157,
+              0.0069961487,
+              -0.010474333,
+              0.007267861,
+              -0.043416463,
+              0.04171331,
+              0.012246647,
+              -0.024870023,
+              0.0067938967,
+              0.023995718,
+              0.037606664,
+              -0.034879085,
+              0.107255146,
+              0.019311333,
+              0.008084773,
+              0.015113109,
+              0.04807634,
+              -0.011898967,
+              0.0028230203,
+              0.004201883,
+              -0.019952193,
+              -0.083809994,
+              0.025964422,
+              0.010652608,
+              0.021981532,
+              -0.029947964,
+              0.10096241,
+              -0.0018155909,
+              -0.078443065,
+              0.035357803,
+              0.030101022,
+              0.08652985,
+              -0.020698488,
+              0.06619985,
+              0.011043828,
+              0.022531942,
+              0.059432585,
+              -0.08669654,
+              0.023926888,
+              0.006353244,
+              -0.046637908,
+              -0.072916985,
+              -0.04355625,
+              -0.010734682,
+              -0.06298886,
+              0.11202974,
+              -0.008399903,
+              0.04045217,
+              -0.049840588,
+              -0.051897135,
+              0.04921834,
+              0.018730633,
+              0.07189677,
+              -0.020521715,
+              0.10433443,
+              -0.0035553537,
+              0.015335822,
+              -0.03326729,
+              -0.05246277,
+              -0.038786076,
+              0.04000599,
+              -0.028919725,
+              -0.017996594,
+              -0.007428113,
+              -0.003258321,
+              0.0127034895,
+              -0.0062633064,
+              0.0007574967,
+              -0.060385525,
+              -0.018971093,
+              0.062526286,
+              -0.025764955,
+              0.05286283,
+              0.043842334,
+              0.044092383,
+              -0.037126385,
+              -0.018775577,
+              0.007996275,
+              -0.00028039515,
+              -0.06591952,
+              0.039109394,
+              0.022268493,
+              0.033030964,
+              0.010780152,
+              0.051087722,
+              -0.07398754,
+              0.02156791,
+              -0.03391487,
+              0.01900175,
+              -0.03438655,
+              -0.050286565,
+              -0.029407075,
+              0.013486627,
+              0.006069821,
+              0.03566702,
+              -0.046612754,
+              0.030740444,
+              -0.0637836,
+              0.020758858,
+              0.013579259,
+              0.015677635,
+              0.07067559,
+              -0.03354964,
+              -0.09833861,
+              -0.045598283,
+              0.046094477,
+              -0.018735003,
+              0.0013117951,
+              0.020225674,
+              -0.025771514,
+              -0.011772435,
+              0.020403381,
+              0.048393097,
+              -0.001137191,
+              -0.008214463,
+              -0.024194324,
+              0.012559411,
+              0.028170707,
+              -0.038262583,
+              -0.010594243,
+              0.008866333,
+              0.02652175,
+              0.010765866,
+              0.02152175,
+              0.007194773,
+              -0.021046689,
+              -0.047594506,
+              -0.05342931,
+              0.044459403,
+              -0.00075621146,
+              0.021768885,
+              0.061362576,
+              0.03243972,
+              0.023200674,
+              0.012056035,
+              -0.010374278,
+              -0.06796502,
+              -0.0056832493,
+              0.048799623,
+              -0.035878677,
+              -0.020508701,
+              0.03527651,
+              0.096402384,
+              -0.027735645,
+              0.11728837,
+              0.022490505,
+              -0.08394513,
+              -0.010033967,
+              0.024851669,
+              -0.019062884,
+              0.00039440763,
+              -0.10133529,
+              0.011722217,
+              -0.04434193,
+              -0.030069547,
+              0.030103652,
+              -0.017366616,
+              0.046203658,
+              -0.04393208,
+              -0.05095759,
+              -0.04554081,
+              -0.029142734,
+              0.01689045,
+              0.008356038,
+              -0.035321265,
+              -0.02382173,
+              -0.0015672153,
+              0.06304823,
+              -0.008137697,
+              -0.014463008,
+              0.045292154,
+              -0.06497864,
+              0.015265712,
+              0.008239593,
+              -0.08195689,
+              0.037012544,
+              0.04680898,
+              0.007484248,
+              0.02335733,
+              -0.06787198,
+              -0.062197443,
+              -0.06841327,
+              -0.039720036,
+              -0.0105394935,
+              -0.057220835,
+              -0.039479975,
+              0.029730098,
+              0.0697698,
+              0.0280752,
+              0.0137115335,
+              -0.0045632124,
+              -0.01313052,
+              0.07553262,
+              -0.04117193,
+              -0.14872926,
+              0.028015105,
+              -0.047134113,
+              -0.016151398,
+              -0.081647106,
+              -0.02221662,
+              -0.036281105,
+              -0.023036504,
+              0.0612415,
+              -0.018361837,
+              -0.0238258,
+              -0.0022532772,
+              0.1537845,
+              0.006872191,
+              -0.044352733,
+              -0.0026320857,
+              -0.08600976,
+              0.005572628,
+              0.053448226,
+              -0.015072955,
+              -0.029777542,
+              -0.019132927,
+              0.053970527,
+              0.005238485,
+              -0.02418231,
+              -0.12369688,
+              0.0014781327,
+              0.059662092,
+              -0.011181213,
+              0.01400666,
+              0.023866476,
+              -0.059490796,
+              -0.054530527,
+              -0.011234197,
+              0.013823349,
+              -0.012150345,
+              -0.09948839,
+              0.023659766,
+              0.014326883,
+              -0.02229736,
+              -0.0024076505,
+              -0.10091382,
+              0.08174192,
+              -0.024408998,
+              -0.023222951,
+              0.011201234,
+              0.013236311,
+              0.04317295,
+              0.051764306,
+              0.07648576,
+              -0.00061111146,
+              -0.088623054,
+              -0.037177067,
+              0.038964123,
+              -0.029959839,
+              0.033466227,
+              -0.08635276,
+              0.04128183,
+              -0.020397836,
+              0.056285754,
+              -0.02570748,
+              0.05911732,
+              0.0061064134,
+              -0.01733281,
+              -0.0875996,
+              -0.0127257295,
+              -0.013593507,
+              -0.04925175,
+              0.01888016,
+              -0.032455195,
+              -0.023753202,
+              0.052025676,
+              0.06000905,
+              0.04137704,
+              0.004952635,
+              -0.02542677,
+              0.00017748028,
+              -0.041987997,
+              0.04760188,
+              0.068178274,
+              -0.060950078,
+              -0.05742421,
+              0.054274186,
+              -0.048096504,
+              0.034568857,
+              0.0012921172,
+              0.0705816,
+              -0.014679933,
+              -0.001761971,
+              -0.029119784,
+              0.008006632,
+              0.018063113,
+              -0.05880496,
+              -0.052486468,
+              0.010976936,
+              0.03688557,
+              0.061141517,
+              -0.009467033,
+              -0.035062946,
+              -0.06794524,
+              -0.0609979,
+              0.015924038,
+              -0.03805085,
+              0.03977454,
+              -0.015656536,
+              0.014254484,
+              -0.030620195,
+              -0.038830906,
+              -0.013730216,
+              -0.070247106,
+              -0.074514836,
+              0.037831023,
+              0.027780455,
+              0.0073002693,
+              -0.050368425,
+              0.040389538,
+              0.035920046,
+              0.025425838,
+              0.006255748,
+              -0.017454483,
+              -0.02307413,
+              0.05788845,
+              0.018672187,
+              0.033335716,
+              0.01855402,
+              0.07957198,
+              -0.0029801806,
+              -0.057038378,
+              0.010123766,
+              0.038190138,
+              0.0333764,
+              0.075057626,
+              0.00592374,
+              0.06380629,
+              -0.028154025,
+              0.07188246,
+              -0.056649268,
+              -0.019166004,
+              0.053392358,
+              0.13961181,
+              -0.08459373,
+              0.03255955
+            ],
+            "index": 0,
+            "object": "embedding"
+          }
+        ],
+        "model": "togethercomputer/m2-bert-80M-32k-retrieval",
+        "object": "list",
+        "usage": null
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/0c1f45455d3b.json
+++ b/tests/integration/recordings/responses/0c1f45455d3b.json
@ -0,0 +1,59 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://api.together.xyz/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Hello, world!"
+        }
+      ],
+      "stream": false
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "oBUtgGr-4Yz4kd-9801a2f00b2b42e8",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": []
+            },
+            "seed": 1098425109146507500
+          }
+        ],
+        "created": 1758039052,
+        "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": null,
+        "usage": {
+          "completion_tokens": 25,
+          "prompt_tokens": 39,
+          "total_tokens": 64,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null,
+          "cached_tokens": 0
+        },
+        "prompt": []
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/17030e75309f.json
+++ b/tests/integration/recordings/responses/17030e75309f.json
@ -0,0 +1,800 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://api.together.xyz/v1/v1/embeddings",
+    "headers": {},
+    "body": {
+      "model": "togethercomputer/m2-bert-80M-32k-retrieval",
+      "input": "This is completely different content"
+    },
+    "endpoint": "/v1/embeddings",
+    "model": "togethercomputer/m2-bert-80M-32k-retrieval"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.create_embedding_response.CreateEmbeddingResponse",
+      "__data__": {
+        "data": [
+          {
+            "embedding": [
+              0.020581583,
+              0.03996682,
+              0.06342483,
+              -0.046694994,
+              -0.07684763,
+              -0.05265455,
+              -0.053058416,
+              -0.008007386,
+              -0.04512141,
+              0.03718547,
+              -0.026790882,
+              0.039592147,
+              0.08868821,
+              -0.054975007,
+              0.022950895,
+              -0.03249339,
+              0.05376096,
+              0.04878751,
+              0.06144113,
+              0.08925032,
+              -0.06345507,
+              -0.0008829904,
+              0.07914291,
+              -0.028592229,
+              -0.048433058,
+              -0.0351529,
+              0.028880889,
+              -0.08001268,
+              -0.04552556,
+              -0.080687605,
+              0.1400234,
+              0.14326853,
+              0.02891313,
+              -0.05588759,
+              0.007262874,
+              0.026984219,
+              0.09121335,
+              0.050748702,
+              0.017702162,
+              -0.035733465,
+              0.1328057,
+              -0.08973662,
+              -0.050988093,
+              -0.009071953,
+              0.00674055,
+              0.0138731655,
+              -0.024637444,
+              -0.0019375099,
+              0.019351467,
+              0.041681487,
+              0.09368255,
+              0.0052818935,
+              0.027539922,
+              -0.031472813,
+              0.042352878,
+              0.07326235,
+              0.010973438,
+              0.06776053,
+              0.06473745,
+              0.031266563,
+              0.00057834754,
+              -0.002110916,
+              0.16004054,
+              -0.0535361,
+              0.04453045,
+              0.050499436,
+              0.03501775,
+              -0.003733677,
+              0.020598825,
+              -0.079224035,
+              0.07070447,
+              -0.060201976,
+              0.006393084,
+              -0.003781692,
+              0.070510566,
+              -0.047214407,
+              0.06080987,
+              -0.0877733,
+              -0.08569845,
+              -0.018021964,
+              0.06378409,
+              0.027565937,
+              0.038700324,
+              -0.1248613,
+              0.00903349,
+              -0.08429076,
+              0.016536232,
+              0.025240825,
+              0.00043874417,
+              -0.004602262,
+              0.0457946,
+              -0.03598806,
+              0.056914188,
+              0.044693712,
+              0.011178773,
+              -0.020428436,
+              0.036093723,
+              0.031189999,
+              0.07220326,
+              -0.066868156,
+              -0.020061923,
+              -0.0563857,
+              -0.013928966,
+              -0.034524415,
+              0.0041604545,
+              -0.047119446,
+              0.033624567,
+              0.06970587,
+              -0.033320673,
+              -0.0413748,
+              0.01094969,
+              -0.0100499755,
+              0.004480598,
+              0.02067311,
+              -0.021157527,
+              0.022485765,
+              0.03633523,
+              0.0049809627,
+              0.02181411,
+              0.049156368,
+              0.06253565,
+              0.059981186,
+              -0.031591866,
+              -0.049331754,
+              0.033537455,
+              0.021542493,
+              0.009435254,
+              0.025516914,
+              0.025417773,
+              -0.07066102,
+              0.011794456,
+              0.06311989,
+              0.011093616,
+              0.08549021,
+              -0.04281618,
+              0.011115061,
+              0.07443118,
+              0.021961706,
+              -0.02724888,
+              -0.00047235374,
+              0.016601468,
+              0.043411057,
+              0.03835865,
+              0.01029931,
+              0.008437206,
+              -0.057274926,
+              -0.045377273,
+              -0.09733081,
+              -0.009755395,
+              0.028172465,
+              0.043972567,
+              0.0968819,
+              0.052496422,
+              0.031553026,
+              -0.019291716,
+              0.034150966,
+              0.1310106,
+              0.02864821,
+              -0.047452684,
+              0.016342362,
+              -0.06591784,
+              -0.064888336,
+              -0.03380424,
+              -0.08384223,
+              0.023302404,
+              -0.020427782,
+              0.019540966,
+              0.02240307,
+              0.026848866,
+              -0.0018868797,
+              -0.031800512,
+              -0.073483676,
+              0.08840526,
+              -0.02696041,
+              -0.042041607,
+              0.030633071,
+              0.020918656,
+              0.06119309,
+              -0.048348967,
+              0.036555305,
+              0.033583682,
+              0.019630525,
+              -0.03500669,
+              -0.020821452,
+              0.012256841,
+              0.06733756,
+              0.036884613,
+              -0.080063485,
+              0.019956889,
+              -0.01994667,
+              0.0011630546,
+              -0.08307688,
+              -0.040326167,
+              -0.03293244,
+              -0.014897417,
+              0.03977495,
+              0.036790676,
+              0.020645684,
+              0.015943283,
+              -0.05961047,
+              0.036905374,
+              0.006005009,
+              0.033375766,
+              -0.015491932,
+              -0.07008363,
+              -0.031575754,
+              -0.0065630106,
+              -0.013962699,
+              -0.012629252,
+              0.046026245,
+              0.007901817,
+              -0.117550366,
+              -0.06314231,
+              0.05348636,
+              0.10863247,
+              0.053361807,
+              0.055756297,
+              -0.026388792,
+              -0.011777907,
+              -0.07197253,
+              0.010918023,
+              0.020021347,
+              0.14850953,
+              -0.043404948,
+              -0.04262303,
+              -0.04904758,
+              -0.014644666,
+              -0.0018742547,
+              -0.0054880613,
+              -0.015058903,
+              -0.03137978,
+              -0.09884002,
+              0.048087206,
+              -0.00044948232,
+              -0.059237186,
+              0.01681299,
+              0.06357592,
+              0.09665662,
+              -0.032431144,
+              -0.021346267,
+              -0.03630939,
+              0.108024776,
+              0.011421504,
+              0.00090062595,
+              0.09738569,
+              0.07588425,
+              -0.038476508,
+              0.008637763,
+              0.03942589,
+              0.03673421,
+              -0.008536316,
+              -0.035427485,
+              -0.0571462,
+              0.077514425,
+              -0.014574157,
+              -0.06636753,
+              0.0356625,
+              0.00055575924,
+              -0.008948914,
+              0.00082343427,
+              0.0511982,
+              0.03143358,
+              -0.03388075,
+              -0.013724427,
+              0.0551338,
+              -0.007191376,
+              -0.05363105,
+              -0.07718383,
+              -0.008230843,
+              0.10335533,
+              0.013668598,
+              -0.08284561,
+              0.05179483,
+              -0.08437943,
+              -0.017510848,
+              -0.05778264,
+              0.044004828,
+              -0.02612715,
+              -0.0058190715,
+              0.013293448,
+              -0.005663543,
+              0.0037016177,
+              -0.020699238,
+              0.00277368,
+              0.041328322,
+              -0.052624915,
+              0.020320976,
+              0.0033441507,
+              -0.11465616,
+              -0.059619453,
+              -0.029252917,
+              0.014145012,
+              -0.049234822,
+              0.025969574,
+              0.04118447,
+              0.017938918,
+              -0.009885965,
+              0.012801603,
+              -0.0007332413,
+              -0.0012993023,
+              -0.052635074,
+              0.064850755,
+              0.004576457,
+              -0.018446025,
+              -0.069130346,
+              0.018532049,
+              0.006330208,
+              0.039377607,
+              0.11237417,
+              0.055357743,
+              -0.0038629018,
+              0.048188694,
+              0.052925084,
+              -0.011272187,
+              -0.012422014,
+              0.005874242,
+              -0.0007749841,
+              -0.058404274,
+              -0.022589723,
+              0.031956926,
+              0.0470711,
+              0.027993023,
+              -0.06112344,
+              -0.0119517995,
+              -0.09797626,
+              -0.073644884,
+              0.07465703,
+              0.09884925,
+              -0.035564825,
+              -0.040369682,
+              0.014445328,
+              -0.052219898,
+              -0.027498178,
+              0.036846854,
+              -0.09408649,
+              -0.00027856976,
+              0.028489627,
+              0.002446708,
+              -0.043065134,
+              -0.030562297,
+              0.07565528,
+              -0.0256914,
+              -0.12143018,
+              0.09360902,
+              0.015026368,
+              0.058814585,
+              -0.01885037,
+              0.04901136,
+              0.009521308,
+              -0.0067844316,
+              -0.06265128,
+              0.029733902,
+              0.019703392,
+              -0.029863501,
+              0.033668272,
+              -0.015967827,
+              -0.024716265,
+              0.07095029,
+              0.07264489,
+              -0.021480447,
+              -0.040650267,
+              -0.11752601,
+              0.019378915,
+              -0.042310815,
+              0.05690114,
+              -0.01413233,
+              0.058113046,
+              -0.073345415,
+              -0.059576523,
+              -0.09720947,
+              0.012149926,
+              0.057291746,
+              -0.03505685,
+              -0.038375836,
+              0.0149342865,
+              -0.001562935,
+              -0.023513826,
+              0.00014910847,
+              0.022598296,
+              -0.071317434,
+              -0.06260575,
+              4.0522777e-05,
+              -0.086758316,
+              -0.013101295,
+              -0.02990748,
+              -0.08461068,
+              0.016139807,
+              0.06101953,
+              -0.08451055,
+              -0.046145856,
+              -0.048467644,
+              0.060105037,
+              0.024200678,
+              0.052542347,
+              0.041119967,
+              -0.0068898834,
+              0.09487794,
+              0.012641435,
+              -0.13026047,
+              0.06284531,
+              0.018659385,
+              -0.07564698,
+              0.006965884,
+              -0.036618453,
+              0.118192144,
+              -0.04771263,
+              0.023280941,
+              0.054039616,
+              -0.114724584,
+              -0.0918062,
+              0.038803104,
+              -0.09954885,
+              0.008216844,
+              -0.030975524,
+              -0.030176945,
+              0.0397766,
+              -0.0061745024,
+              0.071971394,
+              -0.041089423,
+              0.033857126,
+              0.03961017,
+              -0.03826589,
+              0.038435444,
+              -0.0860421,
+              0.08869605,
+              -0.028628873,
+              -0.05565758,
+              0.056920726,
+              0.020458337,
+              0.05994542,
+              0.08241441,
+              0.0400861,
+              -0.0045191804,
+              0.0030094406,
+              -0.007466077,
+              -0.02953672,
+              -0.068642505,
+              0.060889505,
+              -0.029501854,
+              -0.048823155,
+              0.015409609,
+              0.018862283,
+              -0.016425489,
+              -0.087497436,
+              0.067643866,
+              -0.033761434,
+              -0.054749027,
+              -0.03657711,
+              0.038102675,
+              -0.06197178,
+              0.045409728,
+              -0.02127562,
+              0.064449035,
+              -0.0056471447,
+              0.067553245,
+              -0.07137091,
+              0.017407946,
+              -0.09813906,
+              -0.046500444,
+              -0.058283363,
+              -0.018302118,
+              -0.025382183,
+              -0.04259567,
+              0.022398086,
+              -0.09098867,
+              0.043438766,
+              -0.07656342,
+              0.0028111413,
+              0.030880956,
+              -0.07750997,
+              0.07084878,
+              0.05344556,
+              0.0052658613,
+              -0.025303314,
+              -0.04759683,
+              -0.017034022,
+              0.02855913,
+              -0.04999449,
+              0.01974624,
+              0.07708244,
+              -0.011766297,
+              0.057390995,
+              -0.04652422,
+              0.023833811,
+              0.05608237,
+              0.05765577,
+              0.05078112,
+              0.046039928,
+              -0.055372067,
+              -0.044933185,
+              -0.08522771,
+              -0.09142792,
+              0.012817157,
+              -0.026148932,
+              -0.07331254,
+              0.11312438,
+              0.055893615,
+              -0.013500698,
+              0.008603385,
+              0.00057156937,
+              -0.091709465,
+              0.08057745,
+              -0.011340835,
+              -0.016915537,
+              0.0011427286,
+              0.09740327,
+              -0.029696029,
+              -0.047760956,
+              0.015541391,
+              0.0955123,
+              0.021890407,
+              -0.02908531,
+              0.030994056,
+              0.03820344,
+              -0.062488347,
+              0.015730608,
+              0.021182666,
+              -0.043783836,
+              0.02782434,
+              0.11151618,
+              0.052450567,
+              0.00037089732,
+              0.03351987,
+              -0.0054050605,
+              -0.033424556,
+              0.10350312,
+              0.065157756,
+              0.03392563,
+              0.010131469,
+              -0.053846426,
+              -0.0022781377,
+              0.0014610494,
+              0.005763698,
+              0.0426489,
+              -0.08206464,
+              -0.07099776,
+              -0.04228286,
+              0.07337842,
+              0.047744617,
+              0.04284143,
+              0.06959166,
+              0.013133698,
+              -0.030711556,
+              0.009055728,
+              0.06162162,
+              0.017240932,
+              -0.039795205,
+              -0.10877084,
+              0.024329182,
+              -0.0049141976,
+              -0.038892467,
+              -0.012901915,
+              -0.095080145,
+              0.05290344,
+              0.021141307,
+              0.03017632,
+              -0.0044154925,
+              -0.10163907,
+              -0.08186605,
+              -0.023801327,
+              0.035552323,
+              0.039041802,
+              -0.032427292,
+              0.07541,
+              0.10233232,
+              0.018622704,
+              -0.013646388,
+              -0.008619573,
+              0.020216271,
+              -0.07897946,
+              0.063637026,
+              -0.08652915,
+              -0.0100032855,
+              0.046902858,
+              0.076707095,
+              0.02531022,
+              0.05425257,
+              0.015954422,
+              -0.033368777,
+              -0.025112148,
+              -0.01394599,
+              -0.04062625,
+              0.056534503,
+              -0.04304168,
+              -0.060214523,
+              0.016551849,
+              -0.006314451,
+              0.060458317,
+              0.027808908,
+              0.040655438,
+              -0.031415448,
+              -0.120496035,
+              -0.04355332,
+              0.002170874,
+              0.013876282,
+              -0.011508199,
+              -0.046841078,
+              0.076444104,
+              0.08982719,
+              0.0846208,
+              0.029678846,
+              -0.086331986,
+              0.14421903,
+              -0.0030989156,
+              0.01598773,
+              0.059804816,
+              -0.0464971,
+              -0.0058899643,
+              0.02542227,
+              -0.020552263,
+              0.10621325,
+              -0.023809364,
+              -0.13324538,
+              -0.075492345,
+              0.06716611,
+              -0.040477127,
+              -0.046582364,
+              -0.07376809,
+              0.024235222,
+              0.070477486,
+              0.11006968,
+              -0.04869493,
+              0.078016356,
+              -0.07615679,
+              0.08063025,
+              -0.016255612,
+              -0.051746953,
+              0.08059405,
+              -0.0025989392,
+              -0.073428795,
+              -0.03987752,
+              0.098251894,
+              -0.006217126,
+              -0.028130062,
+              -0.051326722,
+              -0.0470711,
+              -0.016759045,
+              -0.039230157,
+              -0.020525763,
+              0.07148479,
+              -0.05419997,
+              -0.025775867,
+              0.0070432695,
+              -0.006410803,
+              0.027631486,
+              0.037966132,
+              -0.025654731,
+              -0.023324372,
+              0.026257442,
+              -0.034822363,
+              -0.010826962,
+              0.020623349,
+              0.0523646,
+              -0.022230538,
+              0.028196862,
+              0.023292363,
+              0.12025986,
+              -0.022648653,
+              -0.061013527,
+              -0.040045265,
+              0.022293845,
+              -0.016287014,
+              -0.08896512,
+              -0.021426601,
+              0.05109808,
+              0.038455352,
+              0.055882193,
+              0.10342665,
+              0.06503611,
+              0.07195616,
+              -0.013601524,
+              0.028618002,
+              0.03990776,
+              0.03236452,
+              0.07085622,
+              0.0055737793,
+              0.013130723,
+              -0.066394895,
+              0.021342268,
+              0.0026651763,
+              -0.012577644,
+              0.049445108,
+              0.049437333,
+              0.0047207237,
+              -0.02006381,
+              0.02022424,
+              0.05142978,
+              0.01725655,
+              0.00037797724,
+              0.039846063,
+              -0.11509461,
+              -0.013602717,
+              -0.066661686,
+              -0.020612884,
+              0.012832718,
+              -0.091352694,
+              -0.09389515,
+              0.07369748,
+              0.056452867,
+              0.10581744,
+              -0.06383743,
+              0.036662158,
+              -0.07204409,
+              0.012689036,
+              -0.025724197,
+              0.040817674,
+              -0.06890574,
+              0.0055584335,
+              0.031956017,
+              0.0014588524,
+              0.098465145,
+              0.0054196557,
+              0.056656968,
+              0.03322914,
+              -0.040962957,
+              -0.015689995,
+              -0.034545593,
+              -0.052660752,
+              -0.044768244,
+              -0.04419147,
+              -0.11039146,
+              0.015522225,
+              0.0052053384,
+              -0.08471112,
+              0.025280464,
+              -0.03353502,
+              -0.018717872,
+              -0.020738749,
+              0.0021664763,
+              -0.011238148,
+              0.02322494,
+              0.010894536,
+              -0.09676859,
+              0.01013113,
+              0.0035604087,
+              -0.0060942546,
+              -0.027839229,
+              -0.0037214137,
+              0.053193003,
+              -0.070640355,
+              -0.07783396,
+              0.005814805,
+              0.0064411093,
+              -0.023913933,
+              0.030543711,
+              -0.07979223,
+              -0.008982119,
+              0.043360766,
+              -0.048063844,
+              0.0017047173,
+              0.06882568,
+              -0.03443207,
+              0.015080402,
+              -0.049461022,
+              0.045471057,
+              -0.031460688,
+              -0.0028212033,
+              0.044725604,
+              0.0026248703,
+              -0.0329393,
+              -0.034404054,
+              0.024516258,
+              0.002614168,
+              -0.047855787,
+              -0.03149,
+              0.14646776,
+              -0.047660008,
+              0.021453902
+            ],
+            "index": 0,
+            "object": "embedding"
+          }
+        ],
+        "model": "togethercomputer/m2-bert-80M-32k-retrieval",
+        "object": "list",
+        "usage": null
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/27463384d1a3.json
+++ b/tests/integration/recordings/responses/27463384d1a3.json
@ -0,0 +1,56 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:8080/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "Qwen/Qwen3-0.6B",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Hello, world!"
+        }
+      ],
+      "stream": false
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "Qwen/Qwen3-0.6B"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "<think>\nOkay, the user just said \"Hello, world!\" so I need to respond in a friendly way. My prompt says to respond in the same style, so I should start with \"Hello, world!\" but maybe add some helpful information. Let me think. Since the user is probably testing or just sharing, a simple \"Hello, world!\" with a question would be best for user interaction. I'll make sure to keep it positive and open-ended.\n</think>\n\nHello, world! \ud83d\ude0a What do you need today?",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 1757550395,
+        "model": "Qwen/Qwen3-0.6B",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "3.3.5-dev0-sha-1b90c50",
+        "usage": {
+          "completion_tokens": 108,
+          "prompt_tokens": 12,
+          "total_tokens": 120,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/3a81146f2afa.json
+++ b/tests/integration/recordings/responses/3a81146f2afa.json
@ -0,0 +1,990 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "prompt": "Respond to this question and explain your answer. Complete the sentence using one word: Roses are red, violets are ",
+      "max_tokens": 50,
+      "stream": true,
+      "extra_body": {}
+    },
+    "endpoint": "/v1/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": "Blue"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": ".\n\n"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": "The"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " completed"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " sentence"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " is"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " a"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " well"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": "-known"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " phrase"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " from"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " a"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " traditional"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " English"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " poem"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": ":\n\n"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": "\""
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": "R"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": "oses"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " are"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " red"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": ","
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " v"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": "io"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": "lets"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " are"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " blue"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": ",\n"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": "Sugar"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " is"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " sweet"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": ","
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " and"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " so"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " are"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " you"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": ".\""
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " However"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": ","
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " in"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " many"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " variations"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " of"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " this"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " poem"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": ","
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " the"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " line"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " \""
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": "vio"
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.completion.Completion",
+        "__data__": {
+          "id": "cmpl-439",
+          "choices": [
+            {
+              "finish_reason": "length",
+              "index": 0,
+              "logprobs": null,
+              "text": ""
+            }
+          ],
+          "created": 1757857132,
+          "model": "llama3.2:3b-instruct-fp16",
+          "object": "text_completion",
+          "system_fingerprint": "fp_ollama",
+          "usage": null
+        }
+      }
+    ],
+    "is_streaming": true
+  }
+}
--- a/tests/integration/recordings/responses/432a346b2ed8.json
+++ b/tests/integration/recordings/responses/432a346b2ed8.json
--- a/tests/integration/recordings/responses/4ca6152a0eb8.json
+++ b/tests/integration/recordings/responses/4ca6152a0eb8.json
@ -0,0 +1,59 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://api.together.xyz/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Which planet has rings around it with a name starting with letter S?"
+        }
+      ],
+      "stream": false
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "oBUtaEp-62bZhn-9801a2718d0ed123",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "The planet with rings around it that starts with the letter S is Saturn. Saturn's ring system is one of the most prominent and well-known in our solar system.",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": []
+            },
+            "seed": 2387155844510162400
+          }
+        ],
+        "created": 1758039032,
+        "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": null,
+        "usage": {
+          "completion_tokens": 34,
+          "prompt_tokens": 49,
+          "total_tokens": 83,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null,
+          "cached_tokens": 0
+        },
+        "prompt": []
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/4d4440c8641b.json
+++ b/tests/integration/recordings/responses/4d4440c8641b.json
--- a/tests/integration/recordings/responses/511eb1b92e34.json
+++ b/tests/integration/recordings/responses/511eb1b92e34.json
--- a/tests/integration/recordings/responses/565b1072cb9d.json
+++ b/tests/integration/recordings/responses/565b1072cb9d.json
@ -0,0 +1,46 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://api.together.xyz/v1/v1/completions",
+    "headers": {},
+    "body": {
+      "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+      "prompt": "Respond to this question and explain your answer. Complete the sentence using one word: Roses are red, violets are ",
+      "stream": false,
+      "extra_body": {}
+    },
+    "endpoint": "/v1/completions",
+    "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.completion.Completion",
+      "__data__": {
+        "id": "oBUswCe-62bZhn-98019f663cac0f68",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "text": " _______________________. \n\n## Step 1: Identify the traditional completion of the sentence.\nThe traditional completion of the sentence \"Roses are red, violets are...\" is based on a well-known poem.\n\n## Step 2: Recall the poem.\nThe poem states, \"Roses are red, violets are blue...\"\n\n## Step 3: Determine the word that completes the sentence.\nBased on the poem, the word that completes the sentence is \"blue\".\n\nThe final answer is: $\\boxed{blue}$",
+            "seed": 4892505926413923000
+          }
+        ],
+        "created": 1758038908,
+        "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+        "object": "text.completion",
+        "system_fingerprint": null,
+        "usage": {
+          "completion_tokens": 106,
+          "prompt_tokens": 25,
+          "total_tokens": 131,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null,
+          "cached_tokens": 0
+        },
+        "prompt": []
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/6412295819a1.json
+++ b/tests/integration/recordings/responses/6412295819a1.json
@ -0,0 +1,43 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "prompt": "Respond to this question and explain your answer. Complete the sentence using one word: Roses are red, violets are ",
+      "stream": false,
+      "extra_body": {}
+    },
+    "endpoint": "/v1/completions",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.completion.Completion",
+      "__data__": {
+        "id": "cmpl-104",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "text": "blue.\n\nI completed the sentence with \"blue\" because it is a common completion used to complete the traditional nursery rhyme, which ends with:\n\nRoses are red,\nViolets are blue.\n\nThe complete rhyme is often remembered and recited as follows:\n\nRoses are red,\nViolets are blue,\nSugar is sweet,\nAnd so are you!"
+          }
+        ],
+        "created": 1757857132,
+        "model": "llama3.2:3b-instruct-fp16",
+        "object": "text_completion",
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 72,
+          "prompt_tokens": 50,
+          "total_tokens": 122,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/6730dcde0b73.json
+++ b/tests/integration/recordings/responses/6730dcde0b73.json
@ -0,0 +1,756 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://api.together.xyz/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Hello, world!"
+        }
+      ],
+      "stream": true
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": "Hello",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 9906
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": "Hello",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": "!",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": "!",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " It",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 1102
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " It",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": "'s",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 596
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": "'s",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " nice",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 6555
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " nice",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " to",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 311
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " to",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " meet",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 3449
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " meet",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " you",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 499
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " you",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": ".",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 13
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": ".",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " Is",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 2209
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " Is",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " there",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 1070
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " there",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " something",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 2555
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " something",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " I",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 358
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " I",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " can",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 649
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " can",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " help",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 1520
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " help",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " you",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 499
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " you",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " with",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 449
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " with",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " or",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 477
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " or",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " would",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 1053
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " would",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " you",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 499
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " you",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " like",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 1093
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " like",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " to",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 311
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " to",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": " chat",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 6369
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": " chat",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": "?",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 30
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null,
+              "text": "?",
+              "seed": null
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtTzC-62bZhn-9801a1ee1bea25d8",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 128009
+              },
+              "finish_reason": "stop",
+              "index": 0,
+              "logprobs": null,
+              "text": "",
+              "seed": 16158686754257986000
+            }
+          ],
+          "created": 1758039011,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": {
+            "completion_tokens": 25,
+            "prompt_tokens": 39,
+            "total_tokens": 64,
+            "completion_tokens_details": null,
+            "prompt_tokens_details": null,
+            "cached_tokens": 0
+          }
+        }
+      }
+    ],
+    "is_streaming": true
+  }
+}
--- a/tests/integration/recordings/responses/6857b19d3f0a.json
+++ b/tests/integration/recordings/responses/6857b19d3f0a.json
@ -0,0 +1,87 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://api.together.xyz/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+      "messages": [
+        {
+          "role": "user",
+          "content": "What's the weather in Tokyo? Use the get_weather function to get the weather."
+        }
+      ],
+      "stream": false,
+      "tools": [
+        {
+          "type": "function",
+          "function": {
+            "name": "get_weather",
+            "description": "Get the weather in a given city",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "city": {
+                  "type": "string",
+                  "description": "The city to get the weather for"
+                }
+              }
+            }
+          }
+        }
+      ]
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "oBUth9w-62bZhn-9801a3026bd20c8a",
+        "choices": [
+          {
+            "finish_reason": "tool_calls",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": null,
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": [
+                {
+                  "id": "call_8prwkicthj6bjfqa9ye64y2b",
+                  "function": {
+                    "arguments": "{\"city\":\"Tokyo\"}",
+                    "name": "get_weather"
+                  },
+                  "type": "function",
+                  "index": 0
+                }
+              ]
+            },
+            "seed": 977986247412336500
+          }
+        ],
+        "created": 1758039055,
+        "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": null,
+        "usage": {
+          "completion_tokens": 24,
+          "prompt_tokens": 193,
+          "total_tokens": 217,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null,
+          "cached_tokens": 0
+        },
+        "prompt": []
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/6c4e2e207e8a.json
+++ b/tests/integration/recordings/responses/6c4e2e207e8a.json
@ -0,0 +1,59 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://api.together.xyz/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Which planet do humans live on?"
+        }
+      ],
+      "stream": false
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "oBUtMpf-62bZhn-9801a16bc8d642d3",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "Humans live on Earth.",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": []
+            },
+            "seed": 14150443913665712000
+          }
+        ],
+        "created": 1758038990,
+        "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": null,
+        "usage": {
+          "completion_tokens": 6,
+          "prompt_tokens": 42,
+          "total_tokens": 48,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null,
+          "cached_tokens": 0
+        },
+        "prompt": []
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/72e075bf28e8.json
+++ b/tests/integration/recordings/responses/72e075bf28e8.json
@ -0,0 +1,800 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://api.together.xyz/v1/v1/embeddings",
+    "headers": {},
+    "body": {
+      "model": "togethercomputer/m2-bert-80M-32k-retrieval",
+      "input": "Hello, world!"
+    },
+    "endpoint": "/v1/embeddings",
+    "model": "togethercomputer/m2-bert-80M-32k-retrieval"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.create_embedding_response.CreateEmbeddingResponse",
+      "__data__": {
+        "data": [
+          {
+            "embedding": [
+              -0.017041557,
+              -0.07436493,
+              0.02897635,
+              -0.032216743,
+              0.0056444216,
+              -0.029015187,
+              0.06512343,
+              -0.040310342,
+              0.05263593,
+              0.0068842396,
+              0.019191971,
+              -0.0064884443,
+              -0.01664521,
+              0.014244285,
+              0.036390014,
+              -0.040292,
+              0.031780273,
+              0.0039553884,
+              -0.055303488,
+              -0.028992416,
+              -0.02059435,
+              0.05677091,
+              -0.043668333,
+              -0.014273451,
+              0.15328151,
+              -0.023603301,
+              -0.049825363,
+              0.007869072,
+              -0.010882995,
+              -0.033912696,
+              0.053697765,
+              -0.00093928695,
+              0.0017799847,
+              0.038871024,
+              -0.069678165,
+              -0.067093275,
+              0.025772842,
+              -0.057590123,
+              -0.015825877,
+              0.020131286,
+              0.020742312,
+              0.003915491,
+              -0.018451879,
+              0.020440312,
+              -0.023613403,
+              -0.039568678,
+              -0.013152008,
+              -0.01871725,
+              0.021348018,
+              -0.019964654,
+              0.038607903,
+              0.018397795,
+              -0.0063561443,
+              -0.018936336,
+              -0.060981557,
+              -0.02152846,
+              0.027057847,
+              0.0014626224,
+              -0.018241309,
+              -0.07473041,
+              -0.02377323,
+              -0.033910733,
+              0.02569418,
+              -0.024951216,
+              -0.0076659806,
+              -0.015425462,
+              0.006604636,
+              0.09833969,
+              -0.005054596,
+              0.008841989,
+              -0.01836461,
+              -0.018554095,
+              0.011605144,
+              -0.016599955,
+              -0.062196333,
+              -0.0037542647,
+              -0.025220644,
+              -0.027834827,
+              -0.020460974,
+              -0.050503097,
+              0.032119684,
+              -0.023387104,
+              0.050067227,
+              -0.05834235,
+              0.023189448,
+              -0.021862485,
+              0.023831544,
+              -0.016663097,
+              -0.041609522,
+              0.025361128,
+              0.002924296,
+              0.01852158,
+              0.08960255,
+              -0.003265466,
+              -0.058762494,
+              -0.06428431,
+              -0.014671485,
+              -0.046800107,
+              0.02691456,
+              -0.0059303525,
+              -0.015431455,
+              0.022179665,
+              0.014044907,
+              0.012218545,
+              0.0053836405,
+              -0.025096457,
+              0.009438382,
+              0.032498095,
+              0.06879721,
+              0.056900814,
+              0.019497631,
+              -0.122159146,
+              -0.106994465,
+              -0.017456975,
+              0.047223866,
+              0.06569824,
+              0.04780035,
+              0.018039258,
+              -0.0011028647,
+              -0.05067006,
+              0.0106863845,
+              0.027489506,
+              -0.014593985,
+              -0.039851535,
+              -0.09175489,
+              0.037555773,
+              -0.060439512,
+              0.008525801,
+              0.0071557434,
+              -0.057973035,
+              -0.054225244,
+              0.051505033,
+              -0.0008626373,
+              0.069083415,
+              0.064380065,
+              0.09843996,
+              0.0062191207,
+              -0.041505292,
+              -0.05381256,
+              -0.0073601264,
+              -0.03288613,
+              0.011711341,
+              -0.09244605,
+              0.0069717136,
+              -0.05722877,
+              0.041075893,
+              0.06521969,
+              -0.0018537377,
+              0.016272636,
+              0.008761483,
+              -0.029342752,
+              0.020412564,
+              -0.07015791,
+              0.033616304,
+              0.039998446,
+              0.01602917,
+              0.044467725,
+              -0.08176377,
+              -0.036885373,
+              0.03468746,
+              0.0024068495,
+              0.00056306267,
+              0.02546511,
+              -0.053339135,
+              -0.027220095,
+              -0.021510394,
+              0.054806393,
+              -0.005447777,
+              -0.05690438,
+              -0.028497366,
+              0.01873974,
+              -0.035461064,
+              -0.00019089226,
+              -0.04914238,
+              0.030303763,
+              0.013396073,
+              0.015789565,
+              -0.07714792,
+              -0.062155712,
+              -0.00677417,
+              0.02850476,
+              0.031491462,
+              0.014566345,
+              0.012163924,
+              0.11814501,
+              -0.0043511004,
+              -0.017920421,
+              0.004205825,
+              -0.0015928322,
+              -0.012145554,
+              0.01663168,
+              -0.071173735,
+              0.0029570858,
+              0.12899451,
+              0.004157568,
+              0.010501232,
+              0.07710632,
+              0.062119417,
+              0.021002673,
+              -0.023212241,
+              -0.04327007,
+              -0.0567023,
+              0.04590105,
+              0.0019161925,
+              0.02637205,
+              0.029331107,
+              -0.029769177,
+              -0.050466795,
+              -0.08057371,
+              0.007419741,
+              -0.008777471,
+              0.02217743,
+              0.013535721,
+              0.03426775,
+              0.04592361,
+              0.009423588,
+              -0.023030678,
+              -0.024462381,
+              0.054334357,
+              0.06710402,
+              0.077300854,
+              0.0300022,
+              -0.0035417816,
+              -0.0046773576,
+              -0.0927158,
+              -0.0218652,
+              -0.043468982,
+              -0.035734102,
+              -0.038873542,
+              -0.0412869,
+              -0.016015923,
+              0.0038303286,
+              0.08523618,
+              -0.05200533,
+              -0.014904317,
+              -0.016793448,
+              0.04478206,
+              -0.017161047,
+              0.02638292,
+              0.007849463,
+              -0.040533304,
+              -0.017599737,
+              0.047704253,
+              0.034988616,
+              -0.013908102,
+              0.044121094,
+              0.040395457,
+              -0.010402818,
+              0.0063570403,
+              -0.014962749,
+              0.025776524,
+              0.023681043,
+              0.006042675,
+              0.017647373,
+              0.016301101,
+              -0.07793374,
+              -0.004771094,
+              0.012728924,
+              -0.00047885205,
+              -0.051591527,
+              0.03612118,
+              -0.02209703,
+              0.052075963,
+              -0.021613466,
+              -0.026258182,
+              0.008102769,
+              -0.04963262,
+              0.00062747014,
+              -0.012579783,
+              0.076374784,
+              -0.047350414,
+              -0.007680664,
+              0.062471915,
+              -0.0061351187,
+              -0.043617643,
+              0.023878522,
+              -0.09653609,
+              0.018392054,
+              -0.039719462,
+              0.065271765,
+              0.034548305,
+              0.004219043,
+              -0.003628092,
+              0.0047836183,
+              0.0132732885,
+              -0.028140727,
+              -0.015683327,
+              -0.052812085,
+              -0.019410037,
+              0.06812139,
+              -0.041178964,
+              0.014646207,
+              -0.0037439142,
+              0.0003088275,
+              -0.04985693,
+              0.0223661,
+              0.008887433,
+              0.0049061268,
+              0.042707395,
+              -0.021471359,
+              -0.06471383,
+              0.0022036259,
+              0.030178884,
+              -0.002764245,
+              -0.0063233464,
+              -0.04146522,
+              -0.008236624,
+              0.0037351896,
+              -0.027550086,
+              -0.0137326885,
+              0.0055276263,
+              0.0016785853,
+              0.050191414,
+              0.02629574,
+              -0.009129228,
+              0.06351977,
+              -0.037435655,
+              0.0467174,
+              -0.012987377,
+              -0.007550927,
+              -0.004503205,
+              0.010520655,
+              0.064984836,
+              0.009879768,
+              0.055787366,
+              -0.042653065,
+              0.024189176,
+              0.0378726,
+              -0.032453574,
+              0.043519154,
+              0.020133087,
+              -0.055212636,
+              -0.016188117,
+              0.03764466,
+              -0.022142444,
+              0.11164031,
+              0.019020407,
+              -0.008950892,
+              0.0517199,
+              0.0014494535,
+              0.041113462,
+              -0.0912906,
+              -0.04723132,
+              0.008548748,
+              0.028231544,
+              0.023689618,
+              -0.039103802,
+              -0.034011997,
+              -0.04731894,
+              0.03309799,
+              -0.044572156,
+              -0.116778485,
+              -0.028786778,
+              0.05798776,
+              0.05287191,
+              -0.0039562676,
+              -0.08213019,
+              -0.01224603,
+              -0.012757768,
+              0.035721667,
+              0.012440343,
+              0.0053813523,
+              -0.072770126,
+              0.0066190604,
+              0.038976185,
+              -0.037760906,
+              -0.0031381482,
+              -0.052277293,
+              -0.016870236,
+              -0.053451907,
+              -0.05629483,
+              -0.034493946,
+              -0.0048654405,
+              0.022051724,
+              0.028501945,
+              0.025858566,
+              -0.023936177,
+              -0.098391004,
+              -0.030646492,
+              -0.049461726,
+              -0.00086931954,
+              0.03593346,
+              0.015843417,
+              -0.03276966,
+              0.008957432,
+              -0.022735167,
+              -0.012159252,
+              0.07607085,
+              -0.059834506,
+              0.004478244,
+              0.03439635,
+              0.03683821,
+              0.062883355,
+              0.054430448,
+              -0.029807799,
+              0.0032295138,
+              0.08891875,
+              -0.026941199,
+              -0.00618463,
+              -0.022683868,
+              -0.024138795,
+              -0.036633875,
+              0.02097464,
+              -0.003001584,
+              0.020455033,
+              0.043717608,
+              0.06566654,
+              -0.029039463,
+              -0.0066977167,
+              -0.04504434,
+              0.022257777,
+              0.054422457,
+              0.029796708,
+              0.009008146,
+              0.028205348,
+              0.06255052,
+              -0.004475601,
+              0.059329458,
+              -0.038065027,
+              -0.027933009,
+              -0.07060949,
+              0.013978787,
+              -0.051300917,
+              0.02945564,
+              -0.008552103,
+              -0.009436655,
+              0.039747514,
+              -0.016741823,
+              0.04740887,
+              0.03521937,
+              -0.012574282,
+              -0.089222826,
+              -0.043515395,
+              -0.04158566,
+              0.0016020355,
+              0.02684753,
+              -0.019394692,
+              -0.02156877,
+              0.06316388,
+              0.01663444,
+              0.015482924,
+              0.047349654,
+              -0.028341234,
+              0.013805591,
+              -0.010708488,
+              -0.07627738,
+              0.08611209,
+              0.0089956885,
+              0.034438204,
+              0.016312746,
+              -0.03412846,
+              0.0770598,
+              -0.06790466,
+              0.036359854,
+              0.08038976,
+              0.023465984,
+              -0.019832904,
+              -0.0011524013,
+              -0.03804293,
+              0.04106918,
+              -0.028220456,
+              0.032340813,
+              -0.030669356,
+              -0.004353358,
+              -0.019439798,
+              0.0020563425,
+              0.03015629,
+              -0.06430176,
+              0.0034439075,
+              -0.045720384,
+              -0.06526568,
+              -0.0004192516,
+              -0.016580455,
+              -0.012596616,
+              0.039126,
+              -0.04699455,
+              -0.008973794,
+              0.015056125,
+              0.018929023,
+              -0.07840811,
+              -0.014792519,
+              -0.0044317124,
+              0.019588342,
+              0.035912346,
+              -0.035739247,
+              0.058755044,
+              -0.01856197,
+              0.021155646,
+              -0.073580906,
+              -0.04310776,
+              -0.023147091,
+              -0.010232029,
+              0.06352039,
+              0.039570276,
+              0.020424508,
+              0.051613245,
+              0.013395984,
+              -0.003908009,
+              -0.04643392,
+              0.019592889,
+              -0.008484923,
+              0.0031434586,
+              -0.046069775,
+              -0.01765311,
+              -0.041277196,
+              -0.070297986,
+              0.012561737,
+              -0.003500738,
+              -0.01729488,
+              -0.0033254062,
+              0.053035453,
+              -0.054218896,
+              -0.029708259,
+              -0.0047281524,
+              0.019236762,
+              -0.12249525,
+              0.03018237,
+              -0.028753102,
+              -0.031858314,
+              0.0811298,
+              -0.005711499,
+              -0.057587985,
+              0.014153141,
+              0.0006705577,
+              -0.024263157,
+              0.016729265,
+              -0.03195949,
+              -0.007259763,
+              -0.0035231581,
+              -0.03890975,
+              0.011460382,
+              -0.06591321,
+              -0.023756726,
+              -0.023958001,
+              0.030074941,
+              -0.0040949634,
+              -0.048368257,
+              -0.029692868,
+              0.027246583,
+              -0.024747347,
+              0.014442731,
+              -0.00832639,
+              -0.0002390868,
+              -0.013635633,
+              0.0035843733,
+              0.02354072,
+              -0.012829061,
+              -0.0060750768,
+              -0.044952527,
+              -0.05725624,
+              0.031746052,
+              -0.024419094,
+              0.032444403,
+              -0.029308707,
+              0.034302235,
+              -0.022495607,
+              0.015296428,
+              -0.0057196384,
+              -7.8588724e-05,
+              0.060303975,
+              0.06299601,
+              0.028222265,
+              -0.0071411408,
+              0.015196491,
+              0.02031155,
+              0.039635558,
+              0.079736926,
+              0.008736669,
+              -0.023079613,
+              -0.04490686,
+              -0.021764707,
+              -0.015199573,
+              0.036019534,
+              -0.0046079857,
+              0.04429082,
+              -0.04291344,
+              -0.05991891,
+              -0.006501417,
+              0.010603077,
+              0.03435066,
+              -0.065568395,
+              -0.04424192,
+              0.035055783,
+              0.019717937,
+              0.032764338,
+              0.021240309,
+              -0.01646063,
+              0.007835414,
+              0.06857148,
+              -0.013750999,
+              0.028333688,
+              -0.078255735,
+              -0.047899257,
+              -0.0006370693,
+              0.012606231,
+              0.012178417,
+              -0.013057751,
+              -0.008095854,
+              -0.013466724,
+              0.019036459,
+              -0.025450038,
+              0.021131655,
+              -0.02505666,
+              0.012961284,
+              0.0004236046,
+              -0.023920864,
+              -0.055114083,
+              0.082351916,
+              0.028973032,
+              0.025259241,
+              0.098259576,
+              -0.007385416,
+              0.003546012,
+              -0.05316339,
+              -0.04186183,
+              0.043638214,
+              -0.069299474,
+              -0.013284585,
+              -0.010019175,
+              0.012883975,
+              0.014200739,
+              -0.013508286,
+              0.0086570075,
+              -0.020393575,
+              0.10617594,
+              0.028786503,
+              -0.018674662,
+              0.026763268,
+              -0.0062548965,
+              -0.07215284,
+              0.055464335,
+              0.0029595464,
+              -0.009364344,
+              -0.096402094,
+              0.02823341,
+              -0.022853011,
+              0.04750492,
+              0.008378555,
+              0.016491622,
+              0.01860681,
+              0.048116222,
+              0.106049344,
+              -0.028929656,
+              -0.008896546,
+              0.033615295,
+              -0.0070807124,
+              -0.05684197,
+              -0.061439563,
+              0.0060220268,
+              0.046171866,
+              -0.01574131,
+              -0.07562956,
+              0.0024098414,
+              0.0006304895,
+              -0.07831614,
+              0.060869616,
+              0.00076000375,
+              -0.008209363,
+              -0.04139266,
+              -0.085268535,
+              -0.028194478,
+              -0.024567788,
+              -0.04218179,
+              0.023546752,
+              0.036236234,
+              0.017199656,
+              -0.03315456,
+              -0.023814544,
+              0.038755447,
+              -0.023165299,
+              -0.049283065,
+              -0.006907019,
+              0.040826146,
+              0.017533792,
+              -0.036849793,
+              -0.015506943,
+              -0.010768763,
+              -0.08758806,
+              -0.0295733,
+              0.055843282,
+              -0.012555046,
+              0.0076235603,
+              0.008802991,
+              0.026661193,
+              -0.023899797,
+              0.043548774,
+              -0.034339137,
+              -0.027354732,
+              -0.07583677,
+              0.020500224,
+              0.036802996,
+              0.031019075,
+              0.04605757,
+              -0.004433706,
+              0.0108612785,
+              0.050121468,
+              -0.07816735,
+              -0.014776514,
+              -0.04565195,
+              -0.0036854912,
+              0.0075577567,
+              -0.017044865,
+              0.030597543,
+              -0.013623054,
+              -0.0648466,
+              -0.0318741,
+              -0.059455115,
+              -0.024783187,
+              -0.0088010235,
+              0.11127796,
+              0.03429834,
+              -0.010424589,
+              -0.06355135,
+              0.034265812,
+              0.02680333,
+              -0.007930513,
+              0.030092249,
+              0.008321974,
+              0.03125566,
+              -0.06832331,
+              -0.0076806936,
+              0.034010306,
+              -0.087202646,
+              -0.047684345,
+              0.06384632,
+              -0.026591811,
+              -0.0016003181,
+              0.05721666,
+              -0.0024700803,
+              -0.029714238,
+              0.07761957,
+              -0.04561395,
+              -0.053199258,
+              0.030417573,
+              -0.01958724,
+              0.0012449475,
+              -0.04003076,
+              0.08825553,
+              -0.023196172,
+              -0.08629044,
+              -0.049815316,
+              0.027229005,
+              0.0021765123,
+              0.03438692,
+              -0.09314263,
+              -0.019655729,
+              0.018762926,
+              0.025670087,
+              -0.017116003,
+              0.031716976,
+              -0.05509443,
+              0.032953184,
+              -0.02264915,
+              0.04861606,
+              -0.050201602,
+              0.033154316,
+              0.009971947,
+              -0.037610047,
+              0.016600395,
+              -0.031037569,
+              -0.015495428,
+              0.026365642,
+              -0.043527953,
+              0.055781424,
+              0.06780075,
+              -0.015966192,
+              0.03201043,
+              0.028026119
+            ],
+            "index": 0,
+            "object": "embedding"
+          }
+        ],
+        "model": "togethercomputer/m2-bert-80M-32k-retrieval",
+        "object": "list",
+        "usage": null
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/7ef63231b9f8.json
+++ b/tests/integration/recordings/responses/7ef63231b9f8.json
@ -0,0 +1,56 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:8080/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "Qwen/Qwen3-0.6B",
+      "messages": [
+        {
+          "role": "user",
+          "content": "Which planet has rings around it with a name starting with letter S?"
+        }
+      ],
+      "stream": false
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "Qwen/Qwen3-0.6B"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "<think>\nOkay, so the user is asking which planet has rings around it and its name starts with the letter S. Let me think... I know that the Sun is a star, not a planet. So the Moon is a natural satellite, which has the Moon's name and rings. But the Moon's name starts with M, not S. The Earth has the name Earth, but the rings aren't really around the Earth in any real sense. Mars has a thin ring of dust. Venus and Mercury don't have rings in the sense of planetary rings as we know. Wait, maybe the answer is the Moon, even though it's not the same as the name starting with S. But the question says a planet, so if there's a planet named S, that would be it. But actually, the only planet with rings is Jupiter. Wait, Jupiter has a famous system of rings. But why does the question mention a planet with a name starting with S? Maybe there's a trick. Let me double-check. Jupiter's name starts with J, so maybe the answer is Venus? But Venus doesn't have rings. Mercury, too, doesn't. The Moon, as a planet, a dwarf planet, and has rings. Despite the name, the rings are around it. So the answer would be the Moon. Therefore, the planet with rings and name starting with S is the Moon.\n</think>\n\nThe planet with rings around it and a name starting with the letter **S** is the **Moon**. Though its name doesn't start with an **S**, it is technically a dwarf planet and has the rings in its orbit. Oops Saturn!",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 1757550394,
+        "model": "Qwen/Qwen3-0.6B",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "3.3.5-dev0-sha-1b90c50",
+        "usage": {
+          "completion_tokens": 336,
+          "prompt_tokens": 22,
+          "total_tokens": 358,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
--- a/tests/integration/recordings/responses/894fdacb1cfa.json
+++ b/tests/integration/recordings/responses/894fdacb1cfa.json
@ -0,0 +1,176 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "https://api.together.xyz/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+      "messages": [
+        {
+          "role": "user",
+          "content": "What's the weather in Tokyo? Use the get_weather function to get the weather."
+        }
+      ],
+      "stream": true,
+      "tools": [
+        {
+          "type": "function",
+          "function": {
+            "name": "get_weather",
+            "description": "Get the weather in a given city",
+            "parameters": {
+              "type": "object",
+              "properties": {
+                "city": {
+                  "type": "string",
+                  "description": "The city to get the weather for"
+                }
+              }
+            }
+          }
+        }
+      ]
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"
+  },
+  "response": {
+    "body": [
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtX7R-62bZhn-9801a22f6ad243dc",
+          "choices": [
+            {
+              "delta": {
+                "content": null,
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1758039022,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtX7R-62bZhn-9801a22f6ad243dc",
+          "choices": [
+            {
+              "delta": {
+                "content": null,
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": [
+                  {
+                    "index": 0,
+                    "id": "call_jy63yt7kp8hfof3sy4pim94o",
+                    "function": {
+                      "arguments": "",
+                      "name": "get_weather"
+                    },
+                    "type": "function"
+                  }
+                ]
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1758039022,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtX7R-62bZhn-9801a22f6ad243dc",
+          "choices": [
+            {
+              "delta": {
+                "content": null,
+                "function_call": null,
+                "refusal": null,
+                "role": null,
+                "tool_calls": [
+                  {
+                    "index": 0,
+                    "id": null,
+                    "function": {
+                      "arguments": "{\"city\":\"Tokyo\"}",
+                      "name": null
+                    },
+                    "type": null
+                  }
+                ]
+              },
+              "finish_reason": null,
+              "index": 0,
+              "logprobs": null
+            }
+          ],
+          "created": 1758039022,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": null
+        }
+      },
+      {
+        "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
+        "__data__": {
+          "id": "oBUtX7R-62bZhn-9801a22f6ad243dc",
+          "choices": [
+            {
+              "delta": {
+                "content": "",
+                "function_call": null,
+                "refusal": null,
+                "role": "assistant",
+                "tool_calls": null,
+                "token_id": 128008
+              },
+              "finish_reason": "tool_calls",
+              "index": 0,
+              "logprobs": null,
+              "text": "",
+              "seed": 1489065696184500700
+            }
+          ],
+          "created": 1758039022,
+          "model": "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+          "object": "chat.completion.chunk",
+          "service_tier": null,
+          "system_fingerprint": null,
+          "usage": {
+            "completion_tokens": 24,
+            "prompt_tokens": 193,
+            "total_tokens": 217,
+            "completion_tokens_details": null,
+            "prompt_tokens_details": null,
+            "cached_tokens": 0
+          }
+        }
+      }
+    ],
+    "is_streaming": true
+  }
+}
--- a/Show more
+++ b/Show more