commit e77b7a127c
Author: Bill Murdock <bmurdock@redhat.com>
Date: 2025-10-06 16:19:57 -04:00
Signed-off-by: Bill Murdock <bmurdock@redhat.com>

854 changed files with 165238 additions and 99099 deletions

.github/CODEOWNERS

@ -2,4 +2,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1
* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1 @franciscojavierarceo

.github/TRIAGERS.md

@ -1,2 +1 @@
# This file documents Triage members in the Llama Stack community
@franciscojavierarceo


@ -12,6 +12,7 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suites from tests/integration in replay mode |
| Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
| Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
| Pre-commit Bot | [precommit-trigger.yml](precommit-trigger.yml) | Pre-commit bot for PR |
| Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |
| Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
| Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |


@ -42,18 +42,27 @@ jobs:
run-replay-mode-tests:
runs-on: ubuntu-latest
name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.setup, matrix.python-version, matrix.client-version, matrix.suite) }}
name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.config.setup, matrix.python-version, matrix.client-version, matrix.config.suite) }}
strategy:
fail-fast: false
matrix:
client-type: [library, server]
# Use vllm on weekly schedule, otherwise use test-setup input (defaults to ollama)
setup: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-setup || 'ollama')) }}
# Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
suite: [base, vision]
# Define (setup, suite) pairs - they are always matched and cannot be independent
# Weekly schedule (Sun 1 AM): vllm+base
# Input test-setup=ollama-vision: ollama-vision+vision
# Default (including test-setup=ollama): both ollama+base and ollama-vision+vision
config: >-
${{
github.event.schedule == '1 0 * * 0'
&& fromJSON('[{"setup": "vllm", "suite": "base"}]')
|| github.event.inputs.test-setup == 'ollama-vision'
&& fromJSON('[{"setup": "ollama-vision", "suite": "vision"}]')
|| fromJSON('[{"setup": "ollama", "suite": "base"}, {"setup": "ollama-vision", "suite": "vision"}]')
}}
steps:
- name: Checkout repository
@ -64,14 +73,14 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
client-version: ${{ matrix.client-version }}
setup: ${{ matrix.setup }}
suite: ${{ matrix.suite }}
setup: ${{ matrix.config.setup }}
suite: ${{ matrix.config.suite }}
inference-mode: 'replay'
- name: Run tests
uses: ./.github/actions/run-and-record-tests
with:
stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
setup: ${{ matrix.setup }}
setup: ${{ matrix.config.setup }}
inference-mode: 'replay'
suite: ${{ matrix.suite }}
suite: ${{ matrix.config.suite }}
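
The ternary-style `&&`/`||` chain above picks exactly one JSON list of (setup, suite) pairs depending on the trigger. A minimal Python sketch of that selection logic, for readers less familiar with GitHub Actions expressions (the function name and its arguments are illustrative, not part of the workflow):

```python
def resolve_matrix_config(schedule: str | None, test_setup: str | None) -> list[dict[str, str]]:
    """Mirror of the workflow's matrix.config expression (illustrative only)."""
    if schedule == "1 0 * * 0":
        # Weekly schedule (Sun 1 AM): vllm + base
        return [{"setup": "vllm", "suite": "base"}]
    if test_setup == "ollama-vision":
        # Explicit ollama-vision input: vision suite only
        return [{"setup": "ollama-vision", "suite": "vision"}]
    # Default (including test-setup=ollama): both pairs
    return [
        {"setup": "ollama", "suite": "base"},
        {"setup": "ollama-vision", "suite": "vision"},
    ]
```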

.github/workflows/precommit-trigger.yml (new file)

@ -0,0 +1,227 @@
name: Pre-commit Bot
run-name: Pre-commit bot for PR #${{ github.event.issue.number }}
on:
issue_comment:
types: [created]
jobs:
pre-commit:
# Only run on pull request comments
if: github.event.issue.pull_request && contains(github.event.comment.body, '@github-actions run precommit')
runs-on: ubuntu-latest
permissions:
contents: write
pull-requests: write
steps:
- name: Check comment author and get PR details
id: check_author
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
// Get PR details
const pr = await github.rest.pulls.get({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: context.issue.number
});
// Check if commenter has write access or is the PR author
const commenter = context.payload.comment.user.login;
const prAuthor = pr.data.user.login;
let hasPermission = false;
// Check if commenter is PR author
if (commenter === prAuthor) {
hasPermission = true;
console.log(`Comment author ${commenter} is the PR author`);
} else {
// Check if commenter has write/admin access
try {
const permission = await github.rest.repos.getCollaboratorPermissionLevel({
owner: context.repo.owner,
repo: context.repo.repo,
username: commenter
});
const level = permission.data.permission;
hasPermission = ['write', 'admin', 'maintain'].includes(level);
console.log(`Comment author ${commenter} has permission: ${level}`);
} catch (error) {
console.log(`Could not check permissions for ${commenter}: ${error.message}`);
}
}
if (!hasPermission) {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: `❌ @${commenter} You don't have permission to trigger pre-commit. Only PR authors or repository collaborators can run this command.`
});
core.setFailed(`User ${commenter} does not have permission`);
return;
}
// Save PR info for later steps
core.setOutput('pr_number', context.issue.number);
core.setOutput('pr_head_ref', pr.data.head.ref);
core.setOutput('pr_head_sha', pr.data.head.sha);
core.setOutput('pr_head_repo', pr.data.head.repo.full_name);
core.setOutput('pr_base_ref', pr.data.base.ref);
core.setOutput('is_fork', pr.data.head.repo.full_name !== context.payload.repository.full_name);
core.setOutput('authorized', 'true');
- name: React to comment
if: steps.check_author.outputs.authorized == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.reactions.createForIssueComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: context.payload.comment.id,
content: 'rocket'
});
- name: Comment starting
if: steps.check_author.outputs.authorized == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `⏳ Running pre-commit hooks on PR #${{ steps.check_author.outputs.pr_number }}...`
});
- name: Checkout PR branch (same-repo)
if: steps.check_author.outputs.authorized == 'true' && steps.check_author.outputs.is_fork == 'false'
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
ref: ${{ steps.check_author.outputs.pr_head_ref }}
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
- name: Checkout PR branch (fork)
if: steps.check_author.outputs.authorized == 'true' && steps.check_author.outputs.is_fork == 'true'
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: ${{ steps.check_author.outputs.pr_head_repo }}
ref: ${{ steps.check_author.outputs.pr_head_ref }}
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
- name: Verify checkout
if: steps.check_author.outputs.authorized == 'true'
run: |
echo "Current SHA: $(git rev-parse HEAD)"
echo "Expected SHA: ${{ steps.check_author.outputs.pr_head_sha }}"
if [[ "$(git rev-parse HEAD)" != "${{ steps.check_author.outputs.pr_head_sha }}" ]]; then
echo "::error::Checked out SHA does not match expected SHA"
exit 1
fi
- name: Set up Python
if: steps.check_author.outputs.authorized == 'true'
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: '3.12'
cache: pip
cache-dependency-path: |
**/requirements*.txt
.pre-commit-config.yaml
- name: Set up Node.js
if: steps.check_author.outputs.authorized == 'true'
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
with:
node-version: '20'
cache: 'npm'
cache-dependency-path: 'llama_stack/ui/'
- name: Install npm dependencies
if: steps.check_author.outputs.authorized == 'true'
run: npm ci
working-directory: llama_stack/ui
- name: Run pre-commit
if: steps.check_author.outputs.authorized == 'true'
id: precommit
uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
continue-on-error: true
env:
SKIP: no-commit-to-branch
RUFF_OUTPUT_FORMAT: github
- name: Check for changes
if: steps.check_author.outputs.authorized == 'true'
id: changes
run: |
if ! git diff --exit-code || [ -n "$(git ls-files --others --exclude-standard)" ]; then
echo "has_changes=true" >> $GITHUB_OUTPUT
echo "Changes detected after pre-commit"
else
echo "has_changes=false" >> $GITHUB_OUTPUT
echo "No changes after pre-commit"
fi
- name: Commit and push changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'true'
run: |
git config --local user.email "github-actions[bot]@users.noreply.github.com"
git config --local user.name "github-actions[bot]"
git add -A
git commit -m "style: apply pre-commit fixes
🤖 Applied by @github-actions bot via pre-commit workflow"
# Push changes
git push origin HEAD:${{ steps.check_author.outputs.pr_head_ref }}
- name: Comment success with changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `✅ Pre-commit hooks completed successfully!\n\n🔧 Changes have been committed and pushed to the PR branch.`
});
- name: Comment success without changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'false' && steps.precommit.outcome == 'success'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `✅ Pre-commit hooks passed!\n\n✨ No changes needed - your code is already formatted correctly.`
});
- name: Comment failure
if: failure()
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `❌ Pre-commit workflow failed!\n\nPlease check the [workflow logs](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}) for details.`
});


@ -112,7 +112,7 @@ jobs:
fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then
echo "Entrypoint is not correct"
exit 1
fi
@ -150,7 +150,7 @@ jobs:
fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then
echo "Entrypoint is not correct"
exit 1
fi


@ -24,7 +24,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install uv
uses: astral-sh/setup-uv@b75a909f75acd358c2196fb9a5f1299a9a8868a4 # v6.7.0
uses: astral-sh/setup-uv@d0cc045d04ccac9d8b7881df0226f9e82c39688e # v6.8.0
with:
python-version: ${{ matrix.python-version }}
activate-environment: true


@ -7,7 +7,7 @@
[![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
[![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)
[**Quick Start**](https://llamastack.github.io/latest/getting_started/index.html) | [**Documentation**](https://llamastack.github.io/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
[**Quick Start**](https://llamastack.github.io/docs/getting_started/quickstart) | [**Documentation**](https://llamastack.github.io/docs) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
### ✨🎉 Llama 4 Support 🎉✨


@ -187,21 +187,21 @@ Configure telemetry behavior using environment variables:
- **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string)
- **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `console,sqlite`)
## Visualization with Jaeger
### Quick Setup: Complete Telemetry Stack
The `otel_trace` sink works with any service compatible with the OpenTelemetry collector. Traces and metrics use separate endpoints but can share the same collector.
### Starting Jaeger
Start a Jaeger instance with OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686:
Use the automated setup script to launch the complete telemetry stack (Jaeger, OpenTelemetry Collector, Prometheus, and Grafana):
```bash
docker run --pull always --rm --name jaeger \
-p 16686:16686 -p 4318:4318 \
jaegertracing/jaeger:2.1.0
./scripts/telemetry/setup_telemetry.sh
```
Once running, you can visualize traces by navigating to [http://localhost:16686/](http://localhost:16686/).
This sets up:
- **Jaeger UI**: http://localhost:16686 (traces visualization)
- **Prometheus**: http://localhost:9090 (metrics)
- **Grafana**: http://localhost:3000 (dashboards with auto-configured data sources)
- **OTEL Collector**: http://localhost:4318 (OTLP endpoint)
Once running, you can visualize traces by navigating to [Grafana](http://localhost:3000/) and logging in with username `admin` and password `admin`.
## Querying Metrics


@ -357,7 +357,7 @@ server:
8. Run the server:
```bash
python -m llama_stack.core.server.server --yaml-config ~/.llama/run-byoa.yaml
llama stack run ~/.llama/run-byoa.yaml
```
9. Test the API:


@ -170,7 +170,7 @@ spec:
- name: llama-stack
image: localhost/llama-stack-run-k8s:latest
imagePullPolicy: IfNotPresent
command: ["python", "-m", "llama_stack.core.server.server", "--config", "/app/config.yaml"]
command: ["llama", "stack", "run", "/app/config.yaml"]
ports:
- containerPort: 5000
volumeMounts:


@ -52,7 +52,7 @@ spec:
value: "${SAFETY_MODEL}"
- name: TAVILY_SEARCH_API_KEY
value: "${TAVILY_SEARCH_API_KEY}"
command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8321"]
command: ["llama", "stack", "run", "/etc/config/stack_run_config.yaml", "--port", "8321"]
ports:
- containerPort: 8321
volumeMounts:


@ -1,4 +1,7 @@
---
description: "Files
This API is used to upload documents that can be used with other Llama Stack APIs."
sidebar_label: Files
title: Files
---
@ -7,4 +10,8 @@ title: Files
## Overview
Files
This API is used to upload documents that can be used with other Llama Stack APIs.
This section contains documentation for all available providers for the **files** API.


@ -1,5 +1,7 @@
---
description: "Llama Stack Inference API for generating completions, chat completions, and embeddings.
description: "Inference
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
@ -12,7 +14,9 @@ title: Inference
## Overview
Llama Stack Inference API for generating completions, chat completions, and embeddings.
Inference
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.


@ -15,7 +15,7 @@ Databricks inference provider for running models on Databricks' unified analytic
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `url` | `<class 'str'>` | No | | The URL for the Databricks model serving endpoint |
| `url` | `str \| None` | No | | The URL for the Databricks model serving endpoint |
| `api_token` | `<class 'pydantic.types.SecretStr'>` | No | | The Databricks API token |
## Sample Configuration


@ -1,4 +1,7 @@
---
description: "Safety
OpenAI-compatible Moderations API."
sidebar_label: Safety
title: Safety
---
@ -7,4 +10,8 @@ title: Safety
## Overview
Safety
OpenAI-compatible Moderations API.
This section contains documentation for all available providers for the **safety** API.


@ -50,6 +50,7 @@ from .specification import (
Document,
Example,
ExampleRef,
ExtraBodyParameter,
MediaType,
Operation,
Parameter,
@ -677,6 +678,27 @@ class Generator:
# parameters passed anywhere
parameters = path_parameters + query_parameters
# Build extra body parameters documentation
extra_body_parameters = []
for param_name, param_type, description in op.extra_body_params:
if is_type_optional(param_type):
inner_type: type = unwrap_optional_type(param_type)
required = False
else:
inner_type = param_type
required = True
# Use description from ExtraBodyField if available, otherwise from docstring
param_description = description or doc_params.get(param_name)
extra_body_param = ExtraBodyParameter(
name=param_name,
schema=self.schema_builder.classdef_to_ref(inner_type),
description=param_description,
required=required,
)
extra_body_parameters.append(extra_body_param)
webmethod = getattr(op.func_ref, "__webmethod__", None)
raw_bytes_request_body = False
if webmethod:
@ -898,6 +920,7 @@ class Generator:
deprecated=getattr(op.webmethod, "deprecated", False)
or "DEPRECATED" in op.func_name,
security=[] if op.public else None,
extraBodyParameters=extra_body_parameters if extra_body_parameters else None,
)
def _get_api_stability_priority(self, api_level: str) -> int:


@ -19,10 +19,12 @@ from llama_stack.strong_typing.inspection import get_signature
from typing import get_origin, get_args
from fastapi import UploadFile
from fastapi import UploadFile
from fastapi.params import File, Form
from typing import Annotated
from llama_stack.schema_utils import ExtraBodyField
def split_prefix(
s: str, sep: str, prefix: Union[str, Iterable[str]]
@ -89,6 +91,7 @@ class EndpointOperation:
:param query_params: Parameters of the operation signature that are passed in the query string as `key=value` pairs.
:param request_params: The parameter that corresponds to the data transmitted in the request body.
:param multipart_params: Parameters that indicate multipart/form-data request body.
:param extra_body_params: Parameters that arrive via extra_body and are documented but not in SDK.
:param event_type: The Python type of the data that is transmitted out-of-band (e.g. via websockets) while the operation is in progress.
:param response_type: The Python type of the data that is transmitted in the response body.
:param http_method: The HTTP method used to invoke the endpoint such as POST, GET or PUT.
@ -106,6 +109,7 @@ class EndpointOperation:
query_params: List[OperationParameter]
request_params: Optional[OperationParameter]
multipart_params: List[OperationParameter]
extra_body_params: List[tuple[str, type, str | None]]
event_type: Optional[type]
response_type: type
http_method: HTTPMethod
@ -265,6 +269,7 @@ def get_endpoint_operations(
query_params = []
request_params = []
multipart_params = []
extra_body_params = []
for param_name, parameter in signature.parameters.items():
param_type = _get_annotation_type(parameter.annotation, func_ref)
@ -279,6 +284,13 @@ def get_endpoint_operations(
f"parameter '{param_name}' in function '{func_name}' has no type annotation"
)
# Check if this is an extra_body parameter
is_extra_body, extra_body_desc = _is_extra_body_param(param_type)
if is_extra_body:
# Store in a separate list for documentation
extra_body_params.append((param_name, param_type, extra_body_desc))
continue # Skip adding to request_params
is_multipart = _is_multipart_param(param_type)
if prefix in ["get", "delete"]:
@ -351,6 +363,7 @@ def get_endpoint_operations(
query_params=query_params,
request_params=request_params,
multipart_params=multipart_params,
extra_body_params=extra_body_params,
event_type=event_type,
response_type=response_type,
http_method=http_method,
@ -403,7 +416,7 @@ def get_endpoint_events(endpoint: type) -> Dict[str, type]:
def _is_multipart_param(param_type: type) -> bool:
"""
Check if a parameter type indicates multipart form data.
Returns True if the type is:
- UploadFile
- Annotated[UploadFile, File()]
@ -413,19 +426,38 @@ def _is_multipart_param(param_type: type) -> bool:
"""
if param_type is UploadFile:
return True
# Check for Annotated types
origin = get_origin(param_type)
if origin is None:
return False
if origin is Annotated:
args = get_args(param_type)
if len(args) < 2:
return False
# Check the annotations for File() or Form()
for annotation in args[1:]:
if isinstance(annotation, (File, Form)):
return True
return False
def _is_extra_body_param(param_type: type) -> tuple[bool, str | None]:
"""
Check if parameter is marked as coming from extra_body.
Returns:
(is_extra_body, description): Tuple of boolean and optional description
"""
origin = get_origin(param_type)
if origin is Annotated:
args = get_args(param_type)
for annotation in args[1:]:
if isinstance(annotation, ExtraBodyField):
return True, annotation.description
# Also check by type name for cases where import matters
if type(annotation).__name__ == 'ExtraBodyField':
return True, getattr(annotation, 'description', None)
return False, None
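
For context, a hedged sketch of how an endpoint could declare a parameter so that `_is_extra_body_param` detects it and `get_endpoint_operations` routes it into `extra_body_params` rather than the request body. Only `ExtraBodyField` and the `Annotated` pattern come from this diff; the class, the method, and the assumption that `ExtraBodyField` takes the description as its first argument are illustrative:

```python
from typing import Annotated

from llama_stack.schema_utils import ExtraBodyField


class ResponsesAPI:  # hypothetical endpoint class
    async def create_openai_response(
        self,
        input: str,
        model: str,
        # Documented under x-llama-stack-extra-body-params but excluded from the
        # generated SDK signature; clients supply it via the request's extra body.
        shields: Annotated[
            list[str] | None,
            ExtraBodyField("List of shields to apply during response generation."),
        ] = None,
    ): ...
```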


@ -106,6 +106,15 @@ class Parameter:
example: Optional[Any] = None
@dataclass
class ExtraBodyParameter:
"""Represents a parameter that arrives via extra_body in the request."""
name: str
schema: SchemaOrRef
description: Optional[str] = None
required: Optional[bool] = None
@dataclass
class Operation:
responses: Dict[str, Union[Response, ResponseRef]]
@ -118,6 +127,7 @@ class Operation:
callbacks: Optional[Dict[str, "Callback"]] = None
security: Optional[List["SecurityRequirement"]] = None
deprecated: Optional[bool] = None
extraBodyParameters: Optional[List[ExtraBodyParameter]] = None
@dataclass


@ -52,6 +52,17 @@ class Specification:
if display_name:
tag["x-displayName"] = display_name
# Handle operations to rename extraBodyParameters -> x-llama-stack-extra-body-params
paths = json_doc.get("paths", {})
for path_item in paths.values():
if isinstance(path_item, dict):
for method in ["get", "post", "put", "delete", "patch"]:
operation = path_item.get(method)
if operation and isinstance(operation, dict):
extra_body_params = operation.pop("extraBodyParameters", None)
if extra_body_params:
operation["x-llama-stack-extra-body-params"] = extra_body_params
return json_doc
def get_json_string(self, pretty_print: bool = False) -> str:
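
Because parameters tagged this way are documented under `x-llama-stack-extra-body-params` instead of the SDK method signature, a caller passes them through the extra request body. A minimal sketch, assuming an OpenAI-compatible Python client that accepts an `extra_body` keyword; the base URL, API key, model id, and shield name are placeholders:

```python
from openai import OpenAI

# Point an OpenAI-compatible client at a Llama Stack server (placeholder URL/key).
client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

response = client.responses.create(
    model="llama3.2:3b",  # placeholder model id
    input="Tell me a short story.",
    # "shields" is an extra-body parameter, so it is sent outside the SDK signature.
    extra_body={"shields": ["llama-guard"]},
)
print(response.output_text)
```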


@ -1443,8 +1443,8 @@
"tags": [
"Inference"
],
"summary": "List all chat completions.",
"description": "List all chat completions.",
"summary": "List chat completions.",
"description": "List chat completions.",
"parameters": [
{
"name": "after",
@ -1520,8 +1520,8 @@
"tags": [
"Inference"
],
"summary": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
"description": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
"summary": "Create chat completions.",
"description": "Create chat completions.\nGenerate an OpenAI-compatible chat completion for the given messages using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -1565,8 +1565,8 @@
"tags": [
"Inference"
],
"summary": "Describe a chat completion by its ID.",
"description": "Describe a chat completion by its ID.",
"summary": "Get chat completion.",
"description": "Get chat completion.\nDescribe a chat completion by its ID.",
"parameters": [
{
"name": "completion_id",
@ -1610,8 +1610,8 @@
"tags": [
"Inference"
],
"summary": "Generate an OpenAI-compatible completion for the given prompt using the specified model.",
"description": "Generate an OpenAI-compatible completion for the given prompt using the specified model.",
"summary": "Create completion.",
"description": "Create completion.\nGenerate an OpenAI-compatible completion for the given prompt using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -1655,8 +1655,8 @@
"tags": [
"Inference"
],
"summary": "Generate OpenAI-compatible embeddings for the given input using the specified model.",
"description": "Generate OpenAI-compatible embeddings for the given input using the specified model.",
"summary": "Create embeddings.",
"description": "Create embeddings.\nGenerate OpenAI-compatible embeddings for the given input using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -1700,8 +1700,8 @@
"tags": [
"Files"
],
"summary": "Returns a list of files that belong to the user's organization.",
"description": "Returns a list of files that belong to the user's organization.",
"summary": "List files.",
"description": "List files.\nReturns a list of files that belong to the user's organization.",
"parameters": [
{
"name": "after",
@ -1770,8 +1770,8 @@
"tags": [
"Files"
],
"summary": "Upload a file that can be used across various endpoints.",
"description": "Upload a file that can be used across various endpoints.\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.",
"summary": "Upload file.",
"description": "Upload file.\nUpload a file that can be used across various endpoints.\n\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.",
"parameters": [],
"requestBody": {
"content": {
@ -1831,8 +1831,8 @@
"tags": [
"Files"
],
"summary": "Returns information about a specific file.",
"description": "Returns information about a specific file.",
"summary": "Retrieve file.",
"description": "Retrieve file.\nReturns information about a specific file.",
"parameters": [
{
"name": "file_id",
@ -1874,8 +1874,8 @@
"tags": [
"Files"
],
"summary": "Delete a file.",
"description": "Delete a file.",
"summary": "Delete file.",
"description": "Delete file.",
"parameters": [
{
"name": "file_id",
@ -1919,8 +1919,8 @@
"tags": [
"Files"
],
"summary": "Returns the contents of the specified file.",
"description": "Returns the contents of the specified file.",
"summary": "Retrieve file content.",
"description": "Retrieve file content.\nReturns the contents of the specified file.",
"parameters": [
{
"name": "file_id",
@ -1999,8 +1999,8 @@
"tags": [
"Safety"
],
"summary": "Classifies if text and/or image inputs are potentially harmful.",
"description": "Classifies if text and/or image inputs are potentially harmful.",
"summary": "Create moderation.",
"description": "Create moderation.\nClassifies if text and/or image inputs are potentially harmful.",
"parameters": [],
"requestBody": {
"content": {
@ -2044,8 +2044,8 @@
"tags": [
"Agents"
],
"summary": "List all OpenAI responses.",
"description": "List all OpenAI responses.",
"summary": "List all responses.",
"description": "List all responses.",
"parameters": [
{
"name": "after",
@ -2119,8 +2119,8 @@
"tags": [
"Agents"
],
"summary": "Create a new OpenAI response.",
"description": "Create a new OpenAI response.",
"summary": "Create a model response.",
"description": "Create a model response.",
"parameters": [],
"requestBody": {
"content": {
@ -2132,7 +2132,27 @@
},
"required": true
},
"deprecated": true
"deprecated": true,
"x-llama-stack-extra-body-params": [
{
"name": "shields",
"schema": {
"type": "array",
"items": {
"oneOf": [
{
"type": "string"
},
{
"$ref": "#/components/schemas/ResponseShieldSpec"
}
]
}
},
"description": "List of shields to apply during response generation. Shields provide safety and content moderation.",
"required": false
}
]
}
},
"/v1/openai/v1/responses/{response_id}": {
@ -2164,8 +2184,8 @@
"tags": [
"Agents"
],
"summary": "Retrieve an OpenAI response by its ID.",
"description": "Retrieve an OpenAI response by its ID.",
"summary": "Get a model response.",
"description": "Get a model response.",
"parameters": [
{
"name": "response_id",
@ -2207,8 +2227,8 @@
"tags": [
"Agents"
],
"summary": "Delete an OpenAI response by its ID.",
"description": "Delete an OpenAI response by its ID.",
"summary": "Delete a response.",
"description": "Delete a response.",
"parameters": [
{
"name": "response_id",
@ -2252,8 +2272,8 @@
"tags": [
"Agents"
],
"summary": "List input items for a given OpenAI response.",
"description": "List input items for a given OpenAI response.",
"summary": "List input items.",
"description": "List input items.",
"parameters": [
{
"name": "response_id",
@ -9521,6 +9541,21 @@
"title": "OpenAIResponseText",
"description": "Text response configuration for OpenAI responses."
},
"ResponseShieldSpec": {
"type": "object",
"properties": {
"type": {
"type": "string",
"description": "The type/identifier of the shield."
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "ResponseShieldSpec",
"description": "Specification for a shield to apply during response generation."
},
"OpenAIResponseInputTool": {
"oneOf": [
{
@ -13331,12 +13366,13 @@
},
{
"name": "Files",
"description": ""
"description": "This API is used to upload documents that can be used with other Llama Stack APIs.",
"x-displayName": "Files"
},
{
"name": "Inference",
"description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Llama Stack Inference API for generating completions, chat completions, and embeddings."
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Inference"
},
{
"name": "Models",
@ -13348,7 +13384,8 @@
},
{
"name": "Safety",
"description": ""
"description": "OpenAI-compatible Moderations API.",
"x-displayName": "Safety"
},
{
"name": "Telemetry",


@ -1033,8 +1033,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: List all chat completions.
description: List all chat completions.
summary: List chat completions.
description: List chat completions.
parameters:
- name: after
in: query
@ -1087,10 +1087,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate an OpenAI-compatible chat completion for the given messages using
the specified model.
summary: Create chat completions.
description: >-
Create chat completions.
Generate an OpenAI-compatible chat completion for the given messages using
the specified model.
parameters: []
@ -1122,8 +1122,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: Describe a chat completion by its ID.
description: Describe a chat completion by its ID.
summary: Get chat completion.
description: >-
Get chat completion.
Describe a chat completion by its ID.
parameters:
- name: completion_id
in: path
@ -1153,10 +1156,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate an OpenAI-compatible completion for the given prompt using the specified
model.
summary: Create completion.
description: >-
Create completion.
Generate an OpenAI-compatible completion for the given prompt using the specified
model.
parameters: []
@ -1189,10 +1192,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate OpenAI-compatible embeddings for the given input using the specified
model.
summary: Create embeddings.
description: >-
Create embeddings.
Generate OpenAI-compatible embeddings for the given input using the specified
model.
parameters: []
@ -1225,9 +1228,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns a list of files that belong to the user's organization.
summary: List files.
description: >-
List files.
Returns a list of files that belong to the user's organization.
parameters:
- name: after
@ -1285,11 +1289,13 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Upload a file that can be used across various endpoints.
summary: Upload file.
description: >-
Upload file.
Upload a file that can be used across various endpoints.
The file upload should be a multipart form request with:
- file: The File object (not file name) to be uploaded.
@ -1338,9 +1344,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns information about a specific file.
summary: Retrieve file.
description: >-
Retrieve file.
Returns information about a specific file.
parameters:
- name: file_id
@ -1372,8 +1379,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: Delete a file.
description: Delete a file.
summary: Delete file.
description: Delete file.
parameters:
- name: file_id
in: path
@ -1405,9 +1412,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns the contents of the specified file.
summary: Retrieve file content.
description: >-
Retrieve file content.
Returns the contents of the specified file.
parameters:
- name: file_id
@ -1464,9 +1472,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Safety
summary: >-
Classifies if text and/or image inputs are potentially harmful.
summary: Create moderation.
description: >-
Create moderation.
Classifies if text and/or image inputs are potentially harmful.
parameters: []
requestBody:
@ -1497,8 +1506,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: List all OpenAI responses.
description: List all OpenAI responses.
summary: List all responses.
description: List all responses.
parameters:
- name: after
in: query
@ -1549,8 +1558,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Create a new OpenAI response.
description: Create a new OpenAI response.
summary: Create a model response.
description: Create a model response.
parameters: []
requestBody:
content:
@ -1559,6 +1568,18 @@ paths:
$ref: '#/components/schemas/CreateOpenaiResponseRequest'
required: true
deprecated: true
x-llama-stack-extra-body-params:
- name: shields
schema:
type: array
items:
oneOf:
- type: string
- $ref: '#/components/schemas/ResponseShieldSpec'
description: >-
List of shields to apply during response generation. Shields provide safety
and content moderation.
required: false
/v1/openai/v1/responses/{response_id}:
get:
responses:
@ -1580,8 +1601,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Retrieve an OpenAI response by its ID.
description: Retrieve an OpenAI response by its ID.
summary: Get a model response.
description: Get a model response.
parameters:
- name: response_id
in: path
@ -1611,8 +1632,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Delete an OpenAI response by its ID.
description: Delete an OpenAI response by its ID.
summary: Delete a response.
description: Delete a response.
parameters:
- name: response_id
in: path
@ -1642,10 +1663,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: >-
List input items for a given OpenAI response.
description: >-
List input items for a given OpenAI response.
summary: List input items.
description: List input items.
parameters:
- name: response_id
in: path
@ -7076,6 +7095,18 @@ components:
title: OpenAIResponseText
description: >-
Text response configuration for OpenAI responses.
ResponseShieldSpec:
type: object
properties:
type:
type: string
description: The type/identifier of the shield.
additionalProperties: false
required:
- type
title: ResponseShieldSpec
description: >-
Specification for a shield to apply during response generation.
OpenAIResponseInputTool:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch'
@ -9987,9 +10018,16 @@ tags:
x-displayName: >-
Llama Stack Evaluation API for running evaluations on model and agent candidates.
- name: Files
description: ''
description: >-
This API is used to upload documents that can be used with other Llama Stack
APIs.
x-displayName: Files
- name: Inference
description: >-
Llama Stack Inference API for generating completions, chat completions, and
embeddings.
This API provides the raw interface to the underlying models. Two kinds of models
are supported:
@ -9997,15 +10035,14 @@ tags:
- Embedding models: these models generate embeddings to be used for semantic
search.
x-displayName: >-
Llama Stack Inference API for generating completions, chat completions, and
embeddings.
x-displayName: Inference
- name: Models
description: ''
- name: PostTraining (Coming Soon)
description: ''
- name: Safety
description: ''
description: OpenAI-compatible Moderations API.
x-displayName: Safety
- name: Telemetry
description: ''
- name: VectorIO


@ -69,8 +69,8 @@
"tags": [
"Inference"
],
"summary": "List all chat completions.",
"description": "List all chat completions.",
"summary": "List chat completions.",
"description": "List chat completions.",
"parameters": [
{
"name": "after",
@ -146,8 +146,8 @@
"tags": [
"Inference"
],
"summary": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
"description": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
"summary": "Create chat completions.",
"description": "Create chat completions.\nGenerate an OpenAI-compatible chat completion for the given messages using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -191,8 +191,8 @@
"tags": [
"Inference"
],
"summary": "Describe a chat completion by its ID.",
"description": "Describe a chat completion by its ID.",
"summary": "Get chat completion.",
"description": "Get chat completion.\nDescribe a chat completion by its ID.",
"parameters": [
{
"name": "completion_id",
@ -236,8 +236,8 @@
"tags": [
"Inference"
],
"summary": "Generate an OpenAI-compatible completion for the given prompt using the specified model.",
"description": "Generate an OpenAI-compatible completion for the given prompt using the specified model.",
"summary": "Create completion.",
"description": "Create completion.\nGenerate an OpenAI-compatible completion for the given prompt using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -758,8 +758,8 @@
"tags": [
"Inference"
],
"summary": "Generate OpenAI-compatible embeddings for the given input using the specified model.",
"description": "Generate OpenAI-compatible embeddings for the given input using the specified model.",
"summary": "Create embeddings.",
"description": "Create embeddings.\nGenerate OpenAI-compatible embeddings for the given input using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -803,8 +803,8 @@
"tags": [
"Files"
],
"summary": "Returns a list of files that belong to the user's organization.",
"description": "Returns a list of files that belong to the user's organization.",
"summary": "List files.",
"description": "List files.\nReturns a list of files that belong to the user's organization.",
"parameters": [
{
"name": "after",
@ -873,8 +873,8 @@
"tags": [
"Files"
],
"summary": "Upload a file that can be used across various endpoints.",
"description": "Upload a file that can be used across various endpoints.\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.",
"summary": "Upload file.",
"description": "Upload file.\nUpload a file that can be used across various endpoints.\n\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.",
"parameters": [],
"requestBody": {
"content": {
@ -934,8 +934,8 @@
"tags": [
"Files"
],
"summary": "Returns information about a specific file.",
"description": "Returns information about a specific file.",
"summary": "Retrieve file.",
"description": "Retrieve file.\nReturns information about a specific file.",
"parameters": [
{
"name": "file_id",
@ -977,8 +977,8 @@
"tags": [
"Files"
],
"summary": "Delete a file.",
"description": "Delete a file.",
"summary": "Delete file.",
"description": "Delete file.",
"parameters": [
{
"name": "file_id",
@ -1022,8 +1022,8 @@
"tags": [
"Files"
],
"summary": "Returns the contents of the specified file.",
"description": "Returns the contents of the specified file.",
"summary": "Retrieve file content.",
"description": "Retrieve file content.\nReturns the contents of the specified file.",
"parameters": [
{
"name": "file_id",
@ -1067,8 +1067,8 @@
"tags": [
"Inspect"
],
"summary": "Get the current health status of the service.",
"description": "Get the current health status of the service.",
"summary": "Get health status.",
"description": "Get health status.\nGet the current health status of the service.",
"parameters": [],
"deprecated": false
}
@ -1102,8 +1102,8 @@
"tags": [
"Inspect"
],
"summary": "List all available API routes with their methods and implementing providers.",
"description": "List all available API routes with their methods and implementing providers.",
"summary": "List routes.",
"description": "List routes.\nList all available API routes with their methods and implementing providers.",
"parameters": [],
"deprecated": false
}
@ -1170,8 +1170,8 @@
"tags": [
"Models"
],
"summary": "Register a model.",
"description": "Register a model.",
"summary": "Register model.",
"description": "Register model.\nRegister a model.",
"parameters": [],
"requestBody": {
"content": {
@ -1215,8 +1215,8 @@
"tags": [
"Models"
],
"summary": "Get a model by its identifier.",
"description": "Get a model by its identifier.",
"summary": "Get model.",
"description": "Get model.\nGet a model by its identifier.",
"parameters": [
{
"name": "model_id",
@ -1251,8 +1251,8 @@
"tags": [
"Models"
],
"summary": "Unregister a model.",
"description": "Unregister a model.",
"summary": "Unregister model.",
"description": "Unregister model.\nUnregister a model.",
"parameters": [
{
"name": "model_id",
@ -1296,8 +1296,8 @@
"tags": [
"Safety"
],
"summary": "Classifies if text and/or image inputs are potentially harmful.",
"description": "Classifies if text and/or image inputs are potentially harmful.",
"summary": "Create moderation.",
"description": "Create moderation.\nClassifies if text and/or image inputs are potentially harmful.",
"parameters": [],
"requestBody": {
"content": {
@ -1374,8 +1374,8 @@
"tags": [
"Prompts"
],
"summary": "Create a new prompt.",
"description": "Create a new prompt.",
"summary": "Create prompt.",
"description": "Create prompt.\nCreate a new prompt.",
"parameters": [],
"requestBody": {
"content": {
@ -1419,8 +1419,8 @@
"tags": [
"Prompts"
],
"summary": "Get a prompt by its identifier and optional version.",
"description": "Get a prompt by its identifier and optional version.",
"summary": "Get prompt.",
"description": "Get prompt.\nGet a prompt by its identifier and optional version.",
"parameters": [
{
"name": "prompt_id",
@ -1471,8 +1471,8 @@
"tags": [
"Prompts"
],
"summary": "Update an existing prompt (increments version).",
"description": "Update an existing prompt (increments version).",
"summary": "Update prompt.",
"description": "Update prompt.\nUpdate an existing prompt (increments version).",
"parameters": [
{
"name": "prompt_id",
@ -1517,8 +1517,8 @@
"tags": [
"Prompts"
],
"summary": "Delete a prompt.",
"description": "Delete a prompt.",
"summary": "Delete prompt.",
"description": "Delete prompt.\nDelete a prompt.",
"parameters": [
{
"name": "prompt_id",
@ -1562,8 +1562,8 @@
"tags": [
"Prompts"
],
"summary": "Set which version of a prompt should be the default in get_prompt (latest).",
"description": "Set which version of a prompt should be the default in get_prompt (latest).",
"summary": "Set prompt version.",
"description": "Set prompt version.\nSet which version of a prompt should be the default in get_prompt (latest).",
"parameters": [
{
"name": "prompt_id",
@ -1617,8 +1617,8 @@
"tags": [
"Prompts"
],
"summary": "List all versions of a specific prompt.",
"description": "List all versions of a specific prompt.",
"summary": "List prompt versions.",
"description": "List prompt versions.\nList all versions of a specific prompt.",
"parameters": [
{
"name": "prompt_id",
@ -1662,8 +1662,8 @@
"tags": [
"Providers"
],
"summary": "List all available providers.",
"description": "List all available providers.",
"summary": "List providers.",
"description": "List providers.\nList all available providers.",
"parameters": [],
"deprecated": false
}
@ -1697,8 +1697,8 @@
"tags": [
"Providers"
],
"summary": "Get detailed information about a specific provider.",
"description": "Get detailed information about a specific provider.",
"summary": "Get provider.",
"description": "Get provider.\nGet detailed information about a specific provider.",
"parameters": [
{
"name": "provider_id",
@ -1742,8 +1742,8 @@
"tags": [
"Agents"
],
"summary": "List all OpenAI responses.",
"description": "List all OpenAI responses.",
"summary": "List all responses.",
"description": "List all responses.",
"parameters": [
{
"name": "after",
@ -1817,8 +1817,8 @@
"tags": [
"Agents"
],
"summary": "Create a new OpenAI response.",
"description": "Create a new OpenAI response.",
"summary": "Create a model response.",
"description": "Create a model response.",
"parameters": [],
"requestBody": {
"content": {
@ -1830,7 +1830,27 @@
},
"required": true
},
"deprecated": false
"deprecated": false,
"x-llama-stack-extra-body-params": [
{
"name": "shields",
"schema": {
"type": "array",
"items": {
"oneOf": [
{
"type": "string"
},
{
"$ref": "#/components/schemas/ResponseShieldSpec"
}
]
}
},
"description": "List of shields to apply during response generation. Shields provide safety and content moderation.",
"required": false
}
]
}
},
"/v1/responses/{response_id}": {
@ -1862,8 +1882,8 @@
"tags": [
"Agents"
],
"summary": "Retrieve an OpenAI response by its ID.",
"description": "Retrieve an OpenAI response by its ID.",
"summary": "Get a model response.",
"description": "Get a model response.",
"parameters": [
{
"name": "response_id",
@ -1905,8 +1925,8 @@
"tags": [
"Agents"
],
"summary": "Delete an OpenAI response by its ID.",
"description": "Delete an OpenAI response by its ID.",
"summary": "Delete a response.",
"description": "Delete a response.",
"parameters": [
{
"name": "response_id",
@ -1950,8 +1970,8 @@
"tags": [
"Agents"
],
"summary": "List input items for a given OpenAI response.",
"description": "List input items for a given OpenAI response.",
"summary": "List input items.",
"description": "List input items.",
"parameters": [
{
"name": "response_id",
@ -2043,8 +2063,8 @@
"tags": [
"Safety"
],
"summary": "Run a shield.",
"description": "Run a shield.",
"summary": "Run shield.",
"description": "Run shield.\nRun a shield.",
"parameters": [],
"requestBody": {
"content": {
@ -4176,8 +4196,8 @@
"tags": [
"Inspect"
],
"summary": "Get the version of the service.",
"description": "Get the version of the service.",
"summary": "Get version.",
"description": "Get version.\nGet the version of the service.",
"parameters": [],
"deprecated": false
}
@ -7616,6 +7636,21 @@
"title": "OpenAIResponseText",
"description": "Text response configuration for OpenAI responses."
},
"ResponseShieldSpec": {
"type": "object",
"properties": {
"type": {
"type": "string",
"description": "The type/identifier of the shield."
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "ResponseShieldSpec",
"description": "Specification for a shield to apply during response generation."
},
"OpenAIResponseInputTool": {
"oneOf": [
{
@ -12879,16 +12914,18 @@
},
{
"name": "Files",
"description": ""
"description": "This API is used to upload documents that can be used with other Llama Stack APIs.",
"x-displayName": "Files"
},
{
"name": "Inference",
"description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Llama Stack Inference API for generating completions, chat completions, and embeddings."
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Inference"
},
{
"name": "Inspect",
"description": ""
"description": "APIs for inspecting the Llama Stack service, including health status, available API routes with methods and implementing providers.",
"x-displayName": "Inspect"
},
{
"name": "Models",
@ -12896,17 +12933,18 @@
},
{
"name": "Prompts",
"description": "",
"x-displayName": "Protocol for prompt management operations."
"description": "Protocol for prompt management operations.",
"x-displayName": "Prompts"
},
{
"name": "Providers",
"description": "",
"x-displayName": "Providers API for inspecting, listing, and modifying providers and their configurations."
"description": "Providers API for inspecting, listing, and modifying providers and their configurations.",
"x-displayName": "Providers"
},
{
"name": "Safety",
"description": ""
"description": "OpenAI-compatible Moderations API.",
"x-displayName": "Safety"
},
{
"name": "Scoring",


@ -33,8 +33,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: List all chat completions.
description: List all chat completions.
summary: List chat completions.
description: List chat completions.
parameters:
- name: after
in: query
@ -87,10 +87,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate an OpenAI-compatible chat completion for the given messages using
the specified model.
summary: Create chat completions.
description: >-
Create chat completions.
Generate an OpenAI-compatible chat completion for the given messages using
the specified model.
parameters: []
@ -122,8 +122,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: Describe a chat completion by its ID.
description: Describe a chat completion by its ID.
summary: Get chat completion.
description: >-
Get chat completion.
Describe a chat completion by its ID.
parameters:
- name: completion_id
in: path
@ -153,10 +156,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate an OpenAI-compatible completion for the given prompt using the specified
model.
summary: Create completion.
description: >-
Create completion.
Generate an OpenAI-compatible completion for the given prompt using the specified
model.
parameters: []
@ -603,10 +606,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate OpenAI-compatible embeddings for the given input using the specified
model.
summary: Create embeddings.
description: >-
Create embeddings.
Generate OpenAI-compatible embeddings for the given input using the specified
model.
parameters: []
@ -639,9 +642,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns a list of files that belong to the user's organization.
summary: List files.
description: >-
List files.
Returns a list of files that belong to the user's organization.
parameters:
- name: after
@ -699,11 +703,13 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Upload a file that can be used across various endpoints.
summary: Upload file.
description: >-
Upload file.
Upload a file that can be used across various endpoints.
The file upload should be a multipart form request with:
- file: The File object (not file name) to be uploaded.
@ -752,9 +758,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns information about a specific file.
summary: Retrieve file.
description: >-
Retrieve file.
Returns information about a specific file.
parameters:
- name: file_id
@ -786,8 +793,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: Delete a file.
description: Delete a file.
summary: Delete file.
description: Delete file.
parameters:
- name: file_id
in: path
@ -819,9 +826,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns the contents of the specified file.
summary: Retrieve file content.
description: >-
Retrieve file content.
Returns the contents of the specified file.
parameters:
- name: file_id
@ -854,9 +862,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inspect
summary: >-
Get the current health status of the service.
summary: Get health status.
description: >-
Get health status.
Get the current health status of the service.
parameters: []
deprecated: false
@ -882,9 +891,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inspect
summary: >-
List all available API routes with their methods and implementing providers.
summary: List routes.
description: >-
List routes.
List all available API routes with their methods and implementing providers.
parameters: []
deprecated: false
@ -933,8 +943,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Models
summary: Register a model.
description: Register a model.
summary: Register model.
description: >-
Register model.
Register a model.
parameters: []
requestBody:
content:
@ -964,8 +977,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Models
summary: Get a model by its identifier.
description: Get a model by its identifier.
summary: Get model.
description: >-
Get model.
Get a model by its identifier.
parameters:
- name: model_id
in: path
@ -990,8 +1006,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Models
summary: Unregister a model.
description: Unregister a model.
summary: Unregister model.
description: >-
Unregister model.
Unregister a model.
parameters:
- name: model_id
in: path
@ -1022,9 +1041,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Safety
summary: >-
Classifies if text and/or image inputs are potentially harmful.
summary: Create moderation.
description: >-
Create moderation.
Classifies if text and/or image inputs are potentially harmful.
parameters: []
requestBody:
@ -1080,8 +1100,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: Create a new prompt.
description: Create a new prompt.
summary: Create prompt.
description: >-
Create prompt.
Create a new prompt.
parameters: []
requestBody:
content:
@ -1111,9 +1134,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: >-
Get a prompt by its identifier and optional version.
summary: Get prompt.
description: >-
Get prompt.
Get a prompt by its identifier and optional version.
parameters:
- name: prompt_id
@ -1151,9 +1175,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: >-
Update an existing prompt (increments version).
summary: Update prompt.
description: >-
Update prompt.
Update an existing prompt (increments version).
parameters:
- name: prompt_id
@ -1185,8 +1210,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: Delete a prompt.
description: Delete a prompt.
summary: Delete prompt.
description: >-
Delete prompt.
Delete a prompt.
parameters:
- name: prompt_id
in: path
@ -1217,9 +1245,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: >-
Set which version of a prompt should be the default in get_prompt (latest).
summary: Set prompt version.
description: >-
Set prompt version.
Set which version of a prompt should be the default in get_prompt (latest).
parameters:
- name: prompt_id
@ -1257,8 +1286,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: List all versions of a specific prompt.
description: List all versions of a specific prompt.
summary: List prompt versions.
description: >-
List prompt versions.
List all versions of a specific prompt.
parameters:
- name: prompt_id
in: path
@ -1290,8 +1322,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Providers
summary: List all available providers.
description: List all available providers.
summary: List providers.
description: >-
List providers.
List all available providers.
parameters: []
deprecated: false
/v1/providers/{provider_id}:
@ -1316,9 +1351,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Providers
summary: >-
Get detailed information about a specific provider.
summary: Get provider.
description: >-
Get provider.
Get detailed information about a specific provider.
parameters:
- name: provider_id
@ -1349,8 +1385,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: List all OpenAI responses.
description: List all OpenAI responses.
summary: List all responses.
description: List all responses.
parameters:
- name: after
in: query
@ -1401,8 +1437,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Create a new OpenAI response.
description: Create a new OpenAI response.
summary: Create a model response.
description: Create a model response.
parameters: []
requestBody:
content:
@ -1411,6 +1447,18 @@ paths:
$ref: '#/components/schemas/CreateOpenaiResponseRequest'
required: true
deprecated: false
x-llama-stack-extra-body-params:
- name: shields
schema:
type: array
items:
oneOf:
- type: string
- $ref: '#/components/schemas/ResponseShieldSpec'
description: >-
List of shields to apply during response generation. Shields provide safety
and content moderation.
required: false
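Because `shields` is carried as an extra body parameter rather than a first-class request field, OpenAI-compatible clients pass it through their `extra_body` escape hatch. A minimal sketch, assuming a Llama Stack server on localhost, the official `openai` Python client, and made-up model and shield identifiers:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

response = client.responses.create(
    model="llama3.2:3b",  # assumed model ID registered with the stack
    input="Summarize the weather forecast for tomorrow.",
    # shields accepts bare shield IDs and/or ResponseShieldSpec-shaped objects
    extra_body={"shields": ["llama-guard", {"type": "llama-guard-vision"}]},
)
print(response.output_text)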
/v1/responses/{response_id}:
get:
responses:
@ -1432,8 +1480,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Retrieve an OpenAI response by its ID.
description: Retrieve an OpenAI response by its ID.
summary: Get a model response.
description: Get a model response.
parameters:
- name: response_id
in: path
@ -1463,8 +1511,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Delete an OpenAI response by its ID.
description: Delete an OpenAI response by its ID.
summary: Delete a response.
description: Delete a response.
parameters:
- name: response_id
in: path
@ -1494,10 +1542,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: >-
List input items for a given OpenAI response.
description: >-
List input items for a given OpenAI response.
summary: List input items.
description: List input items.
parameters:
- name: response_id
in: path
@ -1566,8 +1612,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Safety
summary: Run a shield.
description: Run a shield.
summary: Run shield.
description: >-
Run shield.
Run a shield.
parameters: []
requestBody:
content:
@ -3123,8 +3172,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inspect
summary: Get the version of the service.
description: Get the version of the service.
summary: Get version.
description: >-
Get version.
Get the version of the service.
parameters: []
deprecated: false
jsonSchemaDialect: >-
@ -5739,6 +5791,18 @@ components:
title: OpenAIResponseText
description: >-
Text response configuration for OpenAI responses.
ResponseShieldSpec:
type: object
properties:
type:
type: string
description: The type/identifier of the shield.
additionalProperties: false
required:
- type
title: ResponseShieldSpec
description: >-
Specification for a shield to apply during response generation.
OpenAIResponseInputTool:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch'
@ -9725,9 +9789,16 @@ tags:
x-displayName: >-
Protocol for conversation management operations.
- name: Files
description: ''
description: >-
This API is used to upload documents that can be used with other Llama Stack
APIs.
x-displayName: Files
- name: Inference
description: >-
Llama Stack Inference API for generating completions, chat completions, and
embeddings.
This API provides the raw interface to the underlying models. Two kinds of models
are supported:
@ -9735,23 +9806,25 @@ tags:
- Embedding models: these models generate embeddings to be used for semantic
search.
x-displayName: >-
Llama Stack Inference API for generating completions, chat completions, and
embeddings.
x-displayName: Inference
- name: Inspect
description: ''
description: >-
APIs for inspecting the Llama Stack service, including health status, available
API routes with their methods and implementing providers.
x-displayName: Inspect
- name: Models
description: ''
- name: Prompts
description: ''
x-displayName: >-
description: >-
Protocol for prompt management operations.
x-displayName: Prompts
- name: Providers
description: ''
x-displayName: >-
description: >-
Providers API for inspecting, listing, and modifying providers and their configurations.
x-displayName: Providers
- name: Safety
description: ''
description: OpenAI-compatible Moderations API.
x-displayName: Safety
- name: Scoring
description: ''
- name: ScoringFunctions

View file

@ -69,8 +69,8 @@
"tags": [
"Inference"
],
"summary": "List all chat completions.",
"description": "List all chat completions.",
"summary": "List chat completions.",
"description": "List chat completions.",
"parameters": [
{
"name": "after",
@ -146,8 +146,8 @@
"tags": [
"Inference"
],
"summary": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
"description": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
"summary": "Create chat completions.",
"description": "Create chat completions.\nGenerate an OpenAI-compatible chat completion for the given messages using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -191,8 +191,8 @@
"tags": [
"Inference"
],
"summary": "Describe a chat completion by its ID.",
"description": "Describe a chat completion by its ID.",
"summary": "Get chat completion.",
"description": "Get chat completion.\nDescribe a chat completion by its ID.",
"parameters": [
{
"name": "completion_id",
@ -236,8 +236,8 @@
"tags": [
"Inference"
],
"summary": "Generate an OpenAI-compatible completion for the given prompt using the specified model.",
"description": "Generate an OpenAI-compatible completion for the given prompt using the specified model.",
"summary": "Create completion.",
"description": "Create completion.\nGenerate an OpenAI-compatible completion for the given prompt using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -758,8 +758,8 @@
"tags": [
"Inference"
],
"summary": "Generate OpenAI-compatible embeddings for the given input using the specified model.",
"description": "Generate OpenAI-compatible embeddings for the given input using the specified model.",
"summary": "Create embeddings.",
"description": "Create embeddings.\nGenerate OpenAI-compatible embeddings for the given input using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -803,8 +803,8 @@
"tags": [
"Files"
],
"summary": "Returns a list of files that belong to the user's organization.",
"description": "Returns a list of files that belong to the user's organization.",
"summary": "List files.",
"description": "List files.\nReturns a list of files that belong to the user's organization.",
"parameters": [
{
"name": "after",
@ -873,8 +873,8 @@
"tags": [
"Files"
],
"summary": "Upload a file that can be used across various endpoints.",
"description": "Upload a file that can be used across various endpoints.\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.",
"summary": "Upload file.",
"description": "Upload file.\nUpload a file that can be used across various endpoints.\n\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.",
"parameters": [],
"requestBody": {
"content": {
@ -934,8 +934,8 @@
"tags": [
"Files"
],
"summary": "Returns information about a specific file.",
"description": "Returns information about a specific file.",
"summary": "Retrieve file.",
"description": "Retrieve file.\nReturns information about a specific file.",
"parameters": [
{
"name": "file_id",
@ -977,8 +977,8 @@
"tags": [
"Files"
],
"summary": "Delete a file.",
"description": "Delete a file.",
"summary": "Delete file.",
"description": "Delete file.",
"parameters": [
{
"name": "file_id",
@ -1022,8 +1022,8 @@
"tags": [
"Files"
],
"summary": "Returns the contents of the specified file.",
"description": "Returns the contents of the specified file.",
"summary": "Retrieve file content.",
"description": "Retrieve file content.\nReturns the contents of the specified file.",
"parameters": [
{
"name": "file_id",
@ -1067,8 +1067,8 @@
"tags": [
"Inspect"
],
"summary": "Get the current health status of the service.",
"description": "Get the current health status of the service.",
"summary": "Get health status.",
"description": "Get health status.\nGet the current health status of the service.",
"parameters": [],
"deprecated": false
}
@ -1102,8 +1102,8 @@
"tags": [
"Inspect"
],
"summary": "List all available API routes with their methods and implementing providers.",
"description": "List all available API routes with their methods and implementing providers.",
"summary": "List routes.",
"description": "List routes.\nList all available API routes with their methods and implementing providers.",
"parameters": [],
"deprecated": false
}
@ -1170,8 +1170,8 @@
"tags": [
"Models"
],
"summary": "Register a model.",
"description": "Register a model.",
"summary": "Register model.",
"description": "Register model.\nRegister a model.",
"parameters": [],
"requestBody": {
"content": {
@ -1215,8 +1215,8 @@
"tags": [
"Models"
],
"summary": "Get a model by its identifier.",
"description": "Get a model by its identifier.",
"summary": "Get model.",
"description": "Get model.\nGet a model by its identifier.",
"parameters": [
{
"name": "model_id",
@ -1251,8 +1251,8 @@
"tags": [
"Models"
],
"summary": "Unregister a model.",
"description": "Unregister a model.",
"summary": "Unregister model.",
"description": "Unregister model.\nUnregister a model.",
"parameters": [
{
"name": "model_id",
@ -1296,8 +1296,8 @@
"tags": [
"Safety"
],
"summary": "Classifies if text and/or image inputs are potentially harmful.",
"description": "Classifies if text and/or image inputs are potentially harmful.",
"summary": "Create moderation.",
"description": "Create moderation.\nClassifies if text and/or image inputs are potentially harmful.",
"parameters": [],
"requestBody": {
"content": {
@ -1374,8 +1374,8 @@
"tags": [
"Prompts"
],
"summary": "Create a new prompt.",
"description": "Create a new prompt.",
"summary": "Create prompt.",
"description": "Create prompt.\nCreate a new prompt.",
"parameters": [],
"requestBody": {
"content": {
@ -1419,8 +1419,8 @@
"tags": [
"Prompts"
],
"summary": "Get a prompt by its identifier and optional version.",
"description": "Get a prompt by its identifier and optional version.",
"summary": "Get prompt.",
"description": "Get prompt.\nGet a prompt by its identifier and optional version.",
"parameters": [
{
"name": "prompt_id",
@ -1471,8 +1471,8 @@
"tags": [
"Prompts"
],
"summary": "Update an existing prompt (increments version).",
"description": "Update an existing prompt (increments version).",
"summary": "Update prompt.",
"description": "Update prompt.\nUpdate an existing prompt (increments version).",
"parameters": [
{
"name": "prompt_id",
@ -1517,8 +1517,8 @@
"tags": [
"Prompts"
],
"summary": "Delete a prompt.",
"description": "Delete a prompt.",
"summary": "Delete prompt.",
"description": "Delete prompt.\nDelete a prompt.",
"parameters": [
{
"name": "prompt_id",
@ -1562,8 +1562,8 @@
"tags": [
"Prompts"
],
"summary": "Set which version of a prompt should be the default in get_prompt (latest).",
"description": "Set which version of a prompt should be the default in get_prompt (latest).",
"summary": "Set prompt version.",
"description": "Set prompt version.\nSet which version of a prompt should be the default in get_prompt (latest).",
"parameters": [
{
"name": "prompt_id",
@ -1617,8 +1617,8 @@
"tags": [
"Prompts"
],
"summary": "List all versions of a specific prompt.",
"description": "List all versions of a specific prompt.",
"summary": "List prompt versions.",
"description": "List prompt versions.\nList all versions of a specific prompt.",
"parameters": [
{
"name": "prompt_id",
@ -1662,8 +1662,8 @@
"tags": [
"Providers"
],
"summary": "List all available providers.",
"description": "List all available providers.",
"summary": "List providers.",
"description": "List providers.\nList all available providers.",
"parameters": [],
"deprecated": false
}
@ -1697,8 +1697,8 @@
"tags": [
"Providers"
],
"summary": "Get detailed information about a specific provider.",
"description": "Get detailed information about a specific provider.",
"summary": "Get provider.",
"description": "Get provider.\nGet detailed information about a specific provider.",
"parameters": [
{
"name": "provider_id",
@ -1742,8 +1742,8 @@
"tags": [
"Agents"
],
"summary": "List all OpenAI responses.",
"description": "List all OpenAI responses.",
"summary": "List all responses.",
"description": "List all responses.",
"parameters": [
{
"name": "after",
@ -1817,8 +1817,8 @@
"tags": [
"Agents"
],
"summary": "Create a new OpenAI response.",
"description": "Create a new OpenAI response.",
"summary": "Create a model response.",
"description": "Create a model response.",
"parameters": [],
"requestBody": {
"content": {
@ -1830,7 +1830,27 @@
},
"required": true
},
"deprecated": false
"deprecated": false,
"x-llama-stack-extra-body-params": [
{
"name": "shields",
"schema": {
"type": "array",
"items": {
"oneOf": [
{
"type": "string"
},
{
"$ref": "#/components/schemas/ResponseShieldSpec"
}
]
}
},
"description": "List of shields to apply during response generation. Shields provide safety and content moderation.",
"required": false
}
]
}
},
"/v1/responses/{response_id}": {
@ -1862,8 +1882,8 @@
"tags": [
"Agents"
],
"summary": "Retrieve an OpenAI response by its ID.",
"description": "Retrieve an OpenAI response by its ID.",
"summary": "Get a model response.",
"description": "Get a model response.",
"parameters": [
{
"name": "response_id",
@ -1905,8 +1925,8 @@
"tags": [
"Agents"
],
"summary": "Delete an OpenAI response by its ID.",
"description": "Delete an OpenAI response by its ID.",
"summary": "Delete a response.",
"description": "Delete a response.",
"parameters": [
{
"name": "response_id",
@ -1950,8 +1970,8 @@
"tags": [
"Agents"
],
"summary": "List input items for a given OpenAI response.",
"description": "List input items for a given OpenAI response.",
"summary": "List input items.",
"description": "List input items.",
"parameters": [
{
"name": "response_id",
@ -2043,8 +2063,8 @@
"tags": [
"Safety"
],
"summary": "Run a shield.",
"description": "Run a shield.",
"summary": "Run shield.",
"description": "Run shield.\nRun a shield.",
"parameters": [],
"requestBody": {
"content": {
@ -4176,8 +4196,8 @@
"tags": [
"Inspect"
],
"summary": "Get the version of the service.",
"description": "Get the version of the service.",
"summary": "Get version.",
"description": "Get version.\nGet the version of the service.",
"parameters": [],
"deprecated": false
}
@ -9625,6 +9645,21 @@
"title": "OpenAIResponseText",
"description": "Text response configuration for OpenAI responses."
},
"ResponseShieldSpec": {
"type": "object",
"properties": {
"type": {
"type": "string",
"description": "The type/identifier of the shield."
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "ResponseShieldSpec",
"description": "Specification for a shield to apply during response generation."
},
"OpenAIResponseInputTool": {
"oneOf": [
{
@ -18452,16 +18487,18 @@
},
{
"name": "Files",
"description": ""
"description": "This API is used to upload documents that can be used with other Llama Stack APIs.",
"x-displayName": "Files"
},
{
"name": "Inference",
"description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Llama Stack Inference API for generating completions, chat completions, and embeddings."
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Inference"
},
{
"name": "Inspect",
"description": ""
"description": "APIs for inspecting the Llama Stack service, including health status, available API routes with methods and implementing providers.",
"x-displayName": "Inspect"
},
{
"name": "Models",
@ -18473,17 +18510,18 @@
},
{
"name": "Prompts",
"description": "",
"x-displayName": "Protocol for prompt management operations."
"description": "Protocol for prompt management operations.",
"x-displayName": "Prompts"
},
{
"name": "Providers",
"description": "",
"x-displayName": "Providers API for inspecting, listing, and modifying providers and their configurations."
"description": "Providers API for inspecting, listing, and modifying providers and their configurations.",
"x-displayName": "Providers"
},
{
"name": "Safety",
"description": ""
"description": "OpenAI-compatible Moderations API.",
"x-displayName": "Safety"
},
{
"name": "Scoring",

View file

@ -36,8 +36,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: List all chat completions.
description: List all chat completions.
summary: List chat completions.
description: List chat completions.
parameters:
- name: after
in: query
@ -90,10 +90,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate an OpenAI-compatible chat completion for the given messages using
the specified model.
summary: Create chat completions.
description: >-
Create chat completions.
Generate an OpenAI-compatible chat completion for the given messages using
the specified model.
parameters: []
@ -125,8 +125,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: Describe a chat completion by its ID.
description: Describe a chat completion by its ID.
summary: Get chat completion.
description: >-
Get chat completion.
Describe a chat completion by its ID.
parameters:
- name: completion_id
in: path
@ -156,10 +159,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate an OpenAI-compatible completion for the given prompt using the specified
model.
summary: Create completion.
description: >-
Create completion.
Generate an OpenAI-compatible completion for the given prompt using the specified
model.
parameters: []
@ -606,10 +609,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate OpenAI-compatible embeddings for the given input using the specified
model.
summary: Create embeddings.
description: >-
Create embeddings.
Generate OpenAI-compatible embeddings for the given input using the specified
model.
parameters: []
@ -642,9 +645,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns a list of files that belong to the user's organization.
summary: List files.
description: >-
List files.
Returns a list of files that belong to the user's organization.
parameters:
- name: after
@ -702,11 +706,13 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Upload a file that can be used across various endpoints.
summary: Upload file.
description: >-
Upload file.
Upload a file that can be used across various endpoints.
The file upload should be a multipart form request with:
- file: The File object (not file name) to be uploaded.
@ -755,9 +761,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns information about a specific file.
summary: Retrieve file.
description: >-
Retrieve file.
Returns information about a specific file.
parameters:
- name: file_id
@ -789,8 +796,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: Delete a file.
description: Delete a file.
summary: Delete file.
description: Delete file.
parameters:
- name: file_id
in: path
@ -822,9 +829,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns the contents of the specified file.
summary: Retrieve file content.
description: >-
Retrieve file content.
Returns the contents of the specified file.
parameters:
- name: file_id
@ -857,9 +865,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inspect
summary: >-
Get the current health status of the service.
summary: Get health status.
description: >-
Get health status.
Get the current health status of the service.
parameters: []
deprecated: false
@ -885,9 +894,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inspect
summary: >-
List all available API routes with their methods and implementing providers.
summary: List routes.
description: >-
List routes.
List all available API routes with their methods and implementing providers.
parameters: []
deprecated: false
@ -936,8 +946,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Models
summary: Register a model.
description: Register a model.
summary: Register model.
description: >-
Register model.
Register a model.
parameters: []
requestBody:
content:
@ -967,8 +980,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Models
summary: Get a model by its identifier.
description: Get a model by its identifier.
summary: Get model.
description: >-
Get model.
Get a model by its identifier.
parameters:
- name: model_id
in: path
@ -993,8 +1009,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Models
summary: Unregister a model.
description: Unregister a model.
summary: Unregister model.
description: >-
Unregister model.
Unregister a model.
parameters:
- name: model_id
in: path
@ -1025,9 +1044,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Safety
summary: >-
Classifies if text and/or image inputs are potentially harmful.
summary: Create moderation.
description: >-
Create moderation.
Classifies if text and/or image inputs are potentially harmful.
parameters: []
requestBody:
@ -1083,8 +1103,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: Create a new prompt.
description: Create a new prompt.
summary: Create prompt.
description: >-
Create prompt.
Create a new prompt.
parameters: []
requestBody:
content:
@ -1114,9 +1137,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: >-
Get a prompt by its identifier and optional version.
summary: Get prompt.
description: >-
Get prompt.
Get a prompt by its identifier and optional version.
parameters:
- name: prompt_id
@ -1154,9 +1178,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: >-
Update an existing prompt (increments version).
summary: Update prompt.
description: >-
Update prompt.
Update an existing prompt (increments version).
parameters:
- name: prompt_id
@ -1188,8 +1213,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: Delete a prompt.
description: Delete a prompt.
summary: Delete prompt.
description: >-
Delete prompt.
Delete a prompt.
parameters:
- name: prompt_id
in: path
@ -1220,9 +1248,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: >-
Set which version of a prompt should be the default in get_prompt (latest).
summary: Set prompt version.
description: >-
Set prompt version.
Set which version of a prompt should be the default in get_prompt (latest).
parameters:
- name: prompt_id
@ -1260,8 +1289,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: List all versions of a specific prompt.
description: List all versions of a specific prompt.
summary: List prompt versions.
description: >-
List prompt versions.
List all versions of a specific prompt.
parameters:
- name: prompt_id
in: path
@ -1293,8 +1325,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Providers
summary: List all available providers.
description: List all available providers.
summary: List providers.
description: >-
List providers.
List all available providers.
parameters: []
deprecated: false
/v1/providers/{provider_id}:
@ -1319,9 +1354,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Providers
summary: >-
Get detailed information about a specific provider.
summary: Get provider.
description: >-
Get provider.
Get detailed information about a specific provider.
parameters:
- name: provider_id
@ -1352,8 +1388,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: List all OpenAI responses.
description: List all OpenAI responses.
summary: List all responses.
description: List all responses.
parameters:
- name: after
in: query
@ -1404,8 +1440,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Create a new OpenAI response.
description: Create a new OpenAI response.
summary: Create a model response.
description: Create a model response.
parameters: []
requestBody:
content:
@ -1414,6 +1450,18 @@ paths:
$ref: '#/components/schemas/CreateOpenaiResponseRequest'
required: true
deprecated: false
x-llama-stack-extra-body-params:
- name: shields
schema:
type: array
items:
oneOf:
- type: string
- $ref: '#/components/schemas/ResponseShieldSpec'
description: >-
List of shields to apply during response generation. Shields provide safety
and content moderation.
required: false
/v1/responses/{response_id}:
get:
responses:
@ -1435,8 +1483,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Retrieve an OpenAI response by its ID.
description: Retrieve an OpenAI response by its ID.
summary: Get a model response.
description: Get a model response.
parameters:
- name: response_id
in: path
@ -1466,8 +1514,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Delete an OpenAI response by its ID.
description: Delete an OpenAI response by its ID.
summary: Delete a response.
description: Delete a response.
parameters:
- name: response_id
in: path
@ -1497,10 +1545,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: >-
List input items for a given OpenAI response.
description: >-
List input items for a given OpenAI response.
summary: List input items.
description: List input items.
parameters:
- name: response_id
in: path
@ -1569,8 +1615,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Safety
summary: Run a shield.
description: Run a shield.
summary: Run shield.
description: >-
Run shield.
Run a shield.
parameters: []
requestBody:
content:
@ -3126,8 +3175,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inspect
summary: Get the version of the service.
description: Get the version of the service.
summary: Get version.
description: >-
Get version.
Get the version of the service.
parameters: []
deprecated: false
/v1beta/datasetio/append-rows/{dataset_id}:
@ -7184,6 +7236,18 @@ components:
title: OpenAIResponseText
description: >-
Text response configuration for OpenAI responses.
ResponseShieldSpec:
type: object
properties:
type:
type: string
description: The type/identifier of the shield.
additionalProperties: false
required:
- type
title: ResponseShieldSpec
description: >-
Specification for a shield to apply during response generation.
OpenAIResponseInputTool:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch'
@ -13771,9 +13835,16 @@ tags:
x-displayName: >-
Llama Stack Evaluation API for running evaluations on model and agent candidates.
- name: Files
description: ''
description: >-
This API is used to upload documents that can be used with other Llama Stack
APIs.
x-displayName: Files
- name: Inference
description: >-
Llama Stack Inference API for generating completions, chat completions, and
embeddings.
This API provides the raw interface to the underlying models. Two kinds of models
are supported:
@ -13781,25 +13852,27 @@ tags:
- Embedding models: these models generate embeddings to be used for semantic
search.
x-displayName: >-
Llama Stack Inference API for generating completions, chat completions, and
embeddings.
x-displayName: Inference
- name: Inspect
description: ''
description: >-
APIs for inspecting the Llama Stack service, including health status, available
API routes with their methods and implementing providers.
x-displayName: Inspect
- name: Models
description: ''
- name: PostTraining (Coming Soon)
description: ''
- name: Prompts
description: ''
x-displayName: >-
description: >-
Protocol for prompt management operations.
x-displayName: Prompts
- name: Providers
description: ''
x-displayName: >-
description: >-
Providers API for inspecting, listing, and modifying providers and their configurations.
x-displayName: Providers
- name: Safety
description: ''
description: OpenAI-compatible Moderations API.
x-displayName: Safety
- name: Scoring
description: ''
- name: ScoringFunctions

View file

@ -28,7 +28,7 @@ from llama_stack.apis.inference import (
from llama_stack.apis.safety import SafetyViolation
from llama_stack.apis.tools import ToolDef
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
from llama_stack.schema_utils import ExtraBodyField, json_schema_type, register_schema, webmethod
from .openai_responses import (
ListOpenAIResponseInputItem,
@ -42,6 +42,20 @@ from .openai_responses import (
)
@json_schema_type
class ResponseShieldSpec(BaseModel):
"""Specification for a shield to apply during response generation.
:param type: The type/identifier of the shield.
"""
type: str
# TODO: more fields to be added for shield configuration
ResponseShield = str | ResponseShieldSpec
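For reference, either form below is a valid entry in the shields list accepted by the responses API; the identifiers are made up for illustration:

shields: list[ResponseShield] = [
    "llama-guard",                            # bare shield ID (string form)
    ResponseShieldSpec(type="prompt-guard"),  # structured form; more fields may be added later
]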
class Attachment(BaseModel):
"""An attachment to an agent turn.
@ -783,7 +797,7 @@ class Agents(Protocol):
self,
response_id: str,
) -> OpenAIResponseObject:
"""Retrieve an OpenAI response by its ID.
"""Get a model response.
:param response_id: The ID of the OpenAI response to retrieve.
:returns: An OpenAIResponseObject.
@ -805,13 +819,20 @@ class Agents(Protocol):
tools: list[OpenAIResponseInputTool] | None = None,
include: list[str] | None = None,
max_infer_iters: int | None = 10, # this is an extension to the OpenAI API
shields: Annotated[
list[ResponseShield] | None,
ExtraBodyField(
"List of shields to apply during response generation. Shields provide safety and content moderation."
),
] = None,
) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
"""Create a new OpenAI response.
"""Create a model response.
:param input: Input message(s) to create the response.
:param model: The underlying LLM used for completions.
:param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
:param include: (Optional) Additional fields to include in the response.
:param shields: (Optional) List of shields to apply during response generation. Can be shield IDs (strings) or shield specifications.
:returns: An OpenAIResponseObject.
"""
...
@ -825,7 +846,7 @@ class Agents(Protocol):
model: str | None = None,
order: Order | None = Order.desc,
) -> ListOpenAIResponseObject:
"""List all OpenAI responses.
"""List all responses.
:param after: The ID of the last response to return.
:param limit: The number of responses to return.
@ -848,7 +869,7 @@ class Agents(Protocol):
limit: int | None = 20,
order: Order | None = Order.desc,
) -> ListOpenAIResponseInputItem:
"""List input items for a given OpenAI response.
"""List input items.
:param response_id: The ID of the response to retrieve input items for.
:param after: An item ID to list items after, used for pagination.
@ -863,7 +884,7 @@ class Agents(Protocol):
@webmethod(route="/openai/v1/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1)
async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
"""Delete an OpenAI response by its ID.
"""Delete a response.
:param response_id: The ID of the OpenAI response to delete.
:returns: An OpenAIDeleteResponseObject

View file

@ -104,6 +104,11 @@ class OpenAIFileDeleteResponse(BaseModel):
@runtime_checkable
@trace_protocol
class Files(Protocol):
"""Files
This API is used to upload documents that can be used with other Llama Stack APIs.
"""
# OpenAI Files API Endpoints
@webmethod(route="/openai/v1/files", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/files", method="POST", level=LLAMA_STACK_API_V1)
@ -113,7 +118,8 @@ class Files(Protocol):
purpose: Annotated[OpenAIFilePurpose, Form()],
expires_after: Annotated[ExpiresAfter | None, Form()] = None,
) -> OpenAIFileObject:
"""
"""Upload file.
Upload a file that can be used across various endpoints.
The file upload should be a multipart form request with:
@ -137,7 +143,8 @@ class Files(Protocol):
order: Order | None = Order.desc,
purpose: OpenAIFilePurpose | None = None,
) -> ListOpenAIFileResponse:
"""
"""List files.
Returns a list of files that belong to the user's organization.
:param after: A cursor for use in pagination. `after` is an object ID that defines your place in the list. For instance, if you make a list request and receive 100 objects, ending with obj_foo, your subsequent call can include after=obj_foo in order to fetch the next page of the list.
@ -154,7 +161,8 @@ class Files(Protocol):
self,
file_id: str,
) -> OpenAIFileObject:
"""
"""Retrieve file.
Returns information about a specific file.
:param file_id: The ID of the file to use for this request.
@ -168,8 +176,7 @@ class Files(Protocol):
self,
file_id: str,
) -> OpenAIFileDeleteResponse:
"""
Delete a file.
"""Delete file.
:param file_id: The ID of the file to use for this request.
:returns: An OpenAIFileDeleteResponse indicating successful deletion.
@ -182,7 +189,8 @@ class Files(Protocol):
self,
file_id: str,
) -> Response:
"""
"""Retrieve file content.
Returns the contents of the specified file.
:param file_id: The ID of the file to use for this request.
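Taken together, the file operations above map onto the OpenAI-style files endpoints, so the stock client can drive them; a minimal sketch, assuming a local server and a purpose value the stack accepts:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

uploaded = client.files.create(file=open("notes.txt", "rb"), purpose="assistants")
print(client.files.list().data)                 # list files
print(client.files.retrieve(uploaded.id))       # metadata for one file
print(client.files.content(uploaded.id).text)   # raw contents
client.files.delete(uploaded.id)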

View file

@ -1053,7 +1053,9 @@ class InferenceProvider(Protocol):
# for fill-in-the-middle type completion
suffix: str | None = None,
) -> OpenAICompletion:
"""Generate an OpenAI-compatible completion for the given prompt using the specified model.
"""Create completion.
Generate an OpenAI-compatible completion for the given prompt using the specified model.
:param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param prompt: The prompt to generate a completion for.
@ -1105,7 +1107,9 @@ class InferenceProvider(Protocol):
top_p: float | None = None,
user: str | None = None,
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
"""Generate an OpenAI-compatible chat completion for the given messages using the specified model.
"""Create chat completions.
Generate an OpenAI-compatible chat completion for the given messages using the specified model.
:param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param messages: List of messages in the conversation.
@ -1144,7 +1148,9 @@ class InferenceProvider(Protocol):
dimensions: int | None = None,
user: str | None = None,
) -> OpenAIEmbeddingsResponse:
"""Generate OpenAI-compatible embeddings for the given input using the specified model.
"""Create embeddings.
Generate OpenAI-compatible embeddings for the given input using the specified model.
:param model: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
:param input: Input text to embed, encoded as a string or array of strings. To embed multiple inputs in a single request, pass an array of strings.
@ -1157,7 +1163,9 @@ class InferenceProvider(Protocol):
class Inference(InferenceProvider):
"""Llama Stack Inference API for generating completions, chat completions, and embeddings.
"""Inference
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
@ -1173,7 +1181,7 @@ class Inference(InferenceProvider):
model: str | None = None,
order: Order | None = Order.desc,
) -> ListOpenAIChatCompletionResponse:
"""List all chat completions.
"""List chat completions.
:param after: The ID of the last chat completion to return.
:param limit: The maximum number of chat completions to return.
@ -1188,7 +1196,9 @@ class Inference(InferenceProvider):
)
@webmethod(route="/chat/completions/{completion_id}", method="GET", level=LLAMA_STACK_API_V1)
async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
"""Describe a chat completion by its ID.
"""Get chat completion.
Describe a chat completion by its ID.
:param completion_id: ID of the chat completion.
:returns: A OpenAICompletionWithInputMessages.
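A minimal sketch of the OpenAI-compatible inference surface described above, assuming a local server and made-up model identifiers:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

chat = client.chat.completions.create(
    model="llama3.2:3b",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(chat.choices[0].message.content)

emb = client.embeddings.create(model="all-MiniLM-L6-v2", input="hello world")
print(len(emb.data[0].embedding))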

View file

@ -58,9 +58,16 @@ class ListRoutesResponse(BaseModel):
@runtime_checkable
class Inspect(Protocol):
"""Inspect
APIs for inspecting the Llama Stack service, including health status, available API routes with their methods and implementing providers.
"""
@webmethod(route="/inspect/routes", method="GET", level=LLAMA_STACK_API_V1)
async def list_routes(self) -> ListRoutesResponse:
"""List all available API routes with their methods and implementing providers.
"""List routes.
List all available API routes with their methods and implementing providers.
:returns: Response containing information about all available routes.
"""
@ -68,7 +75,9 @@ class Inspect(Protocol):
@webmethod(route="/health", method="GET", level=LLAMA_STACK_API_V1)
async def health(self) -> HealthInfo:
"""Get the current health status of the service.
"""Get health status.
Get the current health status of the service.
:returns: Health information indicating if the service is operational.
"""
@ -76,7 +85,9 @@ class Inspect(Protocol):
@webmethod(route="/version", method="GET", level=LLAMA_STACK_API_V1)
async def version(self) -> VersionInfo:
"""Get the version of the service.
"""Get version.
Get the version of the service.
:returns: Version information containing the service version number.
"""

View file

@ -124,7 +124,9 @@ class Models(Protocol):
self,
model_id: str,
) -> Model:
"""Get a model by its identifier.
"""Get model.
Get a model by its identifier.
:param model_id: The identifier of the model to get.
:returns: A Model.
@ -140,7 +142,9 @@ class Models(Protocol):
metadata: dict[str, Any] | None = None,
model_type: ModelType | None = None,
) -> Model:
"""Register a model.
"""Register model.
Register a model.
:param model_id: The identifier of the model to register.
:param provider_model_id: The identifier of the model in the provider.
@ -156,7 +160,9 @@ class Models(Protocol):
self,
model_id: str,
) -> None:
"""Unregister a model.
"""Unregister model.
Unregister a model.
:param model_id: The identifier of the model to unregister.
"""

View file

@ -94,7 +94,9 @@ class ListPromptsResponse(BaseModel):
@runtime_checkable
@trace_protocol
class Prompts(Protocol):
"""Protocol for prompt management operations."""
"""Prompts
Protocol for prompt management operations."""
@webmethod(route="/prompts", method="GET", level=LLAMA_STACK_API_V1)
async def list_prompts(self) -> ListPromptsResponse:
@ -109,7 +111,9 @@ class Prompts(Protocol):
self,
prompt_id: str,
) -> ListPromptsResponse:
"""List all versions of a specific prompt.
"""List prompt versions.
List all versions of a specific prompt.
:param prompt_id: The identifier of the prompt to list versions for.
:returns: A ListPromptsResponse containing all versions of the prompt.
@ -122,7 +126,9 @@ class Prompts(Protocol):
prompt_id: str,
version: int | None = None,
) -> Prompt:
"""Get a prompt by its identifier and optional version.
"""Get prompt.
Get a prompt by its identifier and optional version.
:param prompt_id: The identifier of the prompt to get.
:param version: The version of the prompt to get (defaults to latest).
@ -136,7 +142,9 @@ class Prompts(Protocol):
prompt: str,
variables: list[str] | None = None,
) -> Prompt:
"""Create a new prompt.
"""Create prompt.
Create a new prompt.
:param prompt: The prompt text content with variable placeholders.
:param variables: List of variable names that can be used in the prompt template.
@ -153,7 +161,9 @@ class Prompts(Protocol):
variables: list[str] | None = None,
set_as_default: bool = True,
) -> Prompt:
"""Update an existing prompt (increments version).
"""Update prompt.
Update an existing prompt (increments version).
:param prompt_id: The identifier of the prompt to update.
:param prompt: The updated prompt text content.
@ -169,7 +179,9 @@ class Prompts(Protocol):
self,
prompt_id: str,
) -> None:
"""Delete a prompt.
"""Delete prompt.
Delete a prompt.
:param prompt_id: The identifier of the prompt to delete.
"""
@ -181,7 +193,9 @@ class Prompts(Protocol):
prompt_id: str,
version: int,
) -> Prompt:
"""Set which version of a prompt should be the default in get_prompt (latest).
"""Set prompt version.
Set which version of a prompt should be the default in get_prompt (latest).
:param prompt_id: The identifier of the prompt.
:param version: The version to set as default.
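A sketch of the prompt lifecycle described above, written directly against the protocol (for example from a provider or a test); the implementation object, method names, and field names are assumptions:

# Inside an async function, with prompts_impl an assumed object implementing
# the Prompts protocol above:
created = await prompts_impl.create_prompt(
    prompt="Summarize the following text: {{ text }}",
    variables=["text"],
)
updated = await prompts_impl.update_prompt(
    prompt_id=created.prompt_id,
    prompt="Summarize the following text in two sentences: {{ text }}",
    variables=["text"],
    set_as_default=True,
)
versions = await prompts_impl.list_prompt_versions(created.prompt_id)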

View file

@ -42,13 +42,16 @@ class ListProvidersResponse(BaseModel):
@runtime_checkable
class Providers(Protocol):
"""
"""Providers
Providers API for inspecting, listing, and modifying providers and their configurations.
"""
@webmethod(route="/providers", method="GET", level=LLAMA_STACK_API_V1)
async def list_providers(self) -> ListProvidersResponse:
"""List all available providers.
"""List providers.
List all available providers.
:returns: A ListProvidersResponse containing information about all providers.
"""
@ -56,7 +59,9 @@ class Providers(Protocol):
@webmethod(route="/providers/{provider_id}", method="GET", level=LLAMA_STACK_API_V1)
async def inspect_provider(self, provider_id: str) -> ProviderInfo:
"""Get detailed information about a specific provider.
"""Get provider.
Get detailed information about a specific provider.
:param provider_id: The ID of the provider to inspect.
:returns: A ProviderInfo object containing the provider's details.
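A sketch of the two provider endpoints above over HTTP; the `/v1` prefix, the response field names, and the provider ID are assumptions:

import requests

base = "http://localhost:8321/v1"
for p in requests.get(f"{base}/providers").json()["data"]:
    print(p["api"], p["provider_id"], p["provider_type"])
print(requests.get(f"{base}/providers/ollama").json())  # inspect one provider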

View file

@ -96,6 +96,11 @@ class ShieldStore(Protocol):
@runtime_checkable
@trace_protocol
class Safety(Protocol):
"""Safety
OpenAI-compatible Moderations API.
"""
shield_store: ShieldStore
@webmethod(route="/safety/run-shield", method="POST", level=LLAMA_STACK_API_V1)
@ -105,7 +110,9 @@ class Safety(Protocol):
messages: list[Message],
params: dict[str, Any],
) -> RunShieldResponse:
"""Run a shield.
"""Run shield.
Run a shield.
:param shield_id: The identifier of the shield to run.
:param messages: The messages to run the shield on.
@ -117,7 +124,9 @@ class Safety(Protocol):
@webmethod(route="/openai/v1/moderations", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/moderations", method="POST", level=LLAMA_STACK_API_V1)
async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
"""Classifies if text and/or image inputs are potentially harmful.
"""Create moderation.
Classifies if text and/or image inputs are potentially harmful.
:param input: Input (or inputs) to classify.
Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models.
:param model: The content moderation model you would like to use.
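The moderations side of this API is OpenAI-compatible, so the stock client works; a sketch assuming a local server and a made-up moderation model identifier:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

result = client.moderations.create(
    model="llama-guard",  # assumed moderation model ID
    input="I want to write some questionable text.",
)
print(result.results[0].flagged)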

View file

@ -6,11 +6,18 @@
import argparse
import os
import ssl
import subprocess
from pathlib import Path
import uvicorn
import yaml
from llama_stack.cli.stack.utils import ImageType
from llama_stack.cli.subcommand import Subcommand
from llama_stack.core.datatypes import LoggingConfig, StackRunConfig
from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars, validate_env_pair
from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
from llama_stack.log import get_logger
REPO_ROOT = Path(__file__).parent.parent.parent.parent
@ -146,23 +153,7 @@ class StackRun(Subcommand):
# using the current environment packages.
if not image_type and not image_name:
logger.info("No image type or image name provided. Assuming environment packages.")
from llama_stack.core.server.server import main as server_main
# Build the server args from the current args passed to the CLI
server_args = argparse.Namespace()
for arg in vars(args):
# If this is a function, avoid passing it
# "args" contains:
# func=<bound method StackRun._run_stack_run_cmd of <llama_stack.cli.stack.run.StackRun object at 0x10484b010>>
if callable(getattr(args, arg)):
continue
if arg == "config":
server_args.config = str(config_file)
else:
setattr(server_args, arg, getattr(args, arg))
# Run the server
server_main(server_args)
self._uvicorn_run(config_file, args)
else:
run_args = formulate_run_args(image_type, image_name)
@ -184,6 +175,76 @@ class StackRun(Subcommand):
run_command(run_args)
def _uvicorn_run(self, config_file: Path | None, args: argparse.Namespace) -> None:
if not config_file:
self.parser.error("Config file is required")
# Set environment variables if provided
if args.env:
for env_pair in args.env:
try:
key, value = validate_env_pair(env_pair)
logger.info(f"Setting environment variable {key} => {value}")
os.environ[key] = value
except ValueError as e:
logger.error(f"Error: {str(e)}")
self.parser.error(f"Invalid environment variable format: {env_pair}")
config_file = resolve_config_or_distro(str(config_file), Mode.RUN)
with open(config_file) as fp:
config_contents = yaml.safe_load(fp)
if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")):
logger_config = LoggingConfig(**cfg)
else:
logger_config = None
config = StackRunConfig(**cast_image_name_to_string(replace_env_vars(config_contents)))
port = args.port or config.server.port
host = config.server.host or ["::", "0.0.0.0"]
# Set the config file in environment so create_app can find it
os.environ["LLAMA_STACK_CONFIG"] = str(config_file)
uvicorn_config = {
"factory": True,
"host": host,
"port": port,
"lifespan": "on",
"log_level": logger.getEffectiveLevel(),
"log_config": logger_config,
}
keyfile = config.server.tls_keyfile
certfile = config.server.tls_certfile
if keyfile and certfile:
uvicorn_config["ssl_keyfile"] = config.server.tls_keyfile
uvicorn_config["ssl_certfile"] = config.server.tls_certfile
if config.server.tls_cafile:
uvicorn_config["ssl_ca_certs"] = config.server.tls_cafile
uvicorn_config["ssl_cert_reqs"] = ssl.CERT_REQUIRED
logger.info(
f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}\n CA: {config.server.tls_cafile}"
)
else:
logger.info(f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}")
logger.info(f"Listening on {host}:{port}")
# We need to catch KeyboardInterrupt because uvicorn's signal handling
# re-raises SIGINT signals using signal.raise_signal(), which Python
# converts to KeyboardInterrupt. Without this catch, we'd get a confusing
# stack trace when using Ctrl+C or kill -2 (SIGINT).
# SIGTERM (kill -15) works fine without this because Python doesn't
# have a default handler for it.
#
# Another approach would be to ignore SIGINT entirely - let uvicorn handle it through its own
# signal handling but this is quite intrusive and not worth the effort.
try:
uvicorn.run("llama_stack.core.server.server:create_app", **uvicorn_config)
except (KeyboardInterrupt, SystemExit):
logger.info("Received interrupt signal, shutting down gracefully...")
def _start_ui_development_server(self, stack_server_port: int):
logger.info("Attempting to start UI development server...")
# Check if npm is available

View file

@ -324,14 +324,14 @@ fi
RUN pip uninstall -y uv
EOF
# If a run config is provided, we use the --config flag
# If a run config is provided, we use the llama stack CLI
if [[ -n "$run_config" ]]; then
add_to_container << EOF
ENTRYPOINT ["python", "-m", "llama_stack.core.server.server", "$RUN_CONFIG_PATH"]
ENTRYPOINT ["llama", "stack", "run", "$RUN_CONFIG_PATH"]
EOF
elif [[ "$distro_or_config" != *.yaml ]]; then
add_to_container << EOF
ENTRYPOINT ["python", "-m", "llama_stack.core.server.server", "$distro_or_config"]
ENTRYPOINT ["llama", "stack", "run", "$distro_or_config"]
EOF
fi

View file

@ -243,6 +243,7 @@ def get_external_providers_from_module(
spec = module.get_provider_spec()
else:
# pass in a partially filled out provider spec to satisfy the registry -- knowing we will be overwriting it later upon build and run
# in the case we are building we CANNOT import this module of course because it has not been installed.
spec = ProviderSpec(
api=Api(provider_api),
provider_type=provider.provider_type,
@ -251,9 +252,20 @@ def get_external_providers_from_module(
config_class="",
)
provider_type = provider.provider_type
# in the case we are building we CANNOT import this module of course because it has not been installed.
# return a partially filled out spec that the build script will populate.
registry[Api(provider_api)][provider_type] = spec
if isinstance(spec, list):
# Optionally allow inline and remote provider specs to be passed as a returned list.
# With the old method, users could pass in directories of specs backed by overlapping code;
# we want to ensure we preserve that flexibility in this method.
logger.info(
f"Detected a list of external provider specs from {provider.module} adding all to the registry"
)
for provider_spec in spec:
if provider_spec.provider_type != provider.provider_type:
continue
logger.info(f"Adding {provider.provider_type} to registry")
registry[Api(provider_api)][provider.provider_type] = provider_spec
else:
registry[Api(provider_api)][provider_type] = spec
except ModuleNotFoundError as exc:
raise ValueError(
"get_provider_spec not found. If specifying an external provider via `module` in the Provider spec, the Provider must have the `provider.get_provider_spec` module available"

View file

@ -374,6 +374,10 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
body = options.params or {}
body |= options.json_data or {}
# Merge extra_json parameters (extra_body from SDK is converted to extra_json)
if hasattr(options, "extra_json") and options.extra_json:
body |= options.extra_json
matched_func, path_params, route_path, webmethod = find_matching_route(options.method, path, self.route_impls)
body |= path_params

View file

@ -4,7 +4,6 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import asyncio
import concurrent.futures
import functools
@ -12,7 +11,6 @@ import inspect
import json
import logging # allow-direct-logging
import os
import ssl
import sys
import traceback
import warnings
@ -35,7 +33,6 @@ from pydantic import BaseModel, ValidationError
from llama_stack.apis.common.errors import ConflictError, ResourceNotFoundError
from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.cli.utils import add_config_distro_args, get_config_from_args
from llama_stack.core.access_control.access_control import AccessDeniedError
from llama_stack.core.datatypes import (
AuthenticationRequiredError,
@ -55,7 +52,6 @@ from llama_stack.core.stack import (
Stack,
cast_image_name_to_string,
replace_env_vars,
validate_env_pair,
)
from llama_stack.core.utils.config import redact_sensitive_fields
from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
@ -333,23 +329,18 @@ class ClientVersionMiddleware:
return await self.app(scope, receive, send)
def create_app(
config_file: str | None = None,
env_vars: list[str] | None = None,
) -> StackApp:
def create_app() -> StackApp:
"""Create and configure the FastAPI application.
Args:
config_file: Path to config file. If None, uses LLAMA_STACK_CONFIG env var or default resolution.
env_vars: List of environment variables in KEY=value format.
disable_version_check: Whether to disable version checking. If None, uses LLAMA_STACK_DISABLE_VERSION_CHECK env var.
This factory function reads configuration from environment variables:
- LLAMA_STACK_CONFIG: Path to config file (required)
Returns:
Configured StackApp instance.
"""
config_file = config_file or os.getenv("LLAMA_STACK_CONFIG")
config_file = os.getenv("LLAMA_STACK_CONFIG")
if config_file is None:
raise ValueError("No config file provided and LLAMA_STACK_CONFIG env var is not set")
raise ValueError("LLAMA_STACK_CONFIG environment variable is required")
config_file = resolve_config_or_distro(config_file, Mode.RUN)
@ -361,16 +352,6 @@ def create_app(
logger_config = LoggingConfig(**cfg)
logger = get_logger(name=__name__, category="core::server", config=logger_config)
if env_vars:
for env_pair in env_vars:
try:
key, value = validate_env_pair(env_pair)
logger.info(f"Setting environment variable {key} => {value}")
os.environ[key] = value
except ValueError as e:
logger.error(f"Error: {str(e)}")
raise ValueError(f"Invalid environment variable format: {env_pair}") from e
config = replace_env_vars(config_contents)
config = StackRunConfig(**cast_image_name_to_string(config))
@ -494,101 +475,6 @@ def create_app(
return app
def main(args: argparse.Namespace | None = None):
"""Start the LlamaStack server."""
parser = argparse.ArgumentParser(description="Start the LlamaStack server.")
add_config_distro_args(parser)
parser.add_argument(
"--port",
type=int,
default=int(os.getenv("LLAMA_STACK_PORT", 8321)),
help="Port to listen on",
)
parser.add_argument(
"--env",
action="append",
help="Environment variables in KEY=value format. Can be specified multiple times.",
)
# Determine whether the server args are being passed by the "run" command, if this is the case
# the args will be passed as a Namespace object to the main function, otherwise they will be
# parsed from the command line
if args is None:
args = parser.parse_args()
config_or_distro = get_config_from_args(args)
try:
app = create_app(
config_file=config_or_distro,
env_vars=args.env,
)
except Exception as e:
logger.error(f"Error creating app: {str(e)}")
sys.exit(1)
config_file = resolve_config_or_distro(config_or_distro, Mode.RUN)
with open(config_file) as fp:
config_contents = yaml.safe_load(fp)
if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")):
logger_config = LoggingConfig(**cfg)
else:
logger_config = None
config = StackRunConfig(**cast_image_name_to_string(replace_env_vars(config_contents)))
import uvicorn
# Configure SSL if certificates are provided
port = args.port or config.server.port
ssl_config = None
keyfile = config.server.tls_keyfile
certfile = config.server.tls_certfile
if keyfile and certfile:
ssl_config = {
"ssl_keyfile": keyfile,
"ssl_certfile": certfile,
}
if config.server.tls_cafile:
ssl_config["ssl_ca_certs"] = config.server.tls_cafile
ssl_config["ssl_cert_reqs"] = ssl.CERT_REQUIRED
logger.info(
f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}\n CA: {config.server.tls_cafile}"
)
else:
logger.info(f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}")
listen_host = config.server.host or ["::", "0.0.0.0"]
logger.info(f"Listening on {listen_host}:{port}")
uvicorn_config = {
"app": app,
"host": listen_host,
"port": port,
"lifespan": "on",
"log_level": logger.getEffectiveLevel(),
"log_config": logger_config,
}
if ssl_config:
uvicorn_config.update(ssl_config)
# We need to catch KeyboardInterrupt because uvicorn's signal handling
# re-raises SIGINT signals using signal.raise_signal(), which Python
# converts to KeyboardInterrupt. Without this catch, we'd get a confusing
# stack trace when using Ctrl+C or kill -2 (SIGINT).
# SIGTERM (kill -15) works fine without this because Python doesn't
# have a default handler for it.
#
# Another approach would be to ignore SIGINT entirely - let uvicorn handle it through its own
# signal handling but this is quite intrusive and not worth the effort.
try:
asyncio.run(uvicorn.Server(uvicorn.Config(**uvicorn_config)).serve())
except (KeyboardInterrupt, SystemExit):
logger.info("Received interrupt signal, shutting down gracefully...")
def _log_run_config(run_config: StackRunConfig):
"""Logs the run config with redacted fields and disabled providers removed."""
logger.info("Run configuration:")
@ -615,7 +501,3 @@ def remove_disabled_providers(obj):
return [item for item in (remove_disabled_providers(i) for i in obj) if item is not None]
else:
return obj
if __name__ == "__main__":
main()
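
With main() and its argparse handling removed, the server is now built through the create_app() factory and configured entirely from environment variables, then started via `llama stack run` (see the script change below). A minimal sketch of driving the factory directly with uvicorn; the "starter" config name is a placeholder, not a value taken from this diff:

```python
# Illustrative sketch only: running the factory-style app with uvicorn.
# create_app() takes no arguments and requires LLAMA_STACK_CONFIG to be set.
import os
import uvicorn

os.environ["LLAMA_STACK_CONFIG"] = "starter"  # placeholder config/distro name

uvicorn.run(
    "llama_stack.core.server.server:create_app",
    factory=True,   # call create_app() to build the app
    host="0.0.0.0",
    port=8321,      # default port used elsewhere in this file
)
```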

View file

@ -116,7 +116,7 @@ if [[ "$env_type" == "venv" ]]; then
yaml_config_arg=""
fi
$PYTHON_BINARY -m llama_stack.core.server.server \
llama stack run \
$yaml_config_arg \
--port "$port" \
$env_vars \

View file

@ -9,7 +9,7 @@ from pathlib import Path
from llama_stack.log import get_logger
logger = get_logger(__name__, "tokenizer_utils")
logger = get_logger(__name__, "models")
def load_bpe_file(model_path: Path) -> dict[bytes, int]:

View file

@ -329,6 +329,7 @@ class MetaReferenceAgentsImpl(Agents):
tools: list[OpenAIResponseInputTool] | None = None,
include: list[str] | None = None,
max_infer_iters: int | None = 10,
shields: list | None = None,
) -> OpenAIResponseObject:
return await self.openai_responses_impl.create_openai_response(
input,
@ -342,6 +343,7 @@ class MetaReferenceAgentsImpl(Agents):
tools,
include,
max_infer_iters,
shields,
)
async def list_openai_responses(

View file

@ -208,10 +208,15 @@ class OpenAIResponsesImpl:
tools: list[OpenAIResponseInputTool] | None = None,
include: list[str] | None = None,
max_infer_iters: int | None = 10,
shields: list | None = None,
):
stream = bool(stream)
text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text
# Shields parameter received via extra_body - not yet implemented
if shields is not None:
raise NotImplementedError("Shields parameter is not yet implemented in the meta-reference provider")
stream_gen = self._create_streaming_response(
input=input,
model=model,
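
As the comment above notes, the shields argument is expected to arrive through the OpenAI-compatible extra_body mechanism, and the meta-reference provider currently rejects it. A hedged client-side sketch of what that call shape might look like; the base_url, model, and shield identifier are placeholders, not values from this diff:

```python
# Sketch only: passing shields via extra_body to an OpenAI-compatible Responses endpoint.
# base_url, model, and the shield id are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

response = client.responses.create(
    model="llama3.2:3b",
    input="Tell me a joke",
    extra_body={"shields": ["llama-guard"]},  # raises NotImplementedError in meta-reference today
)
```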

View file

@ -52,9 +52,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference,
adapter_type="cerebras",
provider_type="remote::cerebras",
pip_packages=[
"cerebras_cloud_sdk",
],
pip_packages=[],
module="llama_stack.providers.remote.inference.cerebras",
config_class="llama_stack.providers.remote.inference.cerebras.CerebrasImplConfig",
description="Cerebras inference provider for running models on Cerebras Cloud platform.",
@ -169,7 +167,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference,
adapter_type="openai",
provider_type="remote::openai",
pip_packages=["litellm"],
pip_packages=[],
module="llama_stack.providers.remote.inference.openai",
config_class="llama_stack.providers.remote.inference.openai.OpenAIConfig",
provider_data_validator="llama_stack.providers.remote.inference.openai.config.OpenAIProviderDataValidator",
@ -179,7 +177,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference,
adapter_type="anthropic",
provider_type="remote::anthropic",
pip_packages=["litellm"],
pip_packages=["anthropic"],
module="llama_stack.providers.remote.inference.anthropic",
config_class="llama_stack.providers.remote.inference.anthropic.AnthropicConfig",
provider_data_validator="llama_stack.providers.remote.inference.anthropic.config.AnthropicProviderDataValidator",
@ -189,9 +187,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference,
adapter_type="gemini",
provider_type="remote::gemini",
pip_packages=[
"litellm",
],
pip_packages=[],
module="llama_stack.providers.remote.inference.gemini",
config_class="llama_stack.providers.remote.inference.gemini.GeminiConfig",
provider_data_validator="llama_stack.providers.remote.inference.gemini.config.GeminiProviderDataValidator",
@ -202,7 +198,6 @@ def available_providers() -> list[ProviderSpec]:
adapter_type="vertexai",
provider_type="remote::vertexai",
pip_packages=[
"litellm",
"google-cloud-aiplatform",
],
module="llama_stack.providers.remote.inference.vertexai",
@ -233,9 +228,7 @@ Available Models:
api=Api.inference,
adapter_type="groq",
provider_type="remote::groq",
pip_packages=[
"litellm",
],
pip_packages=[],
module="llama_stack.providers.remote.inference.groq",
config_class="llama_stack.providers.remote.inference.groq.GroqConfig",
provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator",
@ -245,7 +238,7 @@ Available Models:
api=Api.inference,
adapter_type="llama-openai-compat",
provider_type="remote::llama-openai-compat",
pip_packages=["litellm"],
pip_packages=[],
module="llama_stack.providers.remote.inference.llama_openai_compat",
config_class="llama_stack.providers.remote.inference.llama_openai_compat.config.LlamaCompatConfig",
provider_data_validator="llama_stack.providers.remote.inference.llama_openai_compat.config.LlamaProviderDataValidator",
@ -255,9 +248,7 @@ Available Models:
api=Api.inference,
adapter_type="sambanova",
provider_type="remote::sambanova",
pip_packages=[
"litellm",
],
pip_packages=[],
module="llama_stack.providers.remote.inference.sambanova",
config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig",
provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator",
@ -287,7 +278,7 @@ Available Models:
api=Api.inference,
provider_type="remote::azure",
adapter_type="azure",
pip_packages=["litellm"],
pip_packages=[],
module="llama_stack.providers.remote.inference.azure",
config_class="llama_stack.providers.remote.inference.azure.AzureConfig",
provider_data_validator="llama_stack.providers.remote.inference.azure.config.AzureProviderDataValidator",

View file

@ -10,6 +10,6 @@ from .config import AnthropicConfig
async def get_adapter_impl(config: AnthropicConfig, _deps):
from .anthropic import AnthropicInferenceAdapter
impl = AnthropicInferenceAdapter(config)
impl = AnthropicInferenceAdapter(config=config)
await impl.initialize()
return impl

View file

@ -4,13 +4,19 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from collections.abc import Iterable
from anthropic import AsyncAnthropic
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import AnthropicConfig
class AnthropicInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
class AnthropicInferenceAdapter(OpenAIMixin):
config: AnthropicConfig
provider_data_api_key_field: str = "anthropic_api_key"
# source: https://docs.claude.com/en/docs/build-with-claude/embeddings
# TODO: add support for voyageai, which is where these models are hosted
# embedding_model_metadata = {
@ -23,22 +29,11 @@ class AnthropicInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
# "voyage-multimodal-3": {"embedding_dimension": 1024, "context_length": 32000},
# }
def __init__(self, config: AnthropicConfig) -> None:
LiteLLMOpenAIMixin.__init__(
self,
litellm_provider_name="anthropic",
api_key_from_config=config.api_key,
provider_data_api_key_field="anthropic_api_key",
)
self.config = config
async def initialize(self) -> None:
await super().initialize()
async def shutdown(self) -> None:
await super().shutdown()
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_api_key(self) -> str:
return self.config.api_key or ""
def get_base_url(self):
return "https://api.anthropic.com/v1"
async def list_provider_model_ids(self) -> Iterable[str]:
return [m.id async for m in AsyncAnthropic(api_key=self.get_api_key()).models.list()]
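
The Anthropic change above is the template for most of the adapter rewrites that follow: the LiteLLM-based __init__ is replaced by declarative fields on OpenAIMixin plus get_api_key()/get_base_url() overrides. A condensed sketch of that shape, assuming OpenAIMixin behaves as a pydantic model (as the field declarations and ConfigDict usage elsewhere in this diff suggest); "Acme" and its endpoint are made up:

```python
# Sketch of the post-refactor adapter shape; "Acme" is a hypothetical provider.
from pydantic import BaseModel, SecretStr

from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin


class AcmeConfig(BaseModel):
    api_key: SecretStr | None = None


class AcmeInferenceAdapter(OpenAIMixin):
    config: AcmeConfig                                    # set via AcmeInferenceAdapter(config=...)
    provider_data_api_key_field: str = "acme_api_key"     # per-request key from provider data

    def get_api_key(self) -> str:
        return self.config.api_key.get_secret_value() if self.config.api_key else ""

    def get_base_url(self) -> str:
        return "https://api.acme.example/v1"              # placeholder endpoint
```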

View file

@ -10,6 +10,6 @@ from .config import AzureConfig
async def get_adapter_impl(config: AzureConfig, _deps):
from .azure import AzureInferenceAdapter
impl = AzureInferenceAdapter(config)
impl = AzureInferenceAdapter(config=config)
await impl.initialize()
return impl

View file

@ -4,31 +4,20 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
from urllib.parse import urljoin
from llama_stack.apis.inference import ChatCompletionRequest
from llama_stack.providers.utils.inference.litellm_openai_mixin import (
LiteLLMOpenAIMixin,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import AzureConfig
class AzureInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
def __init__(self, config: AzureConfig) -> None:
LiteLLMOpenAIMixin.__init__(
self,
litellm_provider_name="azure",
api_key_from_config=config.api_key.get_secret_value(),
provider_data_api_key_field="azure_api_key",
openai_compat_api_base=str(config.api_base),
)
self.config = config
class AzureInferenceAdapter(OpenAIMixin):
config: AzureConfig
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
provider_data_api_key_field: str = "azure_api_key"
def get_api_key(self) -> str:
return self.config.api_key.get_secret_value()
def get_base_url(self) -> str:
"""
@ -37,26 +26,3 @@ class AzureInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
Returns the Azure API base URL from the configuration.
"""
return urljoin(str(self.config.api_base), "/openai/v1")
async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]:
# Get base parameters from parent
params = await super()._get_params(request)
# Add Azure specific parameters
provider_data = self.get_request_provider_data()
if provider_data:
if getattr(provider_data, "azure_api_key", None):
params["api_key"] = provider_data.azure_api_key
if getattr(provider_data, "azure_api_base", None):
params["api_base"] = provider_data.azure_api_base
if getattr(provider_data, "azure_api_version", None):
params["api_version"] = provider_data.azure_api_version
if getattr(provider_data, "azure_api_type", None):
params["api_type"] = provider_data.azure_api_type
else:
params["api_key"] = self.config.api_key.get_secret_value()
params["api_base"] = str(self.config.api_base)
params["api_version"] = self.config.api_version
params["api_type"] = self.config.api_type
return params
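
One detail worth noting in the Azure get_base_url() above (a worked example of standard-library behaviour, not something taken from this diff): because the second argument to urljoin starts with "/", any path already present on api_base is replaced rather than appended:

```python
from urllib.parse import urljoin

# A leading "/" in the second argument replaces the base path entirely.
print(urljoin("https://myresource.openai.azure.com/", "/openai/v1"))
# https://myresource.openai.azure.com/openai/v1
print(urljoin("https://myresource.openai.azure.com/some/path", "/openai/v1"))
# https://myresource.openai.azure.com/openai/v1
```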

View file

@ -12,7 +12,7 @@ async def get_adapter_impl(config: CerebrasImplConfig, _deps):
assert isinstance(config, CerebrasImplConfig), f"Unexpected config type: {type(config)}"
impl = CerebrasInferenceAdapter(config)
impl = CerebrasInferenceAdapter(config=config)
await impl.initialize()

View file

@ -6,39 +6,14 @@
from urllib.parse import urljoin
from cerebras.cloud.sdk import AsyncCerebras
from llama_stack.apis.inference import (
ChatCompletionRequest,
CompletionRequest,
Inference,
OpenAIEmbeddingsResponse,
TopKSamplingStrategy,
)
from llama_stack.providers.utils.inference.openai_compat import (
get_sampling_options,
)
from llama_stack.apis.inference import OpenAIEmbeddingsResponse
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
completion_request_to_prompt,
)
from .config import CerebrasImplConfig
class CerebrasInferenceAdapter(
OpenAIMixin,
Inference,
):
def __init__(self, config: CerebrasImplConfig) -> None:
self.config = config
# TODO: make this use provider data, etc. like other providers
self._cerebras_client = AsyncCerebras(
base_url=self.config.base_url,
api_key=self.config.api_key.get_secret_value(),
)
class CerebrasInferenceAdapter(OpenAIMixin):
config: CerebrasImplConfig
def get_api_key(self) -> str:
return self.config.api_key.get_secret_value()
@ -46,31 +21,6 @@ class CerebrasInferenceAdapter(
def get_base_url(self) -> str:
return urljoin(self.config.base_url, "v1")
async def initialize(self) -> None:
return
async def shutdown(self) -> None:
pass
async def _get_params(self, request: ChatCompletionRequest | CompletionRequest) -> dict:
if request.sampling_params and isinstance(request.sampling_params.strategy, TopKSamplingStrategy):
raise ValueError("`top_k` not supported by Cerebras")
prompt = ""
if isinstance(request, ChatCompletionRequest):
prompt = await chat_completion_request_to_prompt(request, self.get_llama_model(request.model))
elif isinstance(request, CompletionRequest):
prompt = await completion_request_to_prompt(request)
else:
raise ValueError(f"Unknown request type {type(request)}")
return {
"model": request.model,
"prompt": prompt,
"stream": request.stream,
**get_sampling_options(request.sampling_params),
}
async def openai_embeddings(
self,
model: str,

View file

@ -22,7 +22,7 @@ class CerebrasImplConfig(RemoteInferenceProviderConfig):
description="Base URL for the Cerebras API",
)
api_key: SecretStr = Field(
default=SecretStr(os.environ.get("CEREBRAS_API_KEY")),
default=SecretStr(os.environ.get("CEREBRAS_API_KEY")), # type: ignore[arg-type]
description="Cerebras API Key",
)

View file

@ -11,6 +11,6 @@ async def get_adapter_impl(config: DatabricksImplConfig, _deps):
from .databricks import DatabricksInferenceAdapter
assert isinstance(config, DatabricksImplConfig), f"Unexpected config type: {type(config)}"
impl = DatabricksInferenceAdapter(config)
impl = DatabricksInferenceAdapter(config=config)
await impl.initialize()
return impl

View file

@ -14,12 +14,12 @@ from llama_stack.schema_utils import json_schema_type
@json_schema_type
class DatabricksImplConfig(RemoteInferenceProviderConfig):
url: str = Field(
url: str | None = Field(
default=None,
description="The URL for the Databricks model serving endpoint",
)
api_token: SecretStr = Field(
default=SecretStr(None),
default=SecretStr(None), # type: ignore[arg-type]
description="The Databricks API token",
)

View file

@ -4,16 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from collections.abc import Iterable
from typing import Any
from databricks.sdk import WorkspaceClient
from llama_stack.apis.inference import (
Inference,
Model,
OpenAICompletion,
)
from llama_stack.apis.models import ModelType
from llama_stack.apis.inference import OpenAICompletion
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
@ -22,30 +18,31 @@ from .config import DatabricksImplConfig
logger = get_logger(name=__name__, category="inference::databricks")
class DatabricksInferenceAdapter(
OpenAIMixin,
Inference,
):
class DatabricksInferenceAdapter(OpenAIMixin):
config: DatabricksImplConfig
# source: https://docs.databricks.com/aws/en/machine-learning/foundation-model-apis/supported-models
embedding_model_metadata = {
embedding_model_metadata: dict[str, dict[str, int]] = {
"databricks-gte-large-en": {"embedding_dimension": 1024, "context_length": 8192},
"databricks-bge-large-en": {"embedding_dimension": 1024, "context_length": 512},
}
def __init__(self, config: DatabricksImplConfig) -> None:
self.config = config
def get_api_key(self) -> str:
return self.config.api_token.get_secret_value()
def get_base_url(self) -> str:
return f"{self.config.url}/serving-endpoints"
async def initialize(self) -> None:
return
async def list_provider_model_ids(self) -> Iterable[str]:
return [
endpoint.name
for endpoint in WorkspaceClient(
host=self.config.url, token=self.get_api_key()
).serving_endpoints.list() # TODO: this is not async
]
async def shutdown(self) -> None:
pass
async def should_refresh_models(self) -> bool:
return False
async def openai_completion(
self,
@ -71,32 +68,3 @@ class DatabricksInferenceAdapter(
suffix: str | None = None,
) -> OpenAICompletion:
raise NotImplementedError()
async def list_models(self) -> list[Model] | None:
self._model_cache = {} # from OpenAIMixin
ws_client = WorkspaceClient(host=self.config.url, token=self.get_api_key()) # TODO: this is not async
endpoints = ws_client.serving_endpoints.list()
for endpoint in endpoints:
model = Model(
provider_id=self.__provider_id__,
provider_resource_id=endpoint.name,
identifier=endpoint.name,
)
if endpoint.task == "llm/v1/chat":
model.model_type = ModelType.llm # this is redundant, but informative
elif endpoint.task == "llm/v1/embeddings":
if endpoint.name not in self.embedding_model_metadata:
logger.warning(f"No metadata information available for embedding model {endpoint.name}, skipping.")
continue
model.model_type = ModelType.embedding
model.metadata = self.embedding_model_metadata[endpoint.name]
else:
logger.warning(f"Unknown model type, skipping: {endpoint}")
continue
self._model_cache[endpoint.name] = model
return list(self._model_cache.values())
async def should_refresh_models(self) -> bool:
return False

View file

@ -17,6 +17,6 @@ async def get_adapter_impl(config: FireworksImplConfig, _deps):
from .fireworks import FireworksInferenceAdapter
assert isinstance(config, FireworksImplConfig), f"Unexpected config type: {type(config)}"
impl = FireworksInferenceAdapter(config)
impl = FireworksInferenceAdapter(config=config)
await impl.initialize()
return impl

View file

@ -5,124 +5,26 @@
# the root directory of this source tree.
from fireworks.client import Fireworks
from llama_stack.apis.inference import (
ChatCompletionRequest,
Inference,
LogProbConfig,
ResponseFormat,
ResponseFormatType,
SamplingParams,
)
from llama_stack.core.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
)
from llama_stack.providers.utils.inference.openai_compat import (
convert_message_to_openai_dict,
get_sampling_options,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
request_has_media,
)
from .config import FireworksImplConfig
logger = get_logger(name=__name__, category="inference::fireworks")
class FireworksInferenceAdapter(OpenAIMixin, Inference, NeedsRequestProviderData):
embedding_model_metadata = {
class FireworksInferenceAdapter(OpenAIMixin):
config: FireworksImplConfig
embedding_model_metadata: dict[str, dict[str, int]] = {
"nomic-ai/nomic-embed-text-v1.5": {"embedding_dimension": 768, "context_length": 8192},
"accounts/fireworks/models/qwen3-embedding-8b": {"embedding_dimension": 4096, "context_length": 40960},
}
def __init__(self, config: FireworksImplConfig) -> None:
ModelRegistryHelper.__init__(self)
self.config = config
self.allowed_models = config.allowed_models
async def initialize(self) -> None:
pass
async def shutdown(self) -> None:
pass
provider_data_api_key_field: str = "fireworks_api_key"
def get_api_key(self) -> str:
config_api_key = self.config.api_key.get_secret_value() if self.config.api_key else None
if config_api_key:
return config_api_key
else:
provider_data = self.get_request_provider_data()
if provider_data is None or not provider_data.fireworks_api_key:
raise ValueError(
'Pass Fireworks API Key in the header X-LlamaStack-Provider-Data as { "fireworks_api_key": <your api key>}'
)
return provider_data.fireworks_api_key
return self.config.api_key.get_secret_value() if self.config.api_key else None # type: ignore[return-value]
def get_base_url(self) -> str:
return "https://api.fireworks.ai/inference/v1"
def _get_client(self) -> Fireworks:
fireworks_api_key = self.get_api_key()
return Fireworks(api_key=fireworks_api_key)
def _build_options(
self,
sampling_params: SamplingParams | None,
fmt: ResponseFormat | None,
logprobs: LogProbConfig | None,
) -> dict:
options = get_sampling_options(sampling_params)
options.setdefault("max_tokens", 512)
if fmt:
if fmt.type == ResponseFormatType.json_schema.value:
options["response_format"] = {
"type": "json_object",
"schema": fmt.json_schema,
}
elif fmt.type == ResponseFormatType.grammar.value:
options["response_format"] = {
"type": "grammar",
"grammar": fmt.bnf,
}
else:
raise ValueError(f"Unknown response format {fmt.type}")
if logprobs and logprobs.top_k:
options["logprobs"] = logprobs.top_k
if options["logprobs"] <= 0 or options["logprobs"] >= 5:
raise ValueError("Required range: 0 < top_k < 5")
return options
async def _get_params(self, request: ChatCompletionRequest) -> dict:
input_dict = {}
media_present = request_has_media(request)
llama_model = self.get_llama_model(request.model)
# TODO: tools are never added to the request, so we need to add them here
if media_present or not llama_model:
input_dict["messages"] = [await convert_message_to_openai_dict(m, download=True) for m in request.messages]
else:
input_dict["prompt"] = await chat_completion_request_to_prompt(request, llama_model)
# Fireworks always prepends with BOS
if "prompt" in input_dict:
if input_dict["prompt"].startswith("<|begin_of_text|>"):
input_dict["prompt"] = input_dict["prompt"][len("<|begin_of_text|>") :]
params = {
"model": request.model,
**input_dict,
"stream": bool(request.stream),
**self._build_options(request.sampling_params, request.response_format, request.logprobs),
}
logger.debug(f"params to fireworks: {params}")
return params
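
The provider_data_api_key_field declared above takes over the hand-rolled header check deleted in this hunk. A hedged sketch of how a caller would still supply a per-request key; the header name and JSON shape come from the removed error message, while the base_url and model id are placeholders:

```python
# Sketch: supplying a per-request Fireworks key via the provider-data header.
# base_url and model are placeholders; "fw-..." stands in for a real key.
import json

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1/openai/v1",  # placeholder Llama Stack endpoint
    api_key="none",
    default_headers={
        "X-LlamaStack-Provider-Data": json.dumps({"fireworks_api_key": "fw-..."}),
    },
)
reply = client.chat.completions.create(
    model="accounts/fireworks/models/llama-v3p1-8b-instruct",  # placeholder model id
    messages=[{"role": "user", "content": "Hello"}],
)
```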

View file

@ -10,6 +10,6 @@ from .config import GeminiConfig
async def get_adapter_impl(config: GeminiConfig, _deps):
from .gemini import GeminiInferenceAdapter
impl = GeminiInferenceAdapter(config)
impl = GeminiInferenceAdapter(config=config)
await impl.initialize()
return impl

View file

@ -4,33 +4,21 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import GeminiConfig
class GeminiInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
embedding_model_metadata = {
class GeminiInferenceAdapter(OpenAIMixin):
config: GeminiConfig
provider_data_api_key_field: str = "gemini_api_key"
embedding_model_metadata: dict[str, dict[str, int]] = {
"text-embedding-004": {"embedding_dimension": 768, "context_length": 2048},
}
def __init__(self, config: GeminiConfig) -> None:
LiteLLMOpenAIMixin.__init__(
self,
litellm_provider_name="gemini",
api_key_from_config=config.api_key,
provider_data_api_key_field="gemini_api_key",
)
self.config = config
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_api_key(self) -> str:
return self.config.api_key or ""
def get_base_url(self):
return "https://generativelanguage.googleapis.com/v1beta/openai/"
async def initialize(self) -> None:
await super().initialize()
async def shutdown(self) -> None:
await super().shutdown()

View file

@ -11,5 +11,5 @@ async def get_adapter_impl(config: GroqConfig, _deps):
# import dynamically so the import is used only when it is needed
from .groq import GroqInferenceAdapter
adapter = GroqInferenceAdapter(config)
adapter = GroqInferenceAdapter(config=config)
return adapter

View file

@ -6,30 +6,16 @@
from llama_stack.providers.remote.inference.groq.config import GroqConfig
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
class GroqInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
_config: GroqConfig
class GroqInferenceAdapter(OpenAIMixin):
config: GroqConfig
def __init__(self, config: GroqConfig):
LiteLLMOpenAIMixin.__init__(
self,
litellm_provider_name="groq",
api_key_from_config=config.api_key,
provider_data_api_key_field="groq_api_key",
)
self.config = config
provider_data_api_key_field: str = "groq_api_key"
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_api_key(self) -> str:
return self.config.api_key or ""
def get_base_url(self) -> str:
return f"{self.config.url}/openai/v1"
async def initialize(self):
await super().initialize()
async def shutdown(self):
await super().shutdown()

View file

@ -4,14 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.inference import InferenceProvider
from .config import LlamaCompatConfig
async def get_adapter_impl(config: LlamaCompatConfig, _deps) -> InferenceProvider:
async def get_adapter_impl(config: LlamaCompatConfig, _deps):
# import dynamically so the import is used only when it is needed
from .llama import LlamaCompatInferenceAdapter
adapter = LlamaCompatInferenceAdapter(config)
adapter = LlamaCompatInferenceAdapter(config=config)
return adapter

View file

@ -3,40 +3,26 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
from llama_stack.apis.inference.inference import OpenAICompletion, OpenAIEmbeddingsResponse
from llama_stack.log import get_logger
from llama_stack.providers.remote.inference.llama_openai_compat.config import LlamaCompatConfig
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
logger = get_logger(name=__name__, category="inference::llama_openai_compat")
class LlamaCompatInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
class LlamaCompatInferenceAdapter(OpenAIMixin):
config: LlamaCompatConfig
provider_data_api_key_field: str = "llama_api_key"
"""
Llama API Inference Adapter for Llama Stack.
Note: The inheritance order is important here. OpenAIMixin must come before
LiteLLMOpenAIMixin to ensure that OpenAIMixin.check_model_availability()
is used instead of ModelRegistryHelper.check_model_availability().
- OpenAIMixin.check_model_availability() queries the Llama API to check if a model exists
- ModelRegistryHelper.check_model_availability() (inherited by LiteLLMOpenAIMixin) just returns False and shows a warning
"""
_config: LlamaCompatConfig
def __init__(self, config: LlamaCompatConfig):
LiteLLMOpenAIMixin.__init__(
self,
litellm_provider_name="meta_llama",
api_key_from_config=config.api_key,
provider_data_api_key_field="llama_api_key",
openai_compat_api_base=config.openai_compat_api_base,
)
self.config = config
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_api_key(self) -> str:
return self.config.api_key or ""
def get_base_url(self) -> str:
"""
@ -46,8 +32,37 @@ class LlamaCompatInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
"""
return self.config.openai_compat_api_base
async def initialize(self):
await super().initialize()
async def openai_completion(
self,
model: str,
prompt: str | list[str] | list[int] | list[list[int]],
best_of: int | None = None,
echo: bool | None = None,
frequency_penalty: float | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_tokens: int | None = None,
n: int | None = None,
presence_penalty: float | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
top_p: float | None = None,
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
) -> OpenAICompletion:
raise NotImplementedError()
async def shutdown(self):
await super().shutdown()
async def openai_embeddings(
self,
model: str,
input: str | list[str],
encoding_format: str | None = "float",
dimensions: int | None = None,
user: str | None = None,
) -> OpenAIEmbeddingsResponse:
raise NotImplementedError()

View file

@ -15,7 +15,8 @@ async def get_adapter_impl(config: NVIDIAConfig, _deps) -> Inference:
if not isinstance(config, NVIDIAConfig):
raise RuntimeError(f"Unexpected config type: {type(config)}")
adapter = NVIDIAInferenceAdapter(config)
adapter = NVIDIAInferenceAdapter(config=config)
await adapter.initialize()
return adapter

View file

@ -8,7 +8,6 @@
from openai import NOT_GIVEN
from llama_stack.apis.inference import (
Inference,
OpenAIEmbeddingData,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
@ -22,7 +21,9 @@ from .utils import _is_nvidia_hosted
logger = get_logger(name=__name__, category="inference::nvidia")
class NVIDIAInferenceAdapter(OpenAIMixin, Inference):
class NVIDIAInferenceAdapter(OpenAIMixin):
config: NVIDIAConfig
"""
NVIDIA Inference Adapter for Llama Stack.
@ -37,32 +38,21 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference):
"""
# source: https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html
embedding_model_metadata = {
embedding_model_metadata: dict[str, dict[str, int]] = {
"nvidia/llama-3.2-nv-embedqa-1b-v2": {"embedding_dimension": 2048, "context_length": 8192},
"nvidia/nv-embedqa-e5-v5": {"embedding_dimension": 512, "context_length": 1024},
"nvidia/nv-embedqa-mistral-7b-v2": {"embedding_dimension": 512, "context_length": 4096},
"snowflake/arctic-embed-l": {"embedding_dimension": 512, "context_length": 1024},
}
def __init__(self, config: NVIDIAConfig) -> None:
logger.info(f"Initializing NVIDIAInferenceAdapter({config.url})...")
async def initialize(self) -> None:
logger.info(f"Initializing NVIDIAInferenceAdapter({self.config.url})...")
if _is_nvidia_hosted(config):
if not config.api_key:
if _is_nvidia_hosted(self.config):
if not self.config.api_key:
raise RuntimeError(
"API key is required for hosted NVIDIA NIM. Either provide an API key or use a self-hosted NIM."
)
# elif self._config.api_key:
#
# we don't raise this warning because a user may have deployed their
# self-hosted NIM with an API key requirement.
#
# warnings.warn(
# "API key is not required for self-hosted NVIDIA NIM. "
# "Consider removing the api_key from the configuration."
# )
self._config = config
def get_api_key(self) -> str:
"""
@ -70,7 +60,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference):
:return: The NVIDIA API key
"""
return self._config.api_key.get_secret_value() if self._config.api_key else "NO KEY"
return self.config.api_key.get_secret_value() if self.config.api_key else "NO KEY"
def get_base_url(self) -> str:
"""
@ -78,7 +68,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference):
:return: The NVIDIA API base URL
"""
return f"{self._config.url}/v1" if self._config.append_api_version else self._config.url
return f"{self.config.url}/v1" if self.config.append_api_version else self.config.url
async def openai_embeddings(
self,

View file

@ -10,6 +10,6 @@ from .config import OllamaImplConfig
async def get_adapter_impl(config: OllamaImplConfig, _deps):
from .ollama import OllamaInferenceAdapter
impl = OllamaInferenceAdapter(config)
impl = OllamaInferenceAdapter(config=config)
await impl.initialize()
return impl

View file

@ -6,58 +6,29 @@
import asyncio
from typing import Any
from ollama import AsyncClient as AsyncOllamaClient
from llama_stack.apis.common.content_types import (
ImageContentItem,
TextContentItem,
)
from llama_stack.apis.common.errors import UnsupportedModelError
from llama_stack.apis.inference import (
ChatCompletionRequest,
GrammarResponseFormat,
InferenceProvider,
JsonSchemaResponseFormat,
Message,
)
from llama_stack.apis.models import Model
from llama_stack.log import get_logger
from llama_stack.models.llama.sku_types import CoreModelId
from llama_stack.providers.datatypes import (
HealthResponse,
HealthStatus,
ModelsProtocolPrivate,
)
from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig
from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
get_sampling_options,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
convert_image_content_to_url,
request_has_media,
)
logger = get_logger(name=__name__, category="inference::ollama")
class OllamaInferenceAdapter(
OpenAIMixin,
ModelRegistryHelper,
InferenceProvider,
ModelsProtocolPrivate,
):
class OllamaInferenceAdapter(OpenAIMixin):
config: OllamaImplConfig
# automatically set by the resolver when instantiating the provider
__provider_id__: str
embedding_model_metadata = {
embedding_model_metadata: dict[str, dict[str, int]] = {
"all-minilm:l6-v2": {
"embedding_dimension": 384,
"context_length": 512,
@ -76,29 +47,8 @@ class OllamaInferenceAdapter(
},
}
def __init__(self, config: OllamaImplConfig) -> None:
# TODO: remove ModelRegistryHelper.__init__ when completion and
# chat_completion are. this exists to satisfy the input /
# output processing for llama models. specifically,
# tool_calling is handled by raw template processing,
# instead of using the /api/chat endpoint w/ tools=...
ModelRegistryHelper.__init__(
self,
model_entries=[
build_hf_repo_model_entry(
"llama3.2:3b-instruct-fp16",
CoreModelId.llama3_2_3b_instruct.value,
),
build_hf_repo_model_entry(
"llama-guard3:1b",
CoreModelId.llama_guard_3_1b.value,
),
],
)
self.config = config
# Ollama does not support image urls, so we need to download the image and convert it to base64
self.download_images = True
self._clients: dict[asyncio.AbstractEventLoop, AsyncOllamaClient] = {}
download_images: bool = True
_clients: dict[asyncio.AbstractEventLoop, AsyncOllamaClient] = {}
@property
def ollama_client(self) -> AsyncOllamaClient:
@ -142,50 +92,6 @@ class OllamaInferenceAdapter(
async def shutdown(self) -> None:
self._clients.clear()
async def _get_model(self, model_id: str) -> Model:
if not self.model_store:
raise ValueError("Model store not set")
return await self.model_store.get_model(model_id)
async def _get_params(self, request: ChatCompletionRequest) -> dict:
sampling_options = get_sampling_options(request.sampling_params)
# This is needed since the Ollama API expects num_predict to be set
# for early truncation instead of max_tokens.
if sampling_options.get("max_tokens") is not None:
sampling_options["num_predict"] = sampling_options["max_tokens"]
input_dict: dict[str, Any] = {}
media_present = request_has_media(request)
llama_model = self.get_llama_model(request.model)
if media_present or not llama_model:
contents = [await convert_message_to_openai_dict_for_ollama(m) for m in request.messages]
# flatten the list of lists
input_dict["messages"] = [item for sublist in contents for item in sublist]
else:
input_dict["raw"] = True
input_dict["prompt"] = await chat_completion_request_to_prompt(
request,
llama_model,
)
if fmt := request.response_format:
if isinstance(fmt, JsonSchemaResponseFormat):
input_dict["format"] = fmt.json_schema
elif isinstance(fmt, GrammarResponseFormat):
raise NotImplementedError("Grammar response format is not supported")
else:
raise ValueError(f"Unknown response format type: {fmt.type}")
params = {
"model": request.model,
**input_dict,
"options": sampling_options,
"stream": request.stream,
}
logger.debug(f"params to ollama: {params}")
return params
async def register_model(self, model: Model) -> Model:
if await self.check_model_availability(model.provider_model_id):
return model
@ -197,24 +103,3 @@ class OllamaInferenceAdapter(
return model
raise UnsupportedModelError(model.provider_model_id, list(self._model_cache.keys()))
async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]:
async def _convert_content(content) -> dict:
if isinstance(content, ImageContentItem):
return {
"role": message.role,
"images": [await convert_image_content_to_url(content, download=True, include_format=False)],
}
else:
text = content.text if isinstance(content, TextContentItem) else content
assert isinstance(text, str)
return {
"role": message.role,
"content": text,
}
if isinstance(message.content, list):
return [await _convert_content(c) for c in message.content]
else:
return [await _convert_content(message.content)]

View file

@ -10,6 +10,6 @@ from .config import OpenAIConfig
async def get_adapter_impl(config: OpenAIConfig, _deps):
from .openai import OpenAIInferenceAdapter
impl = OpenAIInferenceAdapter(config)
impl = OpenAIInferenceAdapter(config=config)
await impl.initialize()
return impl

View file

@ -5,7 +5,6 @@
# the root directory of this source tree.
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import OpenAIConfig
@ -14,52 +13,24 @@ logger = get_logger(name=__name__, category="inference::openai")
#
# This OpenAI adapter implements Inference methods using two mixins -
# This OpenAI adapter implements Inference methods using OpenAIMixin
#
# | Inference Method | Implementation Source |
# |----------------------------|--------------------------|
# | completion | LiteLLMOpenAIMixin |
# | chat_completion | LiteLLMOpenAIMixin |
# | embedding | LiteLLMOpenAIMixin |
# | openai_completion | OpenAIMixin |
# | openai_chat_completion | OpenAIMixin |
# | openai_embeddings | OpenAIMixin |
#
class OpenAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
class OpenAIInferenceAdapter(OpenAIMixin):
"""
OpenAI Inference Adapter for Llama Stack.
Note: The inheritance order is important here. OpenAIMixin must come before
LiteLLMOpenAIMixin to ensure that OpenAIMixin.check_model_availability()
is used instead of ModelRegistryHelper.check_model_availability().
- OpenAIMixin.check_model_availability() queries the OpenAI API to check if a model exists
- ModelRegistryHelper.check_model_availability() (inherited by LiteLLMOpenAIMixin) just returns False and shows a warning
"""
embedding_model_metadata = {
config: OpenAIConfig
provider_data_api_key_field: str = "openai_api_key"
embedding_model_metadata: dict[str, dict[str, int]] = {
"text-embedding-3-small": {"embedding_dimension": 1536, "context_length": 8192},
"text-embedding-3-large": {"embedding_dimension": 3072, "context_length": 8192},
}
def __init__(self, config: OpenAIConfig) -> None:
LiteLLMOpenAIMixin.__init__(
self,
litellm_provider_name="openai",
api_key_from_config=config.api_key,
provider_data_api_key_field="openai_api_key",
)
self.config = config
# we set is_openai_compat so users can use the canonical
# openai model names like "gpt-4" or "gpt-3.5-turbo"
# and the model name will be translated to litellm's
# "openai/gpt-4" or "openai/gpt-3.5-turbo" transparently.
# if we do not set this, users will be exposed to the
# litellm specific model names, an abstraction leak.
self.is_openai_compat = True
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_api_key(self) -> str:
return self.config.api_key or ""
def get_base_url(self) -> str:
"""
@ -68,9 +39,3 @@ class OpenAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
Returns the OpenAI API base URL from the configuration.
"""
return self.config.base_url
async def initialize(self) -> None:
await super().initialize()
async def shutdown(self) -> None:
await super().shutdown()

View file

@ -31,12 +31,6 @@ class PassthroughInferenceAdapter(Inference):
ModelRegistryHelper.__init__(self)
self.config = config
async def initialize(self) -> None:
pass
async def shutdown(self) -> None:
pass
async def unregister_model(self, model_id: str) -> None:
pass

View file

@ -53,12 +53,6 @@ class RunpodInferenceAdapter(
ModelRegistryHelper.__init__(self, stack_to_provider_models_map=RUNPOD_SUPPORTED_MODELS)
self.config = config
async def initialize(self) -> None:
return
async def shutdown(self) -> None:
pass
def _get_params(self, request: ChatCompletionRequest) -> dict:
return {
"model": self.map_to_provider_model(request.model),

View file

@ -11,6 +11,6 @@ async def get_adapter_impl(config: SambaNovaImplConfig, _deps):
from .sambanova import SambaNovaInferenceAdapter
assert isinstance(config, SambaNovaImplConfig), f"Unexpected config type: {type(config)}"
impl = SambaNovaInferenceAdapter(config)
impl = SambaNovaInferenceAdapter(config=config)
await impl.initialize()
return impl

View file

@ -5,39 +5,22 @@
# the root directory of this source tree.
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import SambaNovaImplConfig
class SambaNovaInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
class SambaNovaInferenceAdapter(OpenAIMixin):
config: SambaNovaImplConfig
provider_data_api_key_field: str = "sambanova_api_key"
    download_images: bool = True  # SambaNova does not support image downloads server-side, perform them on the client
"""
SambaNova Inference Adapter for Llama Stack.
Note: The inheritance order is important here. OpenAIMixin must come before
LiteLLMOpenAIMixin to ensure that OpenAIMixin.check_model_availability()
is used instead of LiteLLMOpenAIMixin.check_model_availability().
- OpenAIMixin.check_model_availability() queries the /v1/models to check if a model exists
- LiteLLMOpenAIMixin.check_model_availability() checks the static registry within LiteLLM
"""
def __init__(self, config: SambaNovaImplConfig):
self.config = config
self.environment_available_models: list[str] = []
LiteLLMOpenAIMixin.__init__(
self,
litellm_provider_name="sambanova",
api_key_from_config=self.config.api_key.get_secret_value() if self.config.api_key else None,
provider_data_api_key_field="sambanova_api_key",
openai_compat_api_base=self.config.url,
download_images=True, # SambaNova requires base64 image encoding
json_schema_strict=False, # SambaNova doesn't support strict=True yet
)
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_api_key(self) -> str:
return self.config.api_key.get_secret_value() if self.config.api_key else ""
def get_base_url(self) -> str:
"""

View file

@ -5,53 +5,21 @@
# the root directory of this source tree.
from collections.abc import Iterable
from huggingface_hub import AsyncInferenceClient, HfApi
from pydantic import SecretStr
from llama_stack.apis.inference import (
ChatCompletionRequest,
Inference,
OpenAIEmbeddingsResponse,
ResponseFormat,
ResponseFormatType,
SamplingParams,
)
from llama_stack.apis.models import Model
from llama_stack.apis.models.models import ModelType
from llama_stack.apis.inference import OpenAIEmbeddingsResponse
from llama_stack.log import get_logger
from llama_stack.models.llama.sku_list import all_registered_models
from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
get_sampling_options,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_model_input_info,
)
from .config import InferenceAPIImplConfig, InferenceEndpointImplConfig, TGIImplConfig
log = get_logger(name=__name__, category="inference::tgi")
def build_hf_repo_model_entries():
return [
build_hf_repo_model_entry(
model.huggingface_repo,
model.descriptor(),
)
for model in all_registered_models()
if model.huggingface_repo
]
class _HfAdapter(
OpenAIMixin,
Inference,
):
class _HfAdapter(OpenAIMixin):
url: str
api_key: SecretStr
@ -61,90 +29,14 @@ class _HfAdapter(
overwrite_completion_id = True # TGI always returns id=""
def __init__(self) -> None:
self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
self.huggingface_repo_to_llama_model_id = {
model.huggingface_repo: model.descriptor() for model in all_registered_models() if model.huggingface_repo
}
def get_api_key(self):
return self.api_key.get_secret_value()
def get_base_url(self):
return self.url
async def shutdown(self) -> None:
pass
async def list_models(self) -> list[Model] | None:
models = []
async for model in self.client.models.list():
models.append(
Model(
identifier=model.id,
provider_resource_id=model.id,
provider_id=self.__provider_id__,
metadata={},
model_type=ModelType.llm,
)
)
return models
async def register_model(self, model: Model) -> Model:
if model.provider_resource_id != self.model_id:
raise ValueError(
f"Model {model.provider_resource_id} does not match the model {self.model_id} served by TGI."
)
return model
async def unregister_model(self, model_id: str) -> None:
pass
def _get_max_new_tokens(self, sampling_params, input_tokens):
return min(
sampling_params.max_tokens or (self.max_tokens - input_tokens),
self.max_tokens - input_tokens - 1,
)
def _build_options(
self,
sampling_params: SamplingParams | None = None,
fmt: ResponseFormat = None,
):
options = get_sampling_options(sampling_params)
# TGI does not support temperature=0 when using greedy sampling
# We set it to 1e-3 instead, anything lower outputs garbage from TGI
# We can use top_p sampling strategy to specify lower temperature
if abs(options["temperature"]) < 1e-10:
options["temperature"] = 1e-3
# delete key "max_tokens" from options since its not supported by the API
options.pop("max_tokens", None)
if fmt:
if fmt.type == ResponseFormatType.json_schema.value:
options["grammar"] = {
"type": "json",
"value": fmt.json_schema,
}
elif fmt.type == ResponseFormatType.grammar.value:
raise ValueError("Grammar response format not supported yet")
else:
raise ValueError(f"Unexpected response format: {fmt.type}")
return options
async def _get_params(self, request: ChatCompletionRequest) -> dict:
prompt, input_tokens = await chat_completion_request_to_model_input_info(
request, self.register_helper.get_llama_model(request.model)
)
return dict(
prompt=prompt,
stream=request.stream,
details=True,
max_new_tokens=self._get_max_new_tokens(request.sampling_params, input_tokens),
stop_sequences=["<|eom_id|>", "<|eot_id|>"],
**self._build_options(request.sampling_params, request.response_format),
)
async def list_provider_model_ids(self) -> Iterable[str]:
return [self.model_id]
async def openai_embeddings(
self,

View file

@ -17,6 +17,6 @@ async def get_adapter_impl(config: TogetherImplConfig, _deps):
from .together import TogetherInferenceAdapter
assert isinstance(config, TogetherImplConfig), f"Unexpected config type: {type(config)}"
impl = TogetherInferenceAdapter(config)
impl = TogetherInferenceAdapter(config=config)
await impl.initialize()
return impl

View file

@ -5,41 +5,29 @@
# the root directory of this source tree.
from openai import AsyncOpenAI
from collections.abc import Iterable
from together import AsyncTogether
from together.constants import BASE_URL
from llama_stack.apis.inference import (
ChatCompletionRequest,
Inference,
LogProbConfig,
OpenAIEmbeddingsResponse,
ResponseFormat,
ResponseFormatType,
SamplingParams,
)
from llama_stack.apis.inference.inference import OpenAIEmbeddingUsage
from llama_stack.apis.models import Model, ModelType
from llama_stack.apis.models import Model
from llama_stack.core.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import (
convert_message_to_openai_dict,
get_sampling_options,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
request_has_media,
)
from .config import TogetherImplConfig
logger = get_logger(name=__name__, category="inference::together")
class TogetherInferenceAdapter(OpenAIMixin, Inference, NeedsRequestProviderData):
embedding_model_metadata = {
class TogetherInferenceAdapter(OpenAIMixin, NeedsRequestProviderData):
config: TogetherImplConfig
embedding_model_metadata: dict[str, dict[str, int]] = {
"togethercomputer/m2-bert-80M-32k-retrieval": {"embedding_dimension": 768, "context_length": 32768},
"BAAI/bge-large-en-v1.5": {"embedding_dimension": 1024, "context_length": 512},
"BAAI/bge-base-en-v1.5": {"embedding_dimension": 768, "context_length": 512},
@ -47,24 +35,16 @@ class TogetherInferenceAdapter(OpenAIMixin, Inference, NeedsRequestProviderData)
"intfloat/multilingual-e5-large-instruct": {"embedding_dimension": 1024, "context_length": 512},
}
def __init__(self, config: TogetherImplConfig) -> None:
ModelRegistryHelper.__init__(self)
self.config = config
self.allowed_models = config.allowed_models
self._model_cache: dict[str, Model] = {}
_model_cache: dict[str, Model] = {}
provider_data_api_key_field: str = "together_api_key"
def get_api_key(self):
return self.config.api_key.get_secret_value()
return self.config.api_key.get_secret_value() if self.config.api_key else None
def get_base_url(self):
return BASE_URL
async def initialize(self) -> None:
pass
async def shutdown(self) -> None:
pass
def _get_client(self) -> AsyncTogether:
together_api_key = None
config_api_key = self.config.api_key.get_secret_value() if self.config.api_key else None
@ -79,90 +59,13 @@ class TogetherInferenceAdapter(OpenAIMixin, Inference, NeedsRequestProviderData)
together_api_key = provider_data.together_api_key
return AsyncTogether(api_key=together_api_key)
def _get_openai_client(self) -> AsyncOpenAI:
together_client = self._get_client().client
return AsyncOpenAI(
base_url=together_client.base_url,
api_key=together_client.api_key,
)
def _build_options(
self,
sampling_params: SamplingParams | None,
logprobs: LogProbConfig | None,
fmt: ResponseFormat,
) -> dict:
options = get_sampling_options(sampling_params)
if fmt:
if fmt.type == ResponseFormatType.json_schema.value:
options["response_format"] = {
"type": "json_object",
"schema": fmt.json_schema,
}
elif fmt.type == ResponseFormatType.grammar.value:
raise NotImplementedError("Grammar response format not supported yet")
else:
raise ValueError(f"Unknown response format {fmt.type}")
if logprobs and logprobs.top_k:
if logprobs.top_k != 1:
raise ValueError(
f"Unsupported value: Together only supports logprobs top_k=1. {logprobs.top_k} was provided",
)
options["logprobs"] = 1
return options
async def _get_params(self, request: ChatCompletionRequest) -> dict:
input_dict = {}
media_present = request_has_media(request)
llama_model = self.get_llama_model(request.model)
if media_present or not llama_model:
input_dict["messages"] = [await convert_message_to_openai_dict(m) for m in request.messages]
else:
input_dict["prompt"] = await chat_completion_request_to_prompt(request, llama_model)
params = {
"model": request.model,
**input_dict,
"stream": request.stream,
**self._build_options(request.sampling_params, request.logprobs, request.response_format),
}
logger.debug(f"params to together: {params}")
return params
async def list_models(self) -> list[Model] | None:
self._model_cache = {}
async def list_provider_model_ids(self) -> Iterable[str]:
# Together's /v1/models is not compatible with OpenAI's /v1/models. Together support ticket #13355 -> will not fix, use Together's own client
for m in await self._get_client().models.list():
if m.type == "embedding":
if m.id not in self.embedding_model_metadata:
logger.warning(f"Unknown embedding dimension for model {m.id}, skipping.")
continue
metadata = self.embedding_model_metadata[m.id]
self._model_cache[m.id] = Model(
provider_id=self.__provider_id__,
provider_resource_id=m.id,
identifier=m.id,
model_type=ModelType.embedding,
metadata=metadata,
)
else:
self._model_cache[m.id] = Model(
provider_id=self.__provider_id__,
provider_resource_id=m.id,
identifier=m.id,
model_type=ModelType.llm,
)
return self._model_cache.values()
return [m.id for m in await self._get_client().models.list()]
async def should_refresh_models(self) -> bool:
return True
async def check_model_availability(self, model):
return model in self._model_cache
async def openai_embeddings(
self,
model: str,
@ -203,4 +106,4 @@ class TogetherInferenceAdapter(OpenAIMixin, Inference, NeedsRequestProviderData)
)
response.usage = OpenAIEmbeddingUsage(prompt_tokens=-1, total_tokens=-1)
return response
return response # type: ignore[no-any-return]

View file

@ -10,6 +10,6 @@ from .config import VertexAIConfig
async def get_adapter_impl(config: VertexAIConfig, _deps):
from .vertexai import VertexAIInferenceAdapter
impl = VertexAIInferenceAdapter(config)
impl = VertexAIInferenceAdapter(config=config)
await impl.initialize()
return impl

View file

@ -4,29 +4,19 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
import google.auth.transport.requests
from google.auth import default
from llama_stack.apis.inference import ChatCompletionRequest
from llama_stack.providers.utils.inference.litellm_openai_mixin import (
LiteLLMOpenAIMixin,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import VertexAIConfig
class VertexAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
def __init__(self, config: VertexAIConfig) -> None:
LiteLLMOpenAIMixin.__init__(
self,
litellm_provider_name="vertex_ai",
api_key_from_config=None, # Vertex AI uses ADC, not API keys
provider_data_api_key_field="vertex_project", # Use project for validation
)
self.config = config
class VertexAIInferenceAdapter(OpenAIMixin):
config: VertexAIConfig
provider_data_api_key_field: str = "vertex_project"
def get_api_key(self) -> str:
"""
@ -41,8 +31,7 @@ class VertexAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
credentials.refresh(google.auth.transport.requests.Request())
return str(credentials.token)
except Exception:
# If we can't get credentials, return empty string to let LiteLLM handle it
# This allows the LiteLLM mixin to work with ADC directly
# If we can't get credentials, return empty string to let the env work with ADC directly
return ""
def get_base_url(self) -> str:
@ -53,23 +42,3 @@ class VertexAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
Source: https://cloud.google.com/vertex-ai/generative-ai/docs/start/openai
"""
return f"https://{self.config.location}-aiplatform.googleapis.com/v1/projects/{self.config.project}/locations/{self.config.location}/endpoints/openapi"
async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]:
# Get base parameters from parent
params = await super()._get_params(request)
# Add Vertex AI specific parameters
provider_data = self.get_request_provider_data()
if provider_data:
if getattr(provider_data, "vertex_project", None):
params["vertex_project"] = provider_data.vertex_project
if getattr(provider_data, "vertex_location", None):
params["vertex_location"] = provider_data.vertex_location
else:
params["vertex_project"] = self.config.project
params["vertex_location"] = self.config.location
# Remove api_key since Vertex AI uses ADC
params.pop("api_key", None)
return params

View file

@ -17,6 +17,6 @@ async def get_adapter_impl(config: VLLMInferenceAdapterConfig, _deps):
from .vllm import VLLMInferenceAdapter
assert isinstance(config, VLLMInferenceAdapterConfig), f"Unexpected config type: {type(config)}"
impl = VLLMInferenceAdapter(config)
impl = VLLMInferenceAdapter(config=config)
await impl.initialize()
return impl

View file

@ -3,56 +3,26 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import AsyncIterator
from typing import Any
from urllib.parse import urljoin
import httpx
from openai import APIConnectionError
from openai.types.chat.chat_completion_chunk import (
ChatCompletionChunk as OpenAIChatCompletionChunk,
)
from pydantic import ConfigDict
from llama_stack.apis.common.content_types import (
TextDelta,
ToolCallDelta,
ToolCallParseStatus,
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponseEvent,
ChatCompletionResponseEventType,
ChatCompletionResponseStreamChunk,
GrammarResponseFormat,
Inference,
JsonSchemaResponseFormat,
ModelStore,
OpenAIChatCompletion,
OpenAIMessageParam,
OpenAIResponseFormatParam,
ToolChoice,
ToolDefinition,
)
from llama_stack.apis.models import Model, ModelType
from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import BuiltinTool, StopReason, ToolCall
from llama_stack.models.llama.sku_list import all_registered_models
from llama_stack.providers.datatypes import (
HealthResponse,
HealthStatus,
ModelsProtocolPrivate,
)
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
UnparseableToolCall,
convert_message_to_openai_dict,
convert_tool_call,
get_sampling_options,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
@ -61,210 +31,15 @@ from .config import VLLMInferenceAdapterConfig
log = get_logger(name=__name__, category="inference::vllm")
def build_hf_repo_model_entries():
return [
build_hf_repo_model_entry(
model.huggingface_repo,
model.descriptor(),
)
for model in all_registered_models()
if model.huggingface_repo
]
class VLLMInferenceAdapter(OpenAIMixin):
config: VLLMInferenceAdapterConfig
model_config = ConfigDict(arbitrary_types_allowed=True)
def _convert_to_vllm_tool_calls_in_response(
tool_calls,
) -> list[ToolCall]:
if not tool_calls:
return []
provider_data_api_key_field: str = "vllm_api_token"
return [
ToolCall(
call_id=call.id,
tool_name=call.function.name,
arguments=call.function.arguments,
)
for call in tool_calls
]
def _convert_to_vllm_tools_in_request(tools: list[ToolDefinition]) -> list[dict]:
compat_tools = []
for tool in tools:
# The tool.tool_name can be a str or a BuiltinTool enum. If
# it's the latter, convert to a string.
tool_name = tool.tool_name
if isinstance(tool_name, BuiltinTool):
tool_name = tool_name.value
compat_tool = {
"type": "function",
"function": {
"name": tool_name,
"description": tool.description,
"parameters": tool.input_schema
or {
"type": "object",
"properties": {},
"required": [],
},
},
}
compat_tools.append(compat_tool)
return compat_tools
def _convert_to_vllm_finish_reason(finish_reason: str) -> StopReason:
return {
"stop": StopReason.end_of_turn,
"length": StopReason.out_of_tokens,
"tool_calls": StopReason.end_of_message,
}.get(finish_reason, StopReason.end_of_turn)
def _process_vllm_chat_completion_end_of_stream(
finish_reason: str | None,
last_chunk_content: str | None,
current_event_type: ChatCompletionResponseEventType,
tool_call_bufs: dict[str, UnparseableToolCall] | None = None,
) -> list[OpenAIChatCompletionChunk]:
chunks = []
if finish_reason is not None:
stop_reason = _convert_to_vllm_finish_reason(finish_reason)
else:
stop_reason = StopReason.end_of_message
tool_call_bufs = tool_call_bufs or {}
for _index, tool_call_buf in sorted(tool_call_bufs.items()):
args_str = tool_call_buf.arguments or "{}"
try:
chunks.append(
ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=current_event_type,
delta=ToolCallDelta(
tool_call=ToolCall(
call_id=tool_call_buf.call_id,
tool_name=tool_call_buf.tool_name,
arguments=args_str,
),
parse_status=ToolCallParseStatus.succeeded,
),
)
)
)
except Exception as e:
log.warning(f"Failed to parse tool call buffer arguments: {args_str} \nError: {e}")
chunks.append(
ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
tool_call=str(tool_call_buf),
parse_status=ToolCallParseStatus.failed,
),
)
)
)
chunks.append(
ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.complete,
delta=TextDelta(text=last_chunk_content or ""),
logprobs=None,
stop_reason=stop_reason,
)
)
)
return chunks
async def _process_vllm_chat_completion_stream_response(
stream: AsyncGenerator[OpenAIChatCompletionChunk, None],
) -> AsyncGenerator:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.start,
delta=TextDelta(text=""),
)
)
event_type = ChatCompletionResponseEventType.progress
tool_call_bufs: dict[str, UnparseableToolCall] = {}
end_of_stream_processed = False
async for chunk in stream:
if not chunk.choices:
log.warning("vLLM failed to generation any completions - check the vLLM server logs for an error.")
return
choice = chunk.choices[0]
if choice.delta.tool_calls:
for delta_tool_call in choice.delta.tool_calls:
tool_call = convert_tool_call(delta_tool_call)
if delta_tool_call.index not in tool_call_bufs:
tool_call_bufs[delta_tool_call.index] = UnparseableToolCall()
tool_call_buf = tool_call_bufs[delta_tool_call.index]
tool_call_buf.tool_name += str(tool_call.tool_name)
tool_call_buf.call_id += tool_call.call_id
tool_call_buf.arguments += (
tool_call.arguments if isinstance(tool_call.arguments, str) else json.dumps(tool_call.arguments)
)
if choice.finish_reason:
chunks = _process_vllm_chat_completion_end_of_stream(
finish_reason=choice.finish_reason,
last_chunk_content=choice.delta.content,
current_event_type=event_type,
tool_call_bufs=tool_call_bufs,
)
for c in chunks:
yield c
end_of_stream_processed = True
elif not choice.delta.tool_calls:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=event_type,
delta=TextDelta(text=choice.delta.content or ""),
logprobs=None,
)
)
event_type = ChatCompletionResponseEventType.progress
if end_of_stream_processed:
return
# the stream ended without a chunk containing finish_reason - we have to generate the
# respective completion chunks manually
chunks = _process_vllm_chat_completion_end_of_stream(
finish_reason=None, last_chunk_content=None, current_event_type=event_type, tool_call_bufs=tool_call_bufs
)
for c in chunks:
yield c
class VLLMInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin, Inference, ModelsProtocolPrivate):
# automatically set by the resolver when instantiating the provider
__provider_id__: str
model_store: ModelStore | None = None
def __init__(self, config: VLLMInferenceAdapterConfig) -> None:
LiteLLMOpenAIMixin.__init__(
self,
model_entries=build_hf_repo_model_entries(),
litellm_provider_name="vllm",
api_key_from_config=config.api_token,
provider_data_api_key_field="vllm_api_token",
openai_compat_api_base=config.url,
)
self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
self.config = config
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_api_key(self) -> str:
return self.config.api_token or ""
def get_base_url(self) -> str:
"""Get the base URL from config."""
@ -282,27 +57,6 @@ class VLLMInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin, Inference, ModelsPro
# Strictly respecting the refresh_models directive
return self.config.refresh_models
async def list_models(self) -> list[Model] | None:
models = []
async for m in self.client.models.list():
model_type = ModelType.llm # unclear how to determine embedding vs. llm models
models.append(
Model(
identifier=m.id,
provider_resource_id=m.id,
provider_id=self.__provider_id__,
metadata={},
model_type=model_type,
)
)
return models
async def shutdown(self) -> None:
pass
async def unregister_model(self, model_id: str) -> None:
pass
async def health(self) -> HealthResponse:
"""
Performs a health check by verifying connectivity to the remote vLLM server.
@ -324,63 +78,9 @@ class VLLMInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin, Inference, ModelsPro
except Exception as e:
return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}")
async def _get_model(self, model_id: str) -> Model:
if not self.model_store:
raise ValueError("Model store not set")
return await self.model_store.get_model(model_id)
def get_extra_client_params(self):
return {"http_client": httpx.AsyncClient(verify=self.config.tls_verify)}
async def register_model(self, model: Model) -> Model:
try:
model = await self.register_helper.register_model(model)
except ValueError:
pass # Ignore statically unknown model, will check live listing
try:
res = self.client.models.list()
except APIConnectionError as e:
raise ValueError(
f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL."
) from e
available_models = [m.id async for m in res]
if model.provider_resource_id not in available_models:
raise ValueError(
f"Model {model.provider_resource_id} is not being served by vLLM. "
f"Available models: {', '.join(available_models)}"
)
return model
async def _get_params(self, request: ChatCompletionRequest) -> dict:
options = get_sampling_options(request.sampling_params)
if "max_tokens" not in options:
options["max_tokens"] = self.config.max_tokens
input_dict: dict[str, Any] = {}
# Only include the 'tools' param if there are any. Sending an empty list to the vLLM server can break it.
if isinstance(request, ChatCompletionRequest) and request.tools:
input_dict = {"tools": _convert_to_vllm_tools_in_request(request.tools)}
input_dict["messages"] = [await convert_message_to_openai_dict(m, download=True) for m in request.messages]
if fmt := request.response_format:
if isinstance(fmt, JsonSchemaResponseFormat):
input_dict["extra_body"] = {"guided_json": fmt.json_schema}
elif isinstance(fmt, GrammarResponseFormat):
raise NotImplementedError("Grammar response format not supported yet")
else:
raise ValueError(f"Unknown response format {fmt.type}")
if request.logprobs and request.logprobs.top_k:
input_dict["logprobs"] = request.logprobs.top_k
return {
"model": request.model,
**input_dict,
"stream": request.stream,
**options,
}
async def openai_chat_completion(
self,
model: str,

View file

@ -7,10 +7,11 @@
import base64
import uuid
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator
from collections.abc import AsyncIterator, Iterable
from typing import Any
from openai import NOT_GIVEN, AsyncOpenAI
from pydantic import BaseModel, ConfigDict
from llama_stack.apis.inference import (
Model,
@ -26,14 +27,14 @@ from llama_stack.apis.inference import (
from llama_stack.apis.models import ModelType
from llama_stack.core.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import ModelsProtocolPrivate
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
from llama_stack.providers.utils.inference.prompt_adapter import localize_image_content
logger = get_logger(name=__name__, category="providers::utils")
class OpenAIMixin(ModelsProtocolPrivate, NeedsRequestProviderData, ABC):
class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel):
"""
Mixin class that provides OpenAI-specific functionality for inference providers.
This class handles direct OpenAI API calls using the AsyncOpenAI client.
@ -42,12 +43,25 @@ class OpenAIMixin(ModelsProtocolPrivate, NeedsRequestProviderData, ABC):
- get_api_key(): Method to retrieve the API key
- get_base_url(): Method to retrieve the OpenAI-compatible API base URL
The behavior of this class can be customized by child classes in the following ways:
- overwrite_completion_id: If True, overwrites the 'id' field in OpenAI responses
- download_images: If True, downloads images and converts to base64 for providers that require it
- embedding_model_metadata: A dictionary mapping model IDs to their embedding metadata
- provider_data_api_key_field: Optional field name in provider data to look for API key
- list_provider_model_ids: Method to list available models from the provider
- get_extra_client_params: Method to provide extra parameters to the AsyncOpenAI client
Expected Dependencies:
- self.model_store: Injected by the Llama Stack distribution system at runtime.
This provides model registry functionality for looking up registered models.
The model_store is set in routing_tables/common.py during provider initialization.
"""
# Allow extra fields so the routing infra can inject model_store, __provider_id__, etc.
model_config = ConfigDict(extra="allow")
config: RemoteInferenceProviderConfig
# Allow subclasses to control whether the 'id' field in OpenAI responses
# is overwritten with a client-side generated id.
#
@ -73,9 +87,6 @@ class OpenAIMixin(ModelsProtocolPrivate, NeedsRequestProviderData, ABC):
# Optional field name in provider data to look for API key, which takes precedence
provider_data_api_key_field: str | None = None
# automatically set by the resolver when instantiating the provider
__provider_id__: str
@abstractmethod
def get_api_key(self) -> str:
"""
@ -111,6 +122,38 @@ class OpenAIMixin(ModelsProtocolPrivate, NeedsRequestProviderData, ABC):
"""
return {}
async def list_provider_model_ids(self) -> Iterable[str]:
"""
List available models from the provider.
Child classes can override this method to provide a custom implementation
for listing models. The default implementation uses the AsyncOpenAI client
to list models from the OpenAI-compatible endpoint.
:return: An iterable of model ID strings available from the provider
"""
return [m.id async for m in self.client.models.list()]
async def initialize(self) -> None:
"""
Initialize the OpenAI mixin.
This method provides a default implementation that does nothing.
Subclasses can override this method to perform initialization tasks
such as setting up clients, validating configurations, etc.
"""
pass
async def shutdown(self) -> None:
"""
Shutdown the OpenAI mixin.
This method provides a default implementation that does nothing.
Subclasses can override this method to perform cleanup tasks
such as closing connections, releasing resources, etc.
"""
pass
@property
def client(self) -> AsyncOpenAI:
"""
@ -371,7 +414,7 @@ class OpenAIMixin(ModelsProtocolPrivate, NeedsRequestProviderData, ABC):
async def register_model(self, model: Model) -> Model:
if not await self.check_model_availability(model.provider_model_id):
raise ValueError(f"Model {model.provider_model_id} is not available from provider {self.__provider_id__}")
raise ValueError(f"Model {model.provider_model_id} is not available from provider {self.__provider_id__}") # type: ignore[attr-defined]
return model
async def unregister_model(self, model_id: str) -> None:
@ -387,28 +430,42 @@ class OpenAIMixin(ModelsProtocolPrivate, NeedsRequestProviderData, ABC):
"""
self._model_cache = {}
async for m in self.client.models.list():
if self.allowed_models and m.id not in self.allowed_models:
logger.info(f"Skipping model {m.id} as it is not in the allowed models list")
try:
iterable = await self.list_provider_model_ids()
except Exception as e:
logger.error(f"{self.__class__.__name__}.list_provider_model_ids() failed with: {e}")
raise
if not hasattr(iterable, "__iter__"):
raise TypeError(
f"Failed to list models: {self.__class__.__name__}.list_provider_model_ids() must return an iterable of "
f"strings, but returned {type(iterable).__name__}"
)
provider_models_ids = list(iterable)
logger.info(f"{self.__class__.__name__}.list_provider_model_ids() returned {len(provider_models_ids)} models")
for provider_model_id in provider_models_ids:
if not isinstance(provider_model_id, str):
raise ValueError(f"Model ID {provider_model_id} from list_provider_model_ids() is not a string")
if self.allowed_models and provider_model_id not in self.allowed_models:
logger.info(f"Skipping model {provider_model_id} as it is not in the allowed models list")
continue
if metadata := self.embedding_model_metadata.get(m.id):
# This is an embedding model - augment with metadata
if metadata := self.embedding_model_metadata.get(provider_model_id):
model = Model(
provider_id=self.__provider_id__, # type: ignore[attr-defined]
provider_resource_id=m.id,
identifier=m.id,
provider_resource_id=provider_model_id,
identifier=provider_model_id,
model_type=ModelType.embedding,
metadata=metadata,
)
else:
# This is an LLM
model = Model(
provider_id=self.__provider_id__, # type: ignore[attr-defined]
provider_resource_id=m.id,
identifier=m.id,
provider_resource_id=provider_model_id,
identifier=provider_model_id,
model_type=ModelType.llm,
)
self._model_cache[m.id] = model
self._model_cache[provider_model_id] = model
return list(self._model_cache.values())
@ -425,3 +482,29 @@ class OpenAIMixin(ModelsProtocolPrivate, NeedsRequestProviderData, ABC):
async def should_refresh_models(self) -> bool:
return False
#
# The model_dump overrides below avoid serializing the extra fields,
# e.g. model_store, which are not pydantic fields.
#
def _filter_fields(self, **kwargs):
"""Helper to exclude extra fields from serialization."""
# Exclude any extra fields stored in __pydantic_extra__
if hasattr(self, "__pydantic_extra__") and self.__pydantic_extra__:
exclude = kwargs.get("exclude", set())
if not isinstance(exclude, set):
exclude = set(exclude) if exclude else set()
exclude.update(self.__pydantic_extra__.keys())
kwargs["exclude"] = exclude
return kwargs
def model_dump(self, **kwargs):
"""Override to exclude extra fields from serialization."""
kwargs = self._filter_fields(**kwargs)
return super().model_dump(**kwargs)
def model_dump_json(self, **kwargs):
"""Override to exclude extra fields from JSON serialization."""
kwargs = self._filter_fields(**kwargs)
return super().model_dump_json(**kwargs)
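To make the refactored mixin's surface concrete, here is a hypothetical subclass sketch. It assumes only the hooks shown in this file (get_api_key, get_base_url, provider_data_api_key_field, embedding_model_metadata, list_provider_model_ids); the config class, field names, and model ids are placeholders:

```python
# Hypothetical provider built on the refactored OpenAIMixin; names marked
# "placeholder" are illustrative and not part of this change.
from collections.abc import Iterable

from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin


class ExampleProviderConfig(RemoteInferenceProviderConfig):  # placeholder config
    api_key: str | None = None
    url: str = "https://api.example.com/v1"  # placeholder endpoint


class ExampleInferenceAdapter(OpenAIMixin):
    config: ExampleProviderConfig

    # Optional knobs documented in the mixin docstring above
    provider_data_api_key_field: str = "example_api_key"  # placeholder provider-data field
    embedding_model_metadata: dict[str, dict[str, int]] = {
        "example-embed-v1": {"embedding_dimension": 768, "context_length": 8192},  # placeholder
    }

    def get_api_key(self) -> str:
        return self.config.api_key or ""

    def get_base_url(self) -> str:
        return self.config.url

    async def list_provider_model_ids(self) -> Iterable[str]:
        # Override the default /v1/models listing with a static set (placeholder ids).
        return ["example-llm-8b", "example-embed-v1"]
```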

View file

@ -11,6 +11,43 @@ from typing import Any, TypeVar
from .strong_typing.schema import json_schema_type, register_schema # noqa: F401
class ExtraBodyField[T]:
"""
Marker annotation for parameters that arrive via extra_body in the client SDK.
These parameters:
- Will NOT appear in the generated client SDK method signature
- WILL be documented in OpenAPI spec under x-llama-stack-extra-body-params
- MUST be passed via the extra_body parameter in client SDK calls
- WILL be available in server-side method signature with proper typing
Example:
```python
async def create_openai_response(
self,
input: str,
model: str,
shields: Annotated[
list[str] | None, ExtraBodyField("List of shields to apply")
] = None,
) -> ResponseObject:
# shields is available here with proper typing
if shields:
print(f"Using shields: {shields}")
```
Client usage:
```python
client.responses.create(
input="hello", model="llama-3", extra_body={"shields": ["shield-1"]}
)
```
"""
def __init__(self, description: str | None = None):
self.description = description
@dataclass
class WebMethod:
level: str | None = None
@ -26,7 +63,7 @@ class WebMethod:
deprecated: bool | None = False
T = TypeVar("T", bound=Callable[..., Any])
CallableT = TypeVar("CallableT", bound=Callable[..., Any])
def webmethod(
@ -40,7 +77,7 @@ def webmethod(
descriptive_name: str | None = None,
required_scope: str | None = None,
deprecated: bool | None = False,
) -> Callable[[T], T]:
) -> Callable[[CallableT], CallableT]:
"""
Decorator that supplies additional metadata to an endpoint operation function.
@ -51,7 +88,7 @@ def webmethod(
:param required_scope: Required scope for this endpoint (e.g., 'monitoring.viewer').
"""
def wrap(func: T) -> T:
def wrap(func: CallableT) -> CallableT:
webmethod_obj = WebMethod(
route=route,
method=method,

View file

@ -22,10 +22,18 @@ from llama_stack.log import get_logger
logger = get_logger(__name__, category="testing")
# Global state for the recording system
# Note: Using module globals instead of ContextVars because the session-scoped
# client initialization happens in one async context, but tests run in different
# contexts, and we need the mode/storage to persist across all contexts.
_current_mode: str | None = None
_current_storage: ResponseStorage | None = None
_original_methods: dict[str, Any] = {}
# Test context uses ContextVar since it changes per-test and needs async isolation
from contextvars import ContextVar
_test_context: ContextVar[str | None] = ContextVar("_test_context", default=None)
from openai.types.completion_choice import CompletionChoice
# update the "finish_reason" field, since its type definition is wrong (no None is accepted)
@ -33,22 +41,38 @@ CompletionChoice.model_fields["finish_reason"].annotation = Literal["stop", "len
CompletionChoice.model_rebuild()
REPO_ROOT = Path(__file__).parent.parent.parent
DEFAULT_STORAGE_DIR = REPO_ROOT / "tests/integration/recordings"
DEFAULT_STORAGE_DIR = REPO_ROOT / "tests/integration/common"
class InferenceMode(StrEnum):
LIVE = "live"
RECORD = "record"
REPLAY = "replay"
RECORD_IF_MISSING = "record-if-missing"
def normalize_request(method: str, url: str, headers: dict[str, Any], body: dict[str, Any]) -> str:
"""Create a normalized hash of the request for consistent matching."""
"""Create a normalized hash of the request for consistent matching.
Includes test_id from context to ensure test isolation - identical requests
from different tests will have different hashes.
Exception: Model list endpoints (/v1/models, /api/tags) exclude test_id since
they are infrastructure/shared and need to work across session setup and tests.
"""
# Extract just the endpoint path
from urllib.parse import urlparse
parsed = urlparse(url)
normalized = {"method": method.upper(), "endpoint": parsed.path, "body": body}
normalized: dict[str, Any] = {
"method": method.upper(),
"endpoint": parsed.path,
"body": body,
}
# Include test_id for isolation, except for shared infrastructure endpoints
if parsed.path not in ("/api/tags", "/v1/models"):
normalized["test_id"] = _test_context.get()
# Create hash - sort_keys=True ensures deterministic ordering
normalized_json = json.dumps(normalized, sort_keys=True)
@ -67,7 +91,11 @@ def setup_inference_recording():
Currently, this is only supported for OpenAI and Ollama clients. These should cover the vast majority of use cases.
Two environment variables are supported:
- LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', or 'replay'. Default is 'replay'.
- LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', 'replay', or 'record-if-missing'. Default is 'replay'.
- 'live': Make all requests live without recording
- 'record': Record all requests (overwrites existing recordings)
- 'replay': Use only recorded responses (fails if recording not found)
- 'record-if-missing': Use recorded responses when available, record new ones when not found
- LLAMA_STACK_TEST_RECORDING_DIR: The directory to store the recordings in. Default is 'tests/integration/common'.
The recordings are stored as JSON files.
@ -80,9 +108,43 @@ def setup_inference_recording():
return inference_recording(mode=mode, storage_dir=storage_dir)
def _serialize_response(response: Any) -> Any:
def _normalize_response_data(data: dict[str, Any], request_hash: str) -> dict[str, Any]:
"""Normalize fields that change between recordings but don't affect functionality.
This reduces noise in git diffs by making IDs deterministic and timestamps constant.
"""
# Only normalize ID for completion/chat responses, not for model objects
# Model objects have "object": "model" and the ID is the actual model identifier
if "id" in data and data.get("object") != "model":
data["id"] = f"rec-{request_hash[:12]}"
# Normalize timestamp to epoch (0) (for OpenAI-style responses)
# But not for model objects where created timestamp might be meaningful
if "created" in data and data.get("object") != "model":
data["created"] = 0
# Normalize Ollama-specific timestamp fields
if "created_at" in data:
data["created_at"] = "1970-01-01T00:00:00.000000Z"
# Normalize Ollama-specific duration fields (these vary based on system load)
if "total_duration" in data and data["total_duration"] is not None:
data["total_duration"] = 0
if "load_duration" in data and data["load_duration"] is not None:
data["load_duration"] = 0
if "prompt_eval_duration" in data and data["prompt_eval_duration"] is not None:
data["prompt_eval_duration"] = 0
if "eval_duration" in data and data["eval_duration"] is not None:
data["eval_duration"] = 0
return data
def _serialize_response(response: Any, request_hash: str = "") -> Any:
if hasattr(response, "model_dump"):
data = response.model_dump(mode="json")
# Normalize fields to reduce noise
data = _normalize_response_data(data, request_hash)
return {
"__type__": f"{response.__class__.__module__}.{response.__class__.__qualname__}",
"__data__": data,
@ -120,61 +182,121 @@ def _deserialize_response(data: dict[str, Any]) -> Any:
class ResponseStorage:
"""Handles SQLite index + JSON file storage/retrieval for inference recordings."""
def __init__(self, test_dir: Path):
self.test_dir = test_dir
self.responses_dir = self.test_dir / "responses"
def __init__(self, base_dir: Path):
self.base_dir = base_dir
# Don't create responses_dir here - determine it per-test at runtime
self._ensure_directories()
def _get_test_dir(self) -> Path:
"""Get the recordings directory in the test file's parent directory.
For test at "tests/integration/inference/test_foo.py::test_bar",
returns "tests/integration/inference/recordings/".
"""
test_id = _test_context.get()
if test_id:
# Extract the directory path from the test nodeid
# e.g., "tests/integration/inference/test_basic.py::test_foo[params]"
# -> get "tests/integration/inference"
test_file = test_id.split("::")[0] # Remove test function part
test_dir = Path(test_file).parent # Get parent directory
# Put recordings in a "recordings" subdirectory of the test's parent dir
# e.g., "tests/integration/inference" -> "tests/integration/inference/recordings"
return test_dir / "recordings"
else:
# Fallback for non-test contexts
return self.base_dir / "recordings"
def _ensure_directories(self):
self.test_dir.mkdir(parents=True, exist_ok=True)
self.responses_dir.mkdir(exist_ok=True)
"""Ensure test-specific directories exist."""
test_dir = self._get_test_dir()
test_dir.mkdir(parents=True, exist_ok=True)
return test_dir
def store_recording(self, request_hash: str, request: dict[str, Any], response: dict[str, Any]):
"""Store a request/response pair."""
# Generate unique response filename
short_hash = request_hash[:12]
response_file = f"{short_hash}.json"
responses_dir = self._ensure_directories()
# Use FULL hash (not truncated)
response_file = f"{request_hash}.json"
# Serialize response body if needed
serialized_response = dict(response)
if "body" in serialized_response:
if isinstance(serialized_response["body"], list):
# Handle streaming responses (list of chunks)
serialized_response["body"] = [_serialize_response(chunk) for chunk in serialized_response["body"]]
serialized_response["body"] = [
_serialize_response(chunk, request_hash) for chunk in serialized_response["body"]
]
else:
# Handle single response
serialized_response["body"] = _serialize_response(serialized_response["body"])
serialized_response["body"] = _serialize_response(serialized_response["body"], request_hash)
# If this is an Ollama /api/tags recording, include models digest in filename to distinguish variants
# For model-list endpoints, include digest in filename to distinguish different model sets
endpoint = request.get("endpoint")
if endpoint in ("/api/tags", "/v1/models"):
digest = _model_identifiers_digest(endpoint, response)
response_file = f"models-{short_hash}-{digest}.json"
response_file = f"models-{request_hash}-{digest}.json"
response_path = self.responses_dir / response_file
response_path = responses_dir / response_file
# Save response to JSON file
# Save response to JSON file with metadata
with open(response_path, "w") as f:
json.dump({"request": request, "response": serialized_response}, f, indent=2)
json.dump(
{
"test_id": _test_context.get(), # Include for debugging
"request": request,
"response": serialized_response,
},
f,
indent=2,
)
f.write("\n")
f.flush()
def find_recording(self, request_hash: str) -> dict[str, Any] | None:
"""Find a recorded response by request hash."""
response_file = f"{request_hash[:12]}.json"
response_path = self.responses_dir / response_file
"""Find a recorded response by request hash.
if not response_path.exists():
return None
Uses fallback: first checks test-specific dir, then falls back to base recordings dir.
This handles cases where recordings happen during session setup (no test context) but
are requested during tests (with test context).
"""
response_file = f"{request_hash}.json"
return _recording_from_file(response_path)
# Try test-specific directory first
test_dir = self._get_test_dir()
response_path = test_dir / response_file
def _model_list_responses(self, short_hash: str) -> list[dict[str, Any]]:
if response_path.exists():
return _recording_from_file(response_path)
# Fallback to base recordings directory (for session-level recordings)
fallback_dir = self.base_dir / "recordings"
fallback_path = fallback_dir / response_file
if fallback_path.exists():
return _recording_from_file(fallback_path)
return None
def _model_list_responses(self, request_hash: str) -> list[dict[str, Any]]:
"""Find all model-list recordings with the given hash (different digests)."""
results: list[dict[str, Any]] = []
for path in self.responses_dir.glob(f"models-{short_hash}-*.json"):
data = _recording_from_file(path)
results.append(data)
# Check test-specific directory first
test_dir = self._get_test_dir()
if test_dir.exists():
for path in test_dir.glob(f"models-{request_hash}-*.json"):
data = _recording_from_file(path)
results.append(data)
# Also check fallback directory
fallback_dir = self.base_dir / "recordings"
if fallback_dir.exists():
for path in fallback_dir.glob(f"models-{request_hash}-*.json"):
data = _recording_from_file(path)
results.append(data)
return results
@ -195,6 +317,8 @@ def _recording_from_file(response_path) -> dict[str, Any]:
def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str:
"""Generate a digest from model identifiers for distinguishing different model sets."""
def _extract_model_identifiers():
"""Extract a stable set of identifiers for model-list endpoints.
@ -217,7 +341,14 @@ def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str:
def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None:
"""Return a single, unioned recording for supported model-list endpoints."""
"""Return a single, unioned recording for supported model-list endpoints.
Merges multiple recordings with different model sets (from different servers) into
a single response containing all models.
"""
if not records:
return None
seen: dict[str, dict[str, Any]] = {}
for rec in records:
body = rec["response"]["body"]
@ -246,7 +377,10 @@ def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]])
async def _patched_inference_method(original_method, self, client_type, endpoint, *args, **kwargs):
global _current_mode, _current_storage
if _current_mode == InferenceMode.LIVE or _current_storage is None:
mode = _current_mode
storage = _current_storage
if mode == InferenceMode.LIVE or storage is None:
if endpoint == "/v1/models":
return original_method(self, *args, **kwargs)
else:
@ -277,13 +411,16 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
request_hash = normalize_request(method, url, headers, body)
if _current_mode == InferenceMode.REPLAY:
# Special handling for model-list endpoints: return union of all responses
# Try to find existing recording for REPLAY or RECORD_IF_MISSING modes
recording = None
if mode == InferenceMode.REPLAY or mode == InferenceMode.RECORD_IF_MISSING:
# Special handling for model-list endpoints: merge all recordings with this hash
if endpoint in ("/api/tags", "/v1/models"):
records = _current_storage._model_list_responses(request_hash[:12])
records = storage._model_list_responses(request_hash)
recording = _combine_model_list_responses(endpoint, records)
else:
recording = _current_storage.find_recording(request_hash)
recording = storage.find_recording(request_hash)
if recording:
response_body = recording["response"]["body"]
@ -296,7 +433,8 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
return replay_stream()
else:
return response_body
else:
elif mode == InferenceMode.REPLAY:
# REPLAY mode requires recording to exist
raise RuntimeError(
f"No recorded response found for request hash: {request_hash}\n"
f"Request: {method} {url} {body}\n"
@ -304,7 +442,7 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
f"To record this response, run with LLAMA_STACK_TEST_INFERENCE_MODE=record"
)
elif _current_mode == InferenceMode.RECORD:
if mode == InferenceMode.RECORD or (mode == InferenceMode.RECORD_IF_MISSING and not recording):
if endpoint == "/v1/models":
response = original_method(self, *args, **kwargs)
else:
@ -335,7 +473,7 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
# Store the recording immediately
response_data = {"body": chunks, "is_streaming": True}
_current_storage.store_recording(request_hash, request_data, response_data)
storage.store_recording(request_hash, request_data, response_data)
# Return a generator that replays the stored chunks
async def replay_recorded_stream():
@ -345,11 +483,11 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
return replay_recorded_stream()
else:
response_data = {"body": response, "is_streaming": False}
_current_storage.store_recording(request_hash, request_data, response_data)
storage.store_recording(request_hash, request_data, response_data)
return response
else:
raise AssertionError(f"Invalid mode: {_current_mode}")
raise AssertionError(f"Invalid mode: {mode}")
def patch_inference_clients():
@ -490,9 +628,9 @@ def inference_recording(mode: str, storage_dir: str | Path | None = None) -> Gen
try:
_current_mode = mode
if mode in ["record", "replay"]:
if mode in ["record", "replay", "record-if-missing"]:
if storage_dir is None:
raise ValueError("storage_dir is required for record and replay modes")
raise ValueError("storage_dir is required for record, replay, and record-if-missing modes")
_current_storage = ResponseStorage(Path(storage_dir))
patch_inference_clients()
@ -500,7 +638,7 @@ def inference_recording(mode: str, storage_dir: str | Path | None = None) -> Gen
finally:
# Restore previous state
if mode in ["record", "replay"]:
if mode in ["record", "replay", "record-if-missing"]:
unpatch_inference_clients()
_current_mode = prev_mode
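A minimal usage sketch for the new record-if-missing mode, assuming the context manager shown above; the import path, endpoint, and model below are placeholders rather than values defined by this change:

```python
# Sketch only: the module path, base_url, and model are assumptions/placeholders.
import asyncio
from pathlib import Path

from openai import AsyncOpenAI

from llama_stack.testing.inference_recorder import inference_recording  # assumed import path


async def main() -> None:
    # Replays when a recording exists for the request hash; otherwise performs the
    # live call and stores a new recording (per-test dir when a test context is set,
    # else the fallback recordings dir).
    with inference_recording(mode="record-if-missing", storage_dir=Path("tests/integration/common")):
        client = AsyncOpenAI(base_url="http://localhost:11434/v1", api_key="unused")  # placeholder endpoint
        resp = await client.chat.completions.create(
            model="llama3.2:3b-instruct-fp16",
            messages=[{"role": "user", "content": "Hello!"}],
        )
        print(resp.choices[0].message.content)


asyncio.run(main())
```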

View file

@ -20,11 +20,11 @@
"framer-motion": "^12.23.12",
"llama-stack-client": "^0.2.23",
"lucide-react": "^0.542.0",
"next": "15.5.3",
"next": "15.5.4",
"next-auth": "^4.24.11",
"next-themes": "^0.4.6",
"react": "^19.0.0",
"react-dom": "^19.1.1",
"react-dom": "^19.2.0",
"react-markdown": "^10.1.0",
"remark-gfm": "^4.0.1",
"remeda": "^2.32.0",
@ -2279,9 +2279,9 @@
}
},
"node_modules/@next/env": {
"version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/env/-/env-15.5.3.tgz",
"integrity": "sha512-RSEDTRqyihYXygx/OJXwvVupfr9m04+0vH8vyy0HfZ7keRto6VX9BbEk0J2PUk0VGy6YhklJUSrgForov5F9pw==",
"version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/env/-/env-15.5.4.tgz",
"integrity": "sha512-27SQhYp5QryzIT5uO8hq99C69eLQ7qkzkDPsk3N+GuS2XgOgoYEeOav7Pf8Tn4drECOVDsDg8oj+/DVy8qQL2A==",
"license": "MIT"
},
"node_modules/@next/eslint-plugin-next": {
@ -2295,9 +2295,9 @@
}
},
"node_modules/@next/swc-darwin-arm64": {
"version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-15.5.3.tgz",
"integrity": "sha512-nzbHQo69+au9wJkGKTU9lP7PXv0d1J5ljFpvb+LnEomLtSbJkbZyEs6sbF3plQmiOB2l9OBtN2tNSvCH1nQ9Jg==",
"version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-15.5.4.tgz",
"integrity": "sha512-nopqz+Ov6uvorej8ndRX6HlxCYWCO3AHLfKK2TYvxoSB2scETOcfm/HSS3piPqc3A+MUgyHoqE6je4wnkjfrOA==",
"cpu": [
"arm64"
],
@ -2311,9 +2311,9 @@
}
},
"node_modules/@next/swc-darwin-x64": {
"version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-15.5.3.tgz",
"integrity": "sha512-w83w4SkOOhekJOcA5HBvHyGzgV1W/XvOfpkrxIse4uPWhYTTRwtGEM4v/jiXwNSJvfRvah0H8/uTLBKRXlef8g==",
"version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-15.5.4.tgz",
"integrity": "sha512-QOTCFq8b09ghfjRJKfb68kU9k2K+2wsC4A67psOiMn849K9ZXgCSRQr0oVHfmKnoqCbEmQWG1f2h1T2vtJJ9mA==",
"cpu": [
"x64"
],
@ -2327,9 +2327,9 @@
}
},
"node_modules/@next/swc-linux-arm64-gnu": {
"version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-15.5.3.tgz",
"integrity": "sha512-+m7pfIs0/yvgVu26ieaKrifV8C8yiLe7jVp9SpcIzg7XmyyNE7toC1fy5IOQozmr6kWl/JONC51osih2RyoXRw==",
"version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-15.5.4.tgz",
"integrity": "sha512-eRD5zkts6jS3VfE/J0Kt1VxdFqTnMc3QgO5lFE5GKN3KDI/uUpSyK3CjQHmfEkYR4wCOl0R0XrsjpxfWEA++XA==",
"cpu": [
"arm64"
],
@ -2343,9 +2343,9 @@
}
},
"node_modules/@next/swc-linux-arm64-musl": {
"version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-15.5.3.tgz",
"integrity": "sha512-u3PEIzuguSenoZviZJahNLgCexGFhso5mxWCrrIMdvpZn6lkME5vc/ADZG8UUk5K1uWRy4hqSFECrON6UKQBbQ==",
"version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-15.5.4.tgz",
"integrity": "sha512-TOK7iTxmXFc45UrtKqWdZ1shfxuL4tnVAOuuJK4S88rX3oyVV4ZkLjtMT85wQkfBrOOvU55aLty+MV8xmcJR8A==",
"cpu": [
"arm64"
],
@ -2359,9 +2359,9 @@
}
},
"node_modules/@next/swc-linux-x64-gnu": {
"version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-15.5.3.tgz",
"integrity": "sha512-lDtOOScYDZxI2BENN9m0pfVPJDSuUkAD1YXSvlJF0DKwZt0WlA7T7o3wrcEr4Q+iHYGzEaVuZcsIbCps4K27sA==",
"version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-15.5.4.tgz",
"integrity": "sha512-7HKolaj+481FSW/5lL0BcTkA4Ueam9SPYWyN/ib/WGAFZf0DGAN8frNpNZYFHtM4ZstrHZS3LY3vrwlIQfsiMA==",
"cpu": [
"x64"
],
@ -2375,9 +2375,9 @@
}
},
"node_modules/@next/swc-linux-x64-musl": {
"version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-15.5.3.tgz",
"integrity": "sha512-9vWVUnsx9PrY2NwdVRJ4dUURAQ8Su0sLRPqcCCxtX5zIQUBES12eRVHq6b70bbfaVaxIDGJN2afHui0eDm+cLg==",
"version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-15.5.4.tgz",
"integrity": "sha512-nlQQ6nfgN0nCO/KuyEUwwOdwQIGjOs4WNMjEUtpIQJPR2NUfmGpW2wkJln1d4nJ7oUzd1g4GivH5GoEPBgfsdw==",
"cpu": [
"x64"
],
@ -2391,9 +2391,9 @@
}
},
"node_modules/@next/swc-win32-arm64-msvc": {
"version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-15.5.3.tgz",
"integrity": "sha512-1CU20FZzY9LFQigRi6jM45oJMU3KziA5/sSG+dXeVaTm661snQP6xu3ykGxxwU5sLG3sh14teO/IOEPVsQMRfA==",
"version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-15.5.4.tgz",
"integrity": "sha512-PcR2bN7FlM32XM6eumklmyWLLbu2vs+D7nJX8OAIoWy69Kef8mfiN4e8TUv2KohprwifdpFKPzIP1njuCjD0YA==",
"cpu": [
"arm64"
],
@ -2407,9 +2407,9 @@
}
},
"node_modules/@next/swc-win32-x64-msvc": {
"version": "15.5.3",
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-15.5.3.tgz",
"integrity": "sha512-JMoLAq3n3y5tKXPQwCK5c+6tmwkuFDa2XAxz8Wm4+IVthdBZdZGh+lmiLUHg9f9IDwIQpUjp+ysd6OkYTyZRZw==",
"version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-15.5.4.tgz",
"integrity": "sha512-1ur2tSHZj8Px/KMAthmuI9FMp/YFusMMGoRNJaRZMOlSkgvLjzosSdQI0cJAKogdHl3qXUQKL9MGaYvKwA7DXg==",
"cpu": [
"x64"
],
@ -3995,22 +3995,22 @@
}
},
"node_modules/@types/react": {
"version": "19.1.4",
"resolved": "https://registry.npmjs.org/@types/react/-/react-19.1.4.tgz",
"integrity": "sha512-EB1yiiYdvySuIITtD5lhW4yPyJ31RkJkkDw794LaQYrxCSaQV/47y5o1FMC4zF9ZyjUjzJMZwbovEnT5yHTW6g==",
"version": "19.2.0",
"resolved": "https://registry.npmjs.org/@types/react/-/react-19.2.0.tgz",
"integrity": "sha512-1LOH8xovvsKsCBq1wnT4ntDUdCJKmnEakhsuoUSy6ExlHCkGP2hqnatagYTgFk6oeL0VU31u7SNjunPN+GchtA==",
"license": "MIT",
"dependencies": {
"csstype": "^3.0.2"
}
},
"node_modules/@types/react-dom": {
"version": "19.1.9",
"resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.1.9.tgz",
"integrity": "sha512-qXRuZaOsAdXKFyOhRBg6Lqqc0yay13vN7KrIg4L7N4aaHN68ma9OK3NE1BoDFgFOTfM7zg+3/8+2n8rLUH3OKQ==",
"version": "19.2.0",
"resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.2.0.tgz",
"integrity": "sha512-brtBs0MnE9SMx7px208g39lRmC5uHZs96caOJfTjFcYSLHNamvaSMfJNagChVNkup2SdtOxKX1FDBkRSJe1ZAg==",
"devOptional": true,
"license": "MIT",
"peerDependencies": {
"@types/react": "^19.0.0"
"@types/react": "^19.2.0"
}
},
"node_modules/@types/stack-utils": {
@ -11414,12 +11414,12 @@
}
},
"node_modules/next": {
"version": "15.5.3",
"resolved": "https://registry.npmjs.org/next/-/next-15.5.3.tgz",
"integrity": "sha512-r/liNAx16SQj4D+XH/oI1dlpv9tdKJ6cONYPwwcCC46f2NjpaRWY+EKCzULfgQYV6YKXjHBchff2IZBSlZmJNw==",
"version": "15.5.4",
"resolved": "https://registry.npmjs.org/next/-/next-15.5.4.tgz",
"integrity": "sha512-xH4Yjhb82sFYQfY3vbkJfgSDgXvBB6a8xPs9i35k6oZJRoQRihZH+4s9Yo2qsWpzBmZ3lPXaJ2KPXLfkvW4LnA==",
"license": "MIT",
"dependencies": {
"@next/env": "15.5.3",
"@next/env": "15.5.4",
"@swc/helpers": "0.5.15",
"caniuse-lite": "^1.0.30001579",
"postcss": "8.4.31",
@ -11432,14 +11432,14 @@
"node": "^18.18.0 || ^19.8.0 || >= 20.0.0"
},
"optionalDependencies": {
"@next/swc-darwin-arm64": "15.5.3",
"@next/swc-darwin-x64": "15.5.3",
"@next/swc-linux-arm64-gnu": "15.5.3",
"@next/swc-linux-arm64-musl": "15.5.3",
"@next/swc-linux-x64-gnu": "15.5.3",
"@next/swc-linux-x64-musl": "15.5.3",
"@next/swc-win32-arm64-msvc": "15.5.3",
"@next/swc-win32-x64-msvc": "15.5.3",
"@next/swc-darwin-arm64": "15.5.4",
"@next/swc-darwin-x64": "15.5.4",
"@next/swc-linux-arm64-gnu": "15.5.4",
"@next/swc-linux-arm64-musl": "15.5.4",
"@next/swc-linux-x64-gnu": "15.5.4",
"@next/swc-linux-x64-musl": "15.5.4",
"@next/swc-win32-arm64-msvc": "15.5.4",
"@next/swc-win32-x64-msvc": "15.5.4",
"sharp": "^0.34.3"
},
"peerDependencies": {
@ -12450,24 +12450,24 @@
}
},
"node_modules/react": {
"version": "19.1.1",
"resolved": "https://registry.npmjs.org/react/-/react-19.1.1.tgz",
"integrity": "sha512-w8nqGImo45dmMIfljjMwOGtbmC/mk4CMYhWIicdSflH91J9TyCyczcPFXJzrZ/ZXcgGRFeP6BU0BEJTw6tZdfQ==",
"version": "19.2.0",
"resolved": "https://registry.npmjs.org/react/-/react-19.2.0.tgz",
"integrity": "sha512-tmbWg6W31tQLeB5cdIBOicJDJRR2KzXsV7uSK9iNfLWQ5bIZfxuPEHp7M8wiHyHnn0DD1i7w3Zmin0FtkrwoCQ==",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/react-dom": {
"version": "19.1.1",
"resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.1.1.tgz",
"integrity": "sha512-Dlq/5LAZgF0Gaz6yiqZCf6VCcZs1ghAJyrsu84Q/GT0gV+mCxbfmKNoGRKBYMJ8IEdGPqu49YWXD02GCknEDkw==",
"version": "19.2.0",
"resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.2.0.tgz",
"integrity": "sha512-UlbRu4cAiGaIewkPyiRGJk0imDN2T3JjieT6spoL2UeSf5od4n5LB/mQ4ejmxhCFT1tYe8IvaFulzynWovsEFQ==",
"license": "MIT",
"dependencies": {
"scheduler": "^0.26.0"
"scheduler": "^0.27.0"
},
"peerDependencies": {
"react": "^19.1.1"
"react": "^19.2.0"
}
},
"node_modules/react-is": {
@ -12982,9 +12982,9 @@
}
},
"node_modules/scheduler": {
"version": "0.26.0",
"resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.26.0.tgz",
"integrity": "sha512-NlHwttCI/l5gCPR3D1nNXtWABUmBwvZpEQiD4IXSbIDq8BzLIK/7Ir5gTFSGZDUu37K5cMNp0hFtzO38sC7gWA==",
"version": "0.27.0",
"resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.27.0.tgz",
"integrity": "sha512-eNv+WrVbKu1f3vbYJT/xtiF5syA5HPIMtf9IgY/nKg0sWqzAUEvqY/xm7OcZc/qafLx/iO9FgOmeSAp4v5ti/Q==",
"license": "MIT"
},
"node_modules/semver": {

View file

@ -25,11 +25,11 @@
"framer-motion": "^12.23.12",
"llama-stack-client": "^0.2.23",
"lucide-react": "^0.542.0",
"next": "15.5.3",
"next": "15.5.4",
"next-auth": "^4.24.11",
"next-themes": "^0.4.6",
"react": "^19.0.0",
"react-dom": "^19.1.1",
"react-dom": "^19.2.0",
"react-markdown": "^10.1.0",
"remark-gfm": "^4.0.1",
"remeda": "^2.32.0",

View file

@ -99,6 +99,7 @@ unit = [
"coverage",
"chromadb>=1.0.15",
"moto[s3]>=5.1.10",
"weaviate-client>=4.16.4",
]
# These are the core dependencies required for running integration tests. They are shared across all
# providers. If a provider requires additional dependencies, please add them to your environment
@ -277,14 +278,10 @@ exclude = [
"^llama_stack/providers/remote/datasetio/huggingface/",
"^llama_stack/providers/remote/datasetio/nvidia/",
"^llama_stack/providers/remote/inference/bedrock/",
"^llama_stack/providers/remote/inference/cerebras/",
"^llama_stack/providers/remote/inference/databricks/",
"^llama_stack/providers/remote/inference/fireworks/",
"^llama_stack/providers/remote/inference/nvidia/",
"^llama_stack/providers/remote/inference/passthrough/",
"^llama_stack/providers/remote/inference/runpod/",
"^llama_stack/providers/remote/inference/tgi/",
"^llama_stack/providers/remote/inference/together/",
"^llama_stack/providers/remote/inference/watsonx/",
"^llama_stack/providers/remote/safety/bedrock/",
"^llama_stack/providers/remote/safety/nvidia/",

120
scripts/normalize_recordings.py Executable file
View file

@ -0,0 +1,120 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Utility script to re-normalize existing recording files.
This script reads all recording JSON files and applies the normalization
to make IDs deterministic and timestamps constant. This reduces noise in
git diffs when recordings are re-recorded.
Usage:
python scripts/normalize_recordings.py [--dry-run]
"""
import argparse
import json
from pathlib import Path
def normalize_response_data(data: dict, request_hash: str) -> dict:
"""Normalize fields that change between recordings but don't affect functionality."""
# Only normalize ID for completion/chat responses, not for model objects
# Model objects have "object": "model" and the ID is the actual model identifier
if "id" in data and data.get("object") != "model":
data["id"] = f"rec-{request_hash[:12]}"
# Normalize timestamp to epoch (0) (for OpenAI-style responses)
# But not for model objects where created timestamp might be meaningful
if "created" in data and data.get("object") != "model":
data["created"] = 0
# Normalize Ollama-specific timestamp fields
if "created_at" in data:
data["created_at"] = "1970-01-01T00:00:00.000000Z"
# Normalize Ollama-specific duration fields (these vary based on system load)
if "total_duration" in data and data["total_duration"] is not None:
data["total_duration"] = 0
if "load_duration" in data and data["load_duration"] is not None:
data["load_duration"] = 0
if "prompt_eval_duration" in data and data["prompt_eval_duration"] is not None:
data["prompt_eval_duration"] = 0
if "eval_duration" in data and data["eval_duration"] is not None:
data["eval_duration"] = 0
return data
def normalize_recording_file(file_path: Path, dry_run: bool = False) -> bool:
"""Normalize a single recording file. Returns True if file was modified."""
with open(file_path) as f:
recording = json.load(f)
# Derive the request hash from the filename stem (the last dash-separated segment when present)
request_hash = file_path.stem.split("-")[-1] if "-" in file_path.stem else file_path.stem
modified = False
old_recording = json.dumps(recording, sort_keys=True)
# NOTE: We do NOT normalize request body here because that would change the request hash
# and break recording lookups. The recorder will normalize tool_call_ids in future recordings.
# Normalize response body
if "response" in recording and "body" in recording["response"]:
body = recording["response"]["body"]
if isinstance(body, list):
# Handle streaming responses (list of chunks)
for chunk in body:
if isinstance(chunk, dict) and "__data__" in chunk:
normalize_response_data(chunk["__data__"], request_hash)
elif isinstance(body, dict) and "__data__" in body:
# Handle single response
normalize_response_data(body["__data__"], request_hash)
# Check if anything changed
new_recording = json.dumps(recording, sort_keys=True)
modified = old_recording != new_recording
if modified and not dry_run:
with open(file_path, "w") as f:
json.dump(recording, f, indent=2)
f.write("\n")
return modified
def main():
parser = argparse.ArgumentParser(description="Normalize recording files to reduce git diff noise")
parser.add_argument("--dry-run", action="store_true", help="Show what would be changed without modifying files")
args = parser.parse_args()
recordings_dir = Path(__file__).parent.parent / "tests/integration/recordings/responses"
if not recordings_dir.exists():
print(f"Recordings directory not found: {recordings_dir}")
return 1
modified_count = 0
total_count = 0
for file_path in sorted(recordings_dir.glob("*.json")):
total_count += 1
was_modified = normalize_recording_file(file_path, dry_run=args.dry_run)
if was_modified:
modified_count += 1
status = "[DRY RUN] Would normalize" if args.dry_run else "Normalized"
print(f"{status}: {file_path.name}")
print(f"\n{'[DRY RUN] ' if args.dry_run else ''}Summary: {modified_count}/{total_count} files modified")
return 0
if __name__ == "__main__":
exit(main())

View file

@ -0,0 +1,15 @@
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
editable: true
- name: Jaeger
type: jaeger
access: proxy
url: http://jaeger:16686
editable: true

View file

@ -0,0 +1,40 @@
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
batch:
timeout: 1s
send_batch_size: 1024
exporters:
# Export traces to Jaeger
otlp/jaeger:
endpoint: jaeger:4317
tls:
insecure: true
# Export metrics to Prometheus
prometheus:
endpoint: 0.0.0.0:9464
namespace: llama_stack
# Debug exporter for troubleshooting
debug:
verbosity: detailed
service:
pipelines:
traces:
receivers: [otlp]
processors: [batch]
exporters: [otlp/jaeger, debug]
metrics:
receivers: [otlp]
processors: [batch]
exporters: [prometheus, debug]

View file

@ -0,0 +1,12 @@
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'otel-collector'
static_configs:
- targets: ['otel-collector:9464']

View file

@ -17,6 +17,7 @@
set -Eeuo pipefail
CONTAINER_RUNTIME=${CONTAINER_RUNTIME:-docker}
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "🚀 Setting up telemetry stack for Llama Stack using Podman..."
@ -53,7 +54,7 @@ $CONTAINER_RUNTIME run -d --name otel-collector \
-p 4317:4317 \
-p 9464:9464 \
-p 13133:13133 \
-v $(pwd)/otel-collector-config.yaml:/etc/otel-collector-config.yaml:Z \
-v "$SCRIPT_DIR/otel-collector-config.yaml:/etc/otel-collector-config.yaml:Z" \
docker.io/otel/opentelemetry-collector-contrib:latest \
--config /etc/otel-collector-config.yaml
@ -62,7 +63,7 @@ echo "📈 Starting Prometheus..."
$CONTAINER_RUNTIME run -d --name prometheus \
--network llama-telemetry \
-p 9090:9090 \
-v $(pwd)/prometheus.yml:/etc/prometheus/prometheus.yml:Z \
-v "$SCRIPT_DIR/prometheus.yml:/etc/prometheus/prometheus.yml:Z" \
docker.io/prom/prometheus:latest \
--config.file=/etc/prometheus/prometheus.yml \
--storage.tsdb.path=/prometheus \
@ -72,13 +73,15 @@ $CONTAINER_RUNTIME run -d --name prometheus \
--web.enable-lifecycle
# Start Grafana
# Note: Using 11.0.0 because grafana:latest arm64 image has a broken /run.sh (0 bytes)
echo "📊 Starting Grafana..."
$CONTAINER_RUNTIME run -d --name grafana \
--network llama-telemetry \
-p 3000:3000 \
-e GF_SECURITY_ADMIN_PASSWORD=admin \
-e GF_USERS_ALLOW_SIGN_UP=false \
docker.io/grafana/grafana:latest
-v "$SCRIPT_DIR/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \
docker.io/grafana/grafana:11.0.0
# Wait for services to start
echo "⏳ Waiting for services to start..."

View file

@ -125,21 +125,28 @@ pytest -s -v tests/integration/vector_io/ \
## Recording Modes
The testing system supports three modes controlled by environment variables:
The testing system supports four modes controlled by environment variables:
### REPLAY Mode (Default)
Uses cached responses instead of making API calls:
```bash
pytest tests/integration/
```
### RECORD-IF-MISSING Mode (Recommended for adding new tests)
Records only when no recording exists, otherwise replays. This is the preferred mode for iterative development:
```bash
pytest tests/integration/inference/test_new_feature.py --inference-mode=record-if-missing
```
### RECORD Mode
Captures API interactions for later replay:
**Force-records all API interactions**, overwriting existing recordings. Use with caution as this will re-record everything:
```bash
pytest tests/integration/inference/test_new_feature.py --inference-mode=record
```
### LIVE Mode
Tests make real API calls (but not recorded):
Tests make real API calls (not recorded):
```bash
pytest tests/integration/ --inference-mode=live
```

View file

@ -0,0 +1,58 @@
{
"test_id": "tests/integration/agents/test_agents.py::test_custom_tool_infinite_loop[ollama/llama3.2:3b-instruct-fp16]",
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama-guard3:1b",
"messages": [
{
"role": "user",
"content": "Task: Check if there is unsafe content in 'Tool' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes.\nS2: Non-Violent Crimes.\nS3: Sex Crimes.\nS4: Child Exploitation.\nS5: Defamation.\nS6: Specialized Advice.\nS7: Privacy.\nS8: Intellectual Property.\nS9: Indiscriminate Weapons.\nS10: Hate.\nS11: Self-Harm.\nS12: Sexual Content.\nS13: Elections.\nS14: Code Interpreter Abuse.\n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\nUser: Get the boiling point of polyjuice with a tool call.\n\nAssistant: \n\nTool: Error when running tool: get_boiling_point() missing 1 required positional argument: 'liquid_name'\n\n<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST Tool message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories."
}
],
"stream": false,
"temperature": 0.0
},
"endpoint": "/v1/chat/completions",
"model": "llama-guard3:1b"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-000506671ad4",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "safe",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 0,
"model": "llama-guard3:1b",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 2,
"prompt_tokens": 422,
"total_tokens": 424,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@ -28,7 +28,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -43,7 +43,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -54,7 +54,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -69,7 +69,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -80,7 +80,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -95,7 +95,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -106,7 +106,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -121,7 +121,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -132,7 +132,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -147,7 +147,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -158,7 +158,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -173,7 +173,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -184,7 +184,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -199,7 +199,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -210,7 +210,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -225,7 +225,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -236,7 +236,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -251,7 +251,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -262,7 +262,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -277,7 +277,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -288,7 +288,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -303,7 +303,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -314,7 +314,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -329,7 +329,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -340,7 +340,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -355,7 +355,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -366,7 +366,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -381,7 +381,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -392,7 +392,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -407,7 +407,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -418,7 +418,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -433,7 +433,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -444,7 +444,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -459,7 +459,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -470,7 +470,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -485,7 +485,7 @@
"logprobs": null
}
],
"created": 1759437810,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -496,7 +496,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -511,7 +511,7 @@
"logprobs": null
}
],
"created": 1759437811,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -522,7 +522,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-130",
"id": "rec-044dcd8fdeb1",
"choices": [
{
"delta": {
@ -537,7 +537,7 @@
"logprobs": null
}
],
"created": 1759437811,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,

View file

@ -0,0 +1,58 @@
{
"test_id": "tests/integration/agents/test_agents.py::test_custom_tool[ollama/llama3.2:3b-instruct-fp16]",
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama-guard3:1b",
"messages": [
{
"role": "user",
"content": "Task: Check if there is unsafe content in 'Tool' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes.\nS2: Non-Violent Crimes.\nS3: Sex Crimes.\nS4: Child Exploitation.\nS5: Defamation.\nS6: Specialized Advice.\nS7: Privacy.\nS8: Intellectual Property.\nS9: Indiscriminate Weapons.\nS10: Hate.\nS11: Self-Harm.\nS12: Sexual Content.\nS13: Elections.\nS14: Code Interpreter Abuse.\n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\nUser: What is the boiling point of the liquid polyjuice in celsius?\n\nAssistant: \n\nTool: Error when running tool: get_boiling_point() got an unexpected keyword argument 'liquid'\n\n<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST Tool message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories."
}
],
"stream": false,
"temperature": 0.0
},
"endpoint": "/v1/chat/completions",
"model": "llama-guard3:1b"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-06fbbb88ed5e",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "safe",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 0,
"model": "llama-guard3:1b",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 2,
"prompt_tokens": 421,
"total_tokens": 423,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@ -73,7 +73,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-67",
"id": "rec-4a32ce3da3ce",
"choices": [
{
"delta": {
@ -88,7 +88,7 @@
"logprobs": null
}
],
"created": 1759441160,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -99,7 +99,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-67",
"id": "rec-4a32ce3da3ce",
"choices": [
{
"delta": {
@ -114,7 +114,7 @@
"logprobs": null
}
],
"created": 1759441160,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -125,7 +125,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-67",
"id": "rec-4a32ce3da3ce",
"choices": [
{
"delta": {
@ -140,7 +140,7 @@
"logprobs": null
}
],
"created": 1759441160,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -151,7 +151,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-67",
"id": "rec-4a32ce3da3ce",
"choices": [
{
"delta": {
@ -166,7 +166,7 @@
"logprobs": null
}
],
"created": 1759441160,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -177,7 +177,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-67",
"id": "rec-4a32ce3da3ce",
"choices": [
{
"delta": {
@ -192,7 +192,7 @@
"logprobs": null
}
],
"created": 1759441160,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -203,7 +203,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-67",
"id": "rec-4a32ce3da3ce",
"choices": [
{
"delta": {
@ -218,7 +218,7 @@
"logprobs": null
}
],
"created": 1759441160,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -229,7 +229,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-67",
"id": "rec-4a32ce3da3ce",
"choices": [
{
"delta": {
@ -244,7 +244,7 @@
"logprobs": null
}
],
"created": 1759441160,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -255,7 +255,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-67",
"id": "rec-4a32ce3da3ce",
"choices": [
{
"delta": {
@ -270,7 +270,7 @@
"logprobs": null
}
],
"created": 1759441160,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -281,7 +281,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-67",
"id": "rec-4a32ce3da3ce",
"choices": [
{
"delta": {
@ -296,7 +296,7 @@
"logprobs": null
}
],
"created": 1759441160,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -307,7 +307,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-67",
"id": "rec-4a32ce3da3ce",
"choices": [
{
"delta": {
@ -322,7 +322,7 @@
"logprobs": null
}
],
"created": 1759441160,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -333,7 +333,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-67",
"id": "rec-4a32ce3da3ce",
"choices": [
{
"delta": {
@ -348,7 +348,7 @@
"logprobs": null
}
],
"created": 1759441160,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -359,7 +359,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-67",
"id": "rec-4a32ce3da3ce",
"choices": [
{
"delta": {
@ -374,7 +374,7 @@
"logprobs": null
}
],
"created": 1759441160,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -385,7 +385,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-67",
"id": "rec-4a32ce3da3ce",
"choices": [
{
"delta": {
@ -400,7 +400,7 @@
"logprobs": null
}
],
"created": 1759441161,
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,

View file

@ -21,7 +21,7 @@
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-912",
"id": "rec-b58e35a624b0",
"choices": [
{
"finish_reason": "stop",
@ -38,7 +38,7 @@
}
}
],
"created": 1759437811,
"created": 0,
"model": "llama-guard3:1b",
"object": "chat.completion",
"service_tier": null,

View file

@ -0,0 +1,104 @@
{
"test_id": "tests/integration/agents/test_agents.py::test_create_turn_response[ollama/llama3.2:3b-instruct-fp16-client_tools1]",
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Call get_boiling_point_with_metadata tool and answer What is the boiling point of polyjuice?"
}
],
"max_tokens": 512,
"stream": true,
"temperature": 0.0001,
"tool_choice": "auto",
"tools": [
{
"type": "function",
"function": {
"name": "get_boiling_point_with_metadata",
"description": "Returns the boiling point of a liquid in Celcius or Fahrenheit"
}
}
],
"top_p": 0.9
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-176bcef706a9",
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": [
{
"index": 0,
"id": "call_wxinam9c",
"function": {
"arguments": "{}",
"name": "get_boiling_point_with_metadata"
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-176bcef706a9",
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
}
],
"is_streaming": true
}
}

View file

@ -0,0 +1,58 @@
{
"test_id": "tests/integration/agents/test_agents.py::test_tool_choice_none[ollama/llama3.2:3b-instruct-fp16]",
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama-guard3:1b",
"messages": [
{
"role": "user",
"content": "Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes.\nS2: Non-Violent Crimes.\nS3: Sex Crimes.\nS4: Child Exploitation.\nS5: Defamation.\nS6: Specialized Advice.\nS7: Privacy.\nS8: Intellectual Property.\nS9: Indiscriminate Weapons.\nS10: Hate.\nS11: Self-Harm.\nS12: Sexual Content.\nS13: Elections.\nS14: Code Interpreter Abuse.\n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\nUser: What is the boiling point of the liquid polyjuice in celsius?\n\n<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST User message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories."
}
],
"stream": false,
"temperature": 0.0
},
"endpoint": "/v1/chat/completions",
"model": "llama-guard3:1b"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-1a0d3109cf92",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "safe",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 0,
"model": "llama-guard3:1b",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 2,
"prompt_tokens": 398,
"total_tokens": 400,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@ -0,0 +1,388 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Call get_boiling_point tool and answer What is the boiling point of polyjuice?"
},
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"id": "toolcall-1d82e943-0",
"type": "function",
"function": {
"name": "get_boiling_point",
"arguments": "{\"celcius\":null,\"liquid_name\":\"polyjuice\"}"
}
}
]
},
{
"role": "tool",
"tool_call_id": "toolcall-1d82e943-0",
"content": "-212"
}
],
"max_tokens": 512,
"stream": true,
"temperature": 0.0001,
"tool_choice": "auto",
"tools": [
{
"type": "function",
"function": {
"name": "get_boiling_point",
"description": "Returns the boiling point of a liquid in Celcius or Fahrenheit.",
"parameters": {
"type": "object",
"properties": {
"liquid_name": {
"type": "string",
"description": "The name of the liquid"
},
"celcius": {
"type": "boolean",
"description": "Whether to return the boiling point in Celcius"
}
},
"required": [
"liquid_name"
]
}
}
}
],
"top_p": 0.9
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": "The",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": " boiling",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": " point",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": " of",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": " poly",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": "ju",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": "ice",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": " is",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": " -",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": "212",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": ".",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
}
],
"is_streaming": true
}
}

Some files were not shown because too many files have changed in this diff.