Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 04:04:14 +00:00)

commit 354ed48598: Merge branch 'main' into content-extension

227 changed files with 21224 additions and 10798 deletions

.github/actions/run-and-record-tests/action.yml (30 lines changed)

@@ -2,13 +2,6 @@ name: 'Run and Record Tests'
 description: 'Run integration tests and handle recording/artifact upload'
 
 inputs:
-  test-subdirs:
-    description: 'Comma-separated list of test subdirectories to run'
-    required: true
-  test-pattern:
-    description: 'Regex pattern to pass to pytest -k'
-    required: false
-    default: ''
   stack-config:
     description: 'Stack configuration to use'
     required: true
@@ -18,10 +11,18 @@ inputs:
   inference-mode:
     description: 'Inference mode (record or replay)'
     required: true
-  run-vision-tests:
-    description: 'Whether to run vision tests'
+  test-suite:
+    description: 'Test suite to use: base, responses, vision, etc.'
     required: false
-    default: 'false'
+    default: ''
+  test-subdirs:
+    description: 'Comma-separated list of test subdirectories to run; overrides test-suite'
+    required: false
+    default: ''
+  test-pattern:
+    description: 'Regex pattern to pass to pytest -k'
+    required: false
+    default: ''
 
 runs:
   using: 'composite'
@@ -42,7 +43,7 @@ runs:
           --test-subdirs '${{ inputs.test-subdirs }}' \
           --test-pattern '${{ inputs.test-pattern }}' \
           --inference-mode '${{ inputs.inference-mode }}' \
-          ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \
+          --test-suite '${{ inputs.test-suite }}' \
           | tee pytest-${{ inputs.inference-mode }}.log
 
 
@@ -57,12 +58,7 @@ runs:
         echo "New recordings detected, committing and pushing"
         git add tests/integration/recordings/
-
-        if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
-          git commit -m "Recordings update from CI (vision)"
-        else
-          git commit -m "Recordings update from CI"
-        fi
+        git commit -m "Recordings update from CI (test-suite: ${{ inputs.test-suite }})"
 
         git fetch origin ${{ github.ref_name }}
         git rebase origin/${{ github.ref_name }}
         echo "Rebased successfully"

.github/actions/setup-ollama/action.yml (8 lines changed)

@@ -1,17 +1,17 @@
 name: Setup Ollama
 description: Start Ollama
 inputs:
-  run-vision-tests:
-    description: 'Run vision tests: "true" or "false"'
+  test-suite:
+    description: 'Test suite to use: base, responses, vision, etc.'
     required: false
-    default: 'false'
+    default: ''
 runs:
   using: "composite"
   steps:
     - name: Start Ollama
       shell: bash
       run: |
-        if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
+        if [ "${{ inputs.test-suite }}" == "vision" ]; then
           image="ollama-with-vision-model"
         else
           image="ollama-with-models"

@@ -12,10 +12,10 @@ inputs:
     description: 'Provider to setup (ollama or vllm)'
     required: true
     default: 'ollama'
-  run-vision-tests:
-    description: 'Whether to setup provider for vision tests'
+  test-suite:
+    description: 'Test suite to use: base, responses, vision, etc.'
     required: false
-    default: 'false'
+    default: ''
   inference-mode:
     description: 'Inference mode (record or replay)'
     required: true

@@ -33,7 +33,7 @@ runs:
     if: ${{ inputs.provider == 'ollama' && inputs.inference-mode == 'record' }}
     uses: ./.github/actions/setup-ollama
     with:
-      run-vision-tests: ${{ inputs.run-vision-tests }}
+      test-suite: ${{ inputs.test-suite }}
 
   - name: Setup vllm
     if: ${{ inputs.provider == 'vllm' && inputs.inference-mode == 'record' }}

.github/workflows/README.md (3 lines changed)

@@ -5,10 +5,11 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
 | Name | File | Purpose |
 | ---- | ---- | ------- |
 | Update Changelog | [changelog.yml](changelog.yml) | Creates PR for updating the CHANGELOG.md |
+| API Conformance Tests | [conformance.yml](conformance.yml) | Run the API Conformance test suite on the changes. |
 | Installer CI | [install-script-ci.yml](install-script-ci.yml) | Test the installation script |
 | Integration Auth Tests | [integration-auth-tests.yml](integration-auth-tests.yml) | Run the integration test suite with Kubernetes authentication |
 | SqlStore Integration Tests | [integration-sql-store-tests.yml](integration-sql-store-tests.yml) | Run the integration test suite with SqlStore |
-| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suite from tests/integration in replay mode |
+| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suites from tests/integration in replay mode |
 | Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
 | Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
 | Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |

.github/workflows/conformance.yml (new file, 57 lines)

@@ -0,0 +1,57 @@
+# API Conformance Tests
+# This workflow ensures that API changes maintain backward compatibility and don't break existing integrations
+# It runs schema validation and OpenAPI diff checks to catch breaking changes early
+
+name: API Conformance Tests
+
+run-name: Run the API Conformance test suite on the changes.
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+    types: [opened, synchronize, reopened]
+    paths:
+      - 'llama_stack/**'
+      - '!llama_stack/ui/**'
+      - 'tests/**'
+      - 'uv.lock'
+      - 'pyproject.toml'
+      - '.github/workflows/conformance.yml' # This workflow itself
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
+  # Cancel in-progress runs when new commits are pushed to avoid wasting CI resources
+  cancel-in-progress: true
+
+jobs:
+  # Job to check if API schema changes maintain backward compatibility
+  check-schema-compatibility:
+    runs-on: ubuntu-latest
+    steps:
+      # Using specific version 4.1.7 because 5.0.0 fails when trying to run this locally using `act`
+      # This ensures consistent behavior between local testing and CI
+      - name: Checkout PR Code
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
+
+      # Checkout the base branch to compare against (usually main)
+      # This allows us to diff the current changes against the previous state
+      - name: Checkout Base Branch
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
+        with:
+          ref: ${{ github.event.pull_request.base.ref }}
+          path: 'base'
+
+      # Install oasdiff: https://github.com/oasdiff/oasdiff, a tool for detecting breaking changes in OpenAPI specs.
+      - name: Install oasdiff
+        run: |
+          curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh
+
+      # Run oasdiff to detect breaking changes in the API specification
+      # This step will fail if incompatible changes are detected, preventing breaking changes from being merged
+      - name: Run OpenAPI Breaking Change Diff
+        run: |
+          oasdiff breaking --fail-on ERR base/docs/_static/llama-stack-spec.yaml docs/_static/llama-stack-spec.yaml --match-path '^/v1/openai/v1' \
+            --match-path '^/v1/vector-io' \
+            --match-path '^/v1/vector-dbs'
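
To reproduce the conformance check locally, something like the following sketch should work; it mirrors the CI step above and assumes `oasdiff` is on your PATH and that the base branch spec has been checked out under `base/`, as the workflow does:

```python
import subprocess

# Mirror of the workflow's oasdiff step: compare the base-branch OpenAPI spec
# against the working-tree spec and report breaking changes. The flags and
# paths are taken from the workflow above.
result = subprocess.run(
    [
        "oasdiff", "breaking", "--fail-on", "ERR",
        "base/docs/_static/llama-stack-spec.yaml",
        "docs/_static/llama-stack-spec.yaml",
        "--match-path", "^/v1/openai/v1",
        "--match-path", "^/v1/vector-io",
        "--match-path", "^/v1/vector-dbs",
    ],
    capture_output=True,
    text=True,
)
if result.returncode != 0:
    print("Breaking API changes detected:")
    print(result.stdout or result.stderr)
```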

.github/workflows/integration-tests.yml (20 lines changed)

@@ -1,6 +1,6 @@
 name: Integration Tests (Replay)
 
-run-name: Run the integration test suite from tests/integration in replay mode
+run-name: Run the integration test suites from tests/integration in replay mode
 
 on:
   push:
@@ -32,14 +32,6 @@ on:
       description: 'Test against a specific provider'
       type: string
       default: 'ollama'
-    test-subdirs:
-      description: 'Comma-separated list of test subdirectories to run'
-      type: string
-      default: ''
-    test-pattern:
-      description: 'Regex pattern to pass to pytest -k'
-      type: string
-      default: ''
 
 concurrency:
   # Skip concurrency for pushes to main - each commit should be tested independently
@@ -50,7 +42,7 @@ jobs:
 
   run-replay-mode-tests:
     runs-on: ubuntu-latest
-    name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}
+    name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.test-suite) }}
 
     strategy:
       fail-fast: false
@@ -61,7 +53,7 @@ jobs:
         # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
         python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
         client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
-        run-vision-tests: [true, false]
+        test-suite: [base, vision]
 
     steps:
       - name: Checkout repository
@@ -73,15 +65,13 @@ jobs:
          python-version: ${{ matrix.python-version }}
          client-version: ${{ matrix.client-version }}
          provider: ${{ matrix.provider }}
-          run-vision-tests: ${{ matrix.run-vision-tests }}
+          test-suite: ${{ matrix.test-suite }}
          inference-mode: 'replay'

      - name: Run tests
        uses: ./.github/actions/run-and-record-tests
        with:
-          test-subdirs: ${{ inputs.test-subdirs }}
-          test-pattern: ${{ inputs.test-pattern }}
          stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
          provider: ${{ matrix.provider }}
          inference-mode: 'replay'
-          run-vision-tests: ${{ matrix.run-vision-tests }}
+          test-suite: ${{ matrix.test-suite }}

.github/workflows/record-integration-tests.yml (32 lines changed)

@@ -10,18 +10,18 @@ run-name: Run the integration test suite from tests/integration
 on:
   workflow_dispatch:
     inputs:
-      test-subdirs:
-        description: 'Comma-separated list of test subdirectories to run'
-        type: string
-        default: ''
       test-provider:
         description: 'Test against a specific provider'
         type: string
         default: 'ollama'
-      run-vision-tests:
-        description: 'Whether to run vision tests'
-        type: boolean
-        default: false
+      test-suite:
+        description: 'Test suite to use: base, responses, vision, etc.'
+        type: string
+        default: ''
+      test-subdirs:
+        description: 'Comma-separated list of test subdirectories to run; overrides test-suite'
+        type: string
+        default: ''
       test-pattern:
         description: 'Regex pattern to pass to pytest -k'
         type: string
@@ -38,11 +38,11 @@ jobs:
      - name: Echo workflow inputs
        run: |
          echo "::group::Workflow Inputs"
-          echo "test-subdirs: ${{ inputs.test-subdirs }}"
-          echo "test-provider: ${{ inputs.test-provider }}"
-          echo "run-vision-tests: ${{ inputs.run-vision-tests }}"
-          echo "test-pattern: ${{ inputs.test-pattern }}"
          echo "branch: ${{ github.ref_name }}"
+          echo "test-provider: ${{ inputs.test-provider }}"
+          echo "test-suite: ${{ inputs.test-suite }}"
+          echo "test-subdirs: ${{ inputs.test-subdirs }}"
+          echo "test-pattern: ${{ inputs.test-pattern }}"
          echo "::endgroup::"

      - name: Checkout repository
@@ -56,15 +56,15 @@ jobs:
          python-version: "3.12"  # Use single Python version for recording
          client-version: "latest"
          provider: ${{ inputs.test-provider || 'ollama' }}
-          run-vision-tests: ${{ inputs.run-vision-tests }}
+          test-suite: ${{ inputs.test-suite }}
          inference-mode: 'record'

      - name: Run and record tests
        uses: ./.github/actions/run-and-record-tests
        with:
-          test-pattern: ${{ inputs.test-pattern }}
-          test-subdirs: ${{ inputs.test-subdirs }}
          stack-config: 'server:ci-tests'  # recording must be done with server since more tests are run
          provider: ${{ inputs.test-provider || 'ollama' }}
          inference-mode: 'record'
-          run-vision-tests: ${{ inputs.run-vision-tests }}
+          test-suite: ${{ inputs.test-suite }}
+          test-subdirs: ${{ inputs.test-subdirs }}
+          test-pattern: ${{ inputs.test-pattern }}

.gitignore (2 lines changed)

@@ -26,5 +26,7 @@ venv/
 pytest-report.xml
 .coverage
 .python-version
+AGENTS.md
+server.log
 CLAUDE.md
 .claude/

.pre-commit-config.yaml

@@ -86,7 +86,7 @@ repos:
        language: python
        pass_filenames: false
        require_serial: true
-        files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
+        files: ^llama_stack/distributions/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
      - id: provider-codegen
        name: Provider Codegen
        additional_dependencies:

docs/_static/llama-stack-spec.html (26 lines changed)

@@ -4129,7 +4129,7 @@
           "tags": [
             "Files"
           ],
-          "description": "Upload a file that can be used across various endpoints.\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.",
+          "description": "Upload a file that can be used across various endpoints.\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file. Expected expires_after[anchor] = \"created_at\", expires_after[seconds] = <int>. Seconds must be between 3600 and 2592000 (1 hour to 30 days).",
           "parameters": [],
           "requestBody": {
             "content": {
@@ -4143,11 +4143,33 @@
                   },
                   "purpose": {
                     "$ref": "#/components/schemas/OpenAIFilePurpose"
+                  },
+                  "expires_after_anchor": {
+                    "oneOf": [
+                      {
+                        "type": "string"
+                      },
+                      {
+                        "type": "null"
+                      }
+                    ]
+                  },
+                  "expires_after_seconds": {
+                    "oneOf": [
+                      {
+                        "type": "integer"
+                      },
+                      {
+                        "type": "null"
+                      }
+                    ]
                   }
                 },
                 "required": [
                   "file",
-                  "purpose"
+                  "purpose",
+                  "expires_after_anchor",
+                  "expires_after_seconds"
                 ]
               }
             }

docs/_static/llama-stack-spec.yaml (14 lines changed)

@@ -2933,6 +2933,10 @@ paths:
         - file: The File object (not file name) to be uploaded.
 
         - purpose: The intended purpose of the uploaded file.
+
+        - expires_after: Optional form values describing expiration for the file.
+          Expected expires_after[anchor] = "created_at", expires_after[seconds] = <int>.
+          Seconds must be between 3600 and 2592000 (1 hour to 30 days).
       parameters: []
       requestBody:
         content:
@@ -2945,9 +2949,19 @@ paths:
                   format: binary
                 purpose:
                   $ref: '#/components/schemas/OpenAIFilePurpose'
+                expires_after_anchor:
+                  oneOf:
+                    - type: string
+                    - type: 'null'
+                expires_after_seconds:
+                  oneOf:
+                    - type: integer
+                    - type: 'null'
               required:
                 - file
                 - purpose
+                - expires_after_anchor
+                - expires_after_seconds
         required: true
   /v1/openai/v1/models:
     get:

@@ -40,18 +40,15 @@ The system patches OpenAI and Ollama client methods to intercept calls before th
 
 ### Storage Architecture
 
-Recordings use a two-tier storage system optimized for both speed and debuggability:
+Recordings are stored as JSON files in the recording directory. They are looked up by their request hash.
 
 ```
 recordings/
-├── index.sqlite          # Fast lookup by request hash
 └── responses/
     ├── abc123def456.json  # Individual response files
     └── def789ghi012.json
 ```
 
-**SQLite index** enables O(log n) hash lookups and metadata queries without loading response bodies.
-
 **JSON files** store complete request/response pairs in human-readable format for debugging.
 
 ## Recording Modes
@@ -166,8 +163,8 @@ This preserves type safety - when replayed, you get the same Pydantic objects wi
 Control recording behavior globally:
 
 ```bash
-export LLAMA_STACK_TEST_INFERENCE_MODE=replay
-export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings
+export LLAMA_STACK_TEST_INFERENCE_MODE=replay  # this is the default
+export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings  # default is tests/integration/recordings
 pytest tests/integration/
 ```
 
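
Roughly, the lookup described above amounts to hashing the request and reading a JSON file. A minimal sketch of the idea follows; the exact normalization and hash scheme the stack uses is an assumption here, not something taken from this diff:

```python
import hashlib
import json
from pathlib import Path

RECORDING_DIR = Path("tests/integration/recordings")  # default location per the docs above


def request_hash(method: str, url: str, body: dict) -> str:
    # Assumed normalization: hash the canonical JSON of the request.
    # The real scheme may differ; this only illustrates the lookup flow.
    canonical = json.dumps({"method": method, "url": url, "body": body}, sort_keys=True)
    return hashlib.sha256(canonical.encode()).hexdigest()


def lookup_recording(method: str, url: str, body: dict) -> dict | None:
    # Recordings live as individual JSON files keyed by (a prefix of) the hash.
    path = RECORDING_DIR / "responses" / f"{request_hash(method, url, body)[:12]}.json"
    return json.loads(path.read_text()) if path.exists() else None
```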

@@ -3,6 +3,7 @@ image_name: kubernetes-benchmark-demo
 apis:
 - agents
 - inference
+- safety
 - telemetry
 - tool_runtime
 - vector_io
@@ -30,6 +31,11 @@ providers:
       db: ${env.POSTGRES_DB:=llamastack}
       user: ${env.POSTGRES_USER:=llamastack}
       password: ${env.POSTGRES_PASSWORD:=llamastack}
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
   agents:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -95,6 +101,8 @@ models:
 - model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm-inference
   model_type: llm
+shields:
+- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
 vector_dbs: []
 datasets: []
 scoring_fns: []

@@ -50,6 +50,7 @@ The following models are available by default:
 - `meta/llama-3.2-11b-vision-instruct `
 - `meta/llama-3.2-90b-vision-instruct `
 - `meta/llama-3.3-70b-instruct `
+- `nvidia/vila `
 - `nvidia/llama-3.2-nv-embedqa-1b-v2 `
 - `nvidia/nv-embedqa-e5-v5 `
 - `nvidia/nv-embedqa-mistral-7b-v2 `

@@ -18,12 +18,13 @@ embedding_model_id = (
 ).identifier
 embedding_dimension = em.metadata["embedding_dimension"]
 
-_ = client.vector_dbs.register(
+vector_db = client.vector_dbs.register(
     vector_db_id=vector_db_id,
     embedding_model=embedding_model_id,
     embedding_dimension=embedding_dimension,
     provider_id="faiss",
 )
+vector_db_id = vector_db.identifier
 source = "https://www.paulgraham.com/greatwork.html"
 print("rag_tool> Ingesting document:", source)
 document = RAGDocument(
@@ -35,7 +36,7 @@ document = RAGDocument(
 client.tool_runtime.rag_tool.insert(
     documents=[document],
     vector_db_id=vector_db_id,
-    chunk_size_in_tokens=50,
+    chunk_size_in_tokens=100,
 )
 agent = Agent(
     client,

@@ -15,8 +15,8 @@ AWS Bedrock inference provider for accessing various AI models through AWS's man
 | `profile_name` | `str \| None` | No | | The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE |
 | `total_max_attempts` | `int \| None` | No | | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS |
 | `retry_mode` | `str \| None` | No | | A string representing the type of retries Boto3 will perform.Default use environment variable: AWS_RETRY_MODE |
-| `connect_timeout` | `float \| None` | No | 60 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
-| `read_timeout` | `float \| None` | No | 60 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. |
+| `connect_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
+| `read_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. |
 | `session_ttl` | `int \| None` | No | 3600 | The time in seconds till a session expires. The default is 3600 seconds (1 hour). |
 
 ## Sample Configuration

@@ -15,8 +15,8 @@ AWS Bedrock safety provider for content moderation using AWS's safety services.
 | `profile_name` | `str \| None` | No | | The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE |
 | `total_max_attempts` | `int \| None` | No | | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS |
 | `retry_mode` | `str \| None` | No | | A string representing the type of retries Boto3 will perform.Default use environment variable: AWS_RETRY_MODE |
-| `connect_timeout` | `float \| None` | No | 60 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
-| `read_timeout` | `float \| None` | No | 60 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. |
+| `connect_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
+| `read_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. |
 | `session_ttl` | `int \| None` | No | 3600 | The time in seconds till a session expires. The default is 3600 seconds (1 hour). |
 
 ## Sample Configuration

@@ -12,6 +12,60 @@ That means you'll get fast and efficient vector retrieval.
 - Easy to use
 - Fully integrated with Llama Stack
 
+There are three implementations of search for PGVectorIndex available:
+
+1. Vector Search:
+- How it works:
+  - Uses PostgreSQL's vector extension (pgvector) to perform similarity search
+  - Compares query embeddings against stored embeddings using Cosine distance or other distance metrics
+  - Eg. SQL query: SELECT document, embedding <=> %s::vector AS distance FROM table ORDER BY distance
+
+- Characteristics:
+  - Semantic understanding - finds documents similar in meaning even if they don't share keywords
+  - Works with high-dimensional vector embeddings (typically 768, 1024, or higher dimensions)
+  - Best for: Finding conceptually related content, handling synonyms, cross-language search
+
+2. Keyword Search
+- How it works:
+  - Uses PostgreSQL's full-text search capabilities with tsvector and ts_rank
+  - Converts text to searchable tokens using to_tsvector('english', text). Default language is English.
+  - Eg. SQL query: SELECT document, ts_rank(tokenized_content, plainto_tsquery('english', %s)) AS score
+
+- Characteristics:
+  - Lexical matching - finds exact keyword matches and variations
+  - Uses GIN (Generalized Inverted Index) for fast text search performance
+  - Scoring: Uses PostgreSQL's ts_rank function for relevance scoring
+  - Best for: Exact term matching, proper names, technical terms, Boolean-style queries
+
+3. Hybrid Search
+- How it works:
+  - Combines both vector and keyword search results
+  - Runs both searches independently, then merges results using configurable reranking
+
+- Two reranking strategies available:
+  - Reciprocal Rank Fusion (RRF) - (default: 60.0)
+  - Weighted Average - (default: 0.5)
+
+- Characteristics:
+  - Best of both worlds: semantic understanding + exact matching
+  - Documents appearing in both searches get boosted scores
+  - Configurable balance between semantic and lexical matching
+  - Best for: General-purpose search where you want both precision and recall
+
+4. Database Schema
+The PGVector implementation stores data optimized for all three search types:
+CREATE TABLE vector_store_xxx (
+    id TEXT PRIMARY KEY,
+    document JSONB,               -- Original document
+    embedding vector(dimension),  -- For vector search
+    content_text TEXT,            -- Raw text content
+    tokenized_content TSVECTOR    -- For keyword search
+);
+
+-- Indexes for performance
+CREATE INDEX content_gin_idx ON table USING GIN(tokenized_content);  -- Keyword search
+-- Vector index created automatically by pgvector
+
 ## Usage
 
 To use PGVector in your Llama Stack project, follow these steps:
@@ -20,6 +74,25 @@ To use PGVector in your Llama Stack project, follow these steps:
 2. Configure your Llama Stack project to use pgvector. (e.g. remote::pgvector).
 3. Start storing and querying vectors.
 
+## Example: setting up your environment to use PGVector
+
+1. Export env vars:
+```bash
+export ENABLE_PGVECTOR=true
+export PGVECTOR_HOST=localhost
+export PGVECTOR_PORT=5432
+export PGVECTOR_DB=llamastack
+export PGVECTOR_USER=llamastack
+export PGVECTOR_PASSWORD=llamastack
+```
+
+2. Create DB:
+```bash
+psql -h localhost -U postgres -c "CREATE ROLE llamastack LOGIN PASSWORD 'llamastack';"
+psql -h localhost -U postgres -c "CREATE DATABASE llamastack OWNER llamastack;"
+psql -h localhost -U llamastack -d llamastack -c "CREATE EXTENSION IF NOT EXISTS vector;"
+```
+
 ## Installation
 
 You can install PGVector using docker:
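
As a concrete illustration of the vector-search SQL shown in this section, here is a minimal Python sketch using psycopg2. The connection values follow the env-var example above, and the table name `vector_store_xxx` is the placeholder from the schema sketch, so treat both as illustrative:

```python
import psycopg2

# Connection values mirror the env-var example above (illustrative).
conn = psycopg2.connect(
    host="localhost", port=5432, dbname="llamastack",
    user="llamastack", password="llamastack",
)

query_embedding = [0.1] * 768  # embedding of the user query (placeholder values)

with conn.cursor() as cur:
    # Cosine-distance nearest neighbors, as in the "Vector Search" example SQL.
    # pgvector accepts a bracketed list literal cast to ::vector.
    cur.execute(
        """
        SELECT document, embedding <=> %s::vector AS distance
        FROM vector_store_xxx
        ORDER BY distance
        LIMIT 5
        """,
        (str(query_embedding),),
    )
    for document, distance in cur.fetchall():
        print(distance, document)
```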

@@ -17,6 +17,7 @@ Weaviate supports:
 - Metadata filtering
 - Multi-modal retrieval
 
+
 ## Usage
 
 To use Weaviate in your Llama Stack project, follow these steps:

@@ -478,7 +478,6 @@ llama-stack-client scoring_functions list
 ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
 ┃ identifier ┃ provider_id ┃ description ┃ type ┃
 ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
-│ basic::bfcl │ basic │ BFCL complex scoring │ scoring_function │
 │ basic::docvqa │ basic │ DocVQA Visual Question & Answer scoring function │ scoring_function │
 │ basic::equality │ basic │ Returns 1.0 if the input is equal to the target, 0.0 │ scoring_function │
 │ │ │ otherwise. │ │

@@ -5,10 +5,10 @@
 # the root directory of this source tree.
 
 from enum import StrEnum
-from typing import Annotated, Literal, Protocol, runtime_checkable
+from typing import Annotated, ClassVar, Literal, Protocol, runtime_checkable
 
 from fastapi import File, Form, Response, UploadFile
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 from llama_stack.apis.common.responses import Order
 from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@@ -49,6 +49,23 @@ class OpenAIFileObject(BaseModel):
     purpose: OpenAIFilePurpose
 
 
+@json_schema_type
+class ExpiresAfter(BaseModel):
+    """
+    Control expiration of uploaded files.
+
+    Params:
+     - anchor, must be "created_at"
+     - seconds, must be int between 3600 and 2592000 (1 hour to 30 days)
+    """
+
+    MIN: ClassVar[int] = 3600  # 1 hour
+    MAX: ClassVar[int] = 2592000  # 30 days
+
+    anchor: Literal["created_at"]
+    seconds: int = Field(..., ge=3600, le=2592000)
+
+
 @json_schema_type
 class ListOpenAIFileResponse(BaseModel):
     """
@@ -92,6 +109,9 @@ class Files(Protocol):
         self,
         file: Annotated[UploadFile, File()],
         purpose: Annotated[OpenAIFilePurpose, Form()],
+        expires_after_anchor: Annotated[str | None, Form(alias="expires_after[anchor]")] = None,
+        expires_after_seconds: Annotated[int | None, Form(alias="expires_after[seconds]")] = None,
+        # TODO: expires_after is producing strange openapi spec, params are showing up as a required w/ oneOf being null
     ) -> OpenAIFileObject:
         """
         Upload a file that can be used across various endpoints.
@@ -99,6 +119,7 @@ class Files(Protocol):
         The file upload should be a multipart form request with:
         - file: The File object (not file name) to be uploaded.
         - purpose: The intended purpose of the uploaded file.
+        - expires_after: Optional form values describing expiration for the file. Expected expires_after[anchor] = "created_at", expires_after[seconds] = <int>. Seconds must be between 3600 and 2592000 (1 hour to 30 days).
 
         :param file: The uploaded file object containing content and metadata (filename, content_type, etc.).
         :param purpose: The intended purpose of the uploaded file (e.g., "assistants", "fine-tune").
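
Client-side, the new expiration fields translate into bracketed multipart form keys. A hedged sketch using `requests` follows; the base URL and port assume a locally running stack server, and the bracketed keys mirror the `Form(alias=...)` declarations above:

```python
import requests

# Illustrative upload against the OpenAI-compatible files endpoint.
resp = requests.post(
    "http://localhost:8321/v1/openai/v1/files",  # assumed local server address
    files={"file": ("notes.txt", b"hello world")},
    data={
        "purpose": "assistants",
        # Bracketed form keys match the Form(alias=...) declarations above.
        "expires_after[anchor]": "created_at",
        "expires_after[seconds]": 86400,  # must be within [3600, 2592000]
    },
)
resp.raise_for_status()
print(resp.json()["id"])
```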

@@ -284,7 +284,15 @@ async def instantiate_providers(
         if provider.provider_id is None:
             continue
 
-        deps = {a: impls[a] for a in provider.spec.api_dependencies}
+        try:
+            deps = {a: impls[a] for a in provider.spec.api_dependencies}
+        except KeyError as e:
+            missing_api = e.args[0]
+            raise RuntimeError(
+                f"Failed to resolve '{provider.spec.api.value}' provider '{provider.provider_id}' of type '{provider.spec.provider_type}': "
+                f"required dependency '{missing_api.value}' is not available. "
+                f"Please add a '{missing_api.value}' provider to your configuration or check if the provider is properly configured."
+            ) from e
         for a in provider.spec.optional_api_dependencies:
             if a in impls:
                 deps[a] = impls[a]

@@ -527,7 +527,7 @@ class InferenceRouter(Inference):
 
         # Store the response with the ID that will be returned to the client
         if self.store:
-            await self.store.store_chat_completion(response, messages)
+            asyncio.create_task(self.store.store_chat_completion(response, messages))
 
         if self.telemetry:
             metrics = self._construct_metrics(
@@ -755,7 +755,7 @@ class InferenceRouter(Inference):
                     choices_data[idx] = {
                         "content_parts": [],
                         "tool_calls_builder": {},
-                        "finish_reason": None,
+                        "finish_reason": "stop",
                         "logprobs_content_parts": [],
                     }
                 current_choice_data = choices_data[idx]
@@ -855,4 +855,4 @@ class InferenceRouter(Inference):
             object="chat.completion",
         )
         logger.debug(f"InferenceRouter.completion_response: {final_response}")
-        await self.store.store_chat_completion(final_response, messages)
+        asyncio.create_task(self.store.store_chat_completion(final_response, messages))
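
Switching from `await` to `asyncio.create_task` takes the store write off the response path. One caveat with bare `create_task` (not addressed in this diff) is that the event loop holds only a weak reference to tasks, so an unreferenced task can be garbage-collected before it finishes. The usual mitigation looks like this illustrative sketch:

```python
import asyncio
from collections.abc import Coroutine

# Keep strong references to in-flight background tasks so they are not
# garbage-collected mid-flight (a well-known create_task pitfall).
_background_tasks: set[asyncio.Task] = set()


def fire_and_forget(coro: Coroutine) -> None:
    task = asyncio.create_task(coro)
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)
```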

@@ -52,7 +52,6 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
         provider_vector_db_id: str | None = None,
         vector_db_name: str | None = None,
     ) -> VectorDB:
-        provider_vector_db_id = provider_vector_db_id or vector_db_id
         if provider_id is None:
             if len(self.impls_by_provider_id) > 0:
                 provider_id = list(self.impls_by_provider_id.keys())[0]
@@ -69,14 +68,33 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
             raise ModelTypeError(embedding_model, model.model_type, ModelType.embedding)
         if "embedding_dimension" not in model.metadata:
             raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
+
+        provider = self.impls_by_provider_id[provider_id]
+        logger.warning(
+            "VectorDB is being deprecated in future releases in favor of VectorStore. Please migrate your usage accordingly."
+        )
+        vector_store = await provider.openai_create_vector_store(
+            name=vector_db_name or vector_db_id,
+            embedding_model=embedding_model,
+            embedding_dimension=model.metadata["embedding_dimension"],
+            provider_id=provider_id,
+            provider_vector_db_id=provider_vector_db_id,
+        )
+
+        vector_store_id = vector_store.id
+        actual_provider_vector_db_id = provider_vector_db_id or vector_store_id
+        logger.warning(
+            f"Ignoring vector_db_id {vector_db_id} and using vector_store_id {vector_store_id} instead. Setting VectorDB {vector_db_id} to VectorDB.vector_db_name"
+        )
+
         vector_db_data = {
-            "identifier": vector_db_id,
+            "identifier": vector_store_id,
             "type": ResourceType.vector_db.value,
             "provider_id": provider_id,
-            "provider_resource_id": provider_vector_db_id,
+            "provider_resource_id": actual_provider_vector_db_id,
             "embedding_model": embedding_model,
             "embedding_dimension": model.metadata["embedding_dimension"],
-            "vector_db_name": vector_db_name,
+            "vector_db_name": vector_store.name,
         }
         vector_db = TypeAdapter(VectorDBWithOwner).validate_python(vector_db_data)
         await self.register_object(vector_db)

@@ -132,15 +132,17 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationErro
            },
        )
    elif isinstance(exc, ConflictError):
-        return HTTPException(status_code=409, detail=str(exc))
+        return HTTPException(status_code=httpx.codes.CONFLICT, detail=str(exc))
    elif isinstance(exc, ResourceNotFoundError):
-        return HTTPException(status_code=404, detail=str(exc))
+        return HTTPException(status_code=httpx.codes.NOT_FOUND, detail=str(exc))
    elif isinstance(exc, ValueError):
        return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=f"Invalid value: {str(exc)}")
    elif isinstance(exc, BadRequestError):
        return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=str(exc))
    elif isinstance(exc, PermissionError | AccessDeniedError):
        return HTTPException(status_code=httpx.codes.FORBIDDEN, detail=f"Permission denied: {str(exc)}")
+    elif isinstance(exc, ConnectionError | httpx.ConnectError):
+        return HTTPException(status_code=httpx.codes.BAD_GATEWAY, detail=str(exc))
    elif isinstance(exc, asyncio.TimeoutError | TimeoutError):
        return HTTPException(status_code=httpx.codes.GATEWAY_TIMEOUT, detail=f"Operation timed out: {str(exc)}")
    elif isinstance(exc, NotImplementedError):
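
The numeric status values are unchanged by the move to `httpx.codes`; the enum members compare equal to the integers they replace, as this quick check illustrates:

```python
import httpx

# httpx.codes is an IntEnum, so these compare equal to the bare literals
# previously hard-coded in translate_exception.
assert httpx.codes.CONFLICT == 409
assert httpx.codes.NOT_FOUND == 404
assert httpx.codes.BAD_GATEWAY == 502
assert httpx.codes.GATEWAY_TIMEOUT == 504
```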

@@ -105,12 +105,12 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
 
         method = getattr(impls[api], register_method)
         for obj in objects:
-            logger.debug(f"registering {rsrc.capitalize()} {obj} for provider {obj.provider_id}")
-
-            # Do not register models on disabled providers
-            if hasattr(obj, "provider_id") and (not obj.provider_id or obj.provider_id == "__disabled__"):
-                logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled provider.")
-                continue
+            if hasattr(obj, "provider_id"):
+                # Do not register models on disabled providers
+                if not obj.provider_id or obj.provider_id == "__disabled__":
+                    logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled provider.")
+                    continue
+                logger.debug(f"registering {rsrc.capitalize()} {obj} for provider {obj.provider_id}")
 
             # we want to maintain the type information in arguments to method.
             # instead of method(**obj.model_dump()), which may convert a typed attr to a dict,

@@ -11,9 +11,7 @@ from ..starter.starter import get_distribution_template as get_starter_distribut
 
 
 def get_distribution_template() -> DistributionTemplate:
-    template = get_starter_distribution_template()
-    name = "ci-tests"
-    template.name = name
+    template = get_starter_distribution_template(name="ci-tests")
     template.description = "CI tests for Llama Stack"
 
     return template

@@ -89,28 +89,28 @@ providers:
     config:
       kvstore:
         type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/faiss_store.db
   - provider_id: sqlite-vec
     provider_type: inline::sqlite-vec
     config:
-      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db
+      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/sqlite_vec.db
       kvstore:
         type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/sqlite_vec_registry.db
   - provider_id: ${env.MILVUS_URL:+milvus}
     provider_type: inline::milvus
     config:
-      db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db
+      db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/ci-tests}/milvus.db
       kvstore:
         type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/milvus_registry.db
   - provider_id: ${env.CHROMADB_URL:+chromadb}
     provider_type: remote::chromadb
     config:
       url: ${env.CHROMADB_URL:=}
       kvstore:
         type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests/}/chroma_remote_registry.db
   - provider_id: ${env.PGVECTOR_DB:+pgvector}
     provider_type: remote::pgvector
     config:
@@ -121,15 +121,15 @@ providers:
       password: ${env.PGVECTOR_PASSWORD:=}
       kvstore:
         type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/pgvector_registry.db
   files:
   - provider_id: meta-reference-files
     provider_type: inline::localfs
     config:
-      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/ci-tests/files}
       metadata_store:
         type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/files_metadata.db
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard

@@ -134,6 +134,11 @@ models:
   provider_id: nvidia
   provider_model_id: meta/llama-3.3-70b-instruct
   model_type: llm
+- metadata: {}
+  model_id: nvidia/vila
+  provider_id: nvidia
+  provider_model_id: nvidia/vila
+  model_type: llm
 - metadata:
     embedding_dimension: 2048
     context_length: 8192

@@ -43,7 +43,7 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo
         "openai",
         [
             ProviderModelEntry(
-                provider_model_id="openai/gpt-4o",
+                provider_model_id="gpt-4o",
                 model_type=ModelType.llm,
             )
         ],
@@ -53,7 +53,7 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo
         "anthropic",
         [
             ProviderModelEntry(
-                provider_model_id="anthropic/claude-3-5-sonnet-latest",
+                provider_model_id="claude-3-5-sonnet-latest",
                 model_type=ModelType.llm,
             )
         ],
@@ -206,13 +206,6 @@ def get_distribution_template() -> DistributionTemplate:
                 uri="huggingface://datasets/llamastack/math_500?split=test",
             ),
         ),
-        DatasetInput(
-            dataset_id="bfcl",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(
-                uri="huggingface://datasets/llamastack/bfcl_v3?split=train",
-            ),
-        ),
         DatasetInput(
             dataset_id="ifeval",
             purpose=DatasetPurpose.eval_messages_answer,
@ -250,11 +243,6 @@ def get_distribution_template() -> DistributionTemplate:
|
||||||
dataset_id="math_500",
|
dataset_id="math_500",
|
||||||
scoring_functions=["basic::regex_parser_math_response"],
|
scoring_functions=["basic::regex_parser_math_response"],
|
||||||
),
|
),
|
||||||
BenchmarkInput(
|
|
||||||
benchmark_id="meta-reference-bfcl",
|
|
||||||
dataset_id="bfcl",
|
|
||||||
scoring_functions=["basic::bfcl"],
|
|
||||||
),
|
|
||||||
BenchmarkInput(
|
BenchmarkInput(
|
||||||
benchmark_id="meta-reference-ifeval",
|
benchmark_id="meta-reference-ifeval",
|
||||||
dataset_id="ifeval",
|
dataset_id="ifeval",
|
||||||
|
|
|
@@ -136,14 +136,14 @@ inference_store:
   db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/open-benchmark}/inference_store.db
 models:
 - metadata: {}
-  model_id: openai/gpt-4o
+  model_id: gpt-4o
   provider_id: openai
-  provider_model_id: openai/gpt-4o
+  provider_model_id: gpt-4o
   model_type: llm
 - metadata: {}
-  model_id: anthropic/claude-3-5-sonnet-latest
+  model_id: claude-3-5-sonnet-latest
   provider_id: anthropic
-  provider_model_id: anthropic/claude-3-5-sonnet-latest
+  provider_model_id: claude-3-5-sonnet-latest
   model_type: llm
 - metadata: {}
   model_id: gemini/gemini-1.5-flash
@@ -188,12 +188,6 @@ datasets:
     uri: huggingface://datasets/llamastack/math_500?split=test
   metadata: {}
   dataset_id: math_500
-- purpose: eval/messages-answer
-  source:
-    type: uri
-    uri: huggingface://datasets/llamastack/bfcl_v3?split=train
-  metadata: {}
-  dataset_id: bfcl
 - purpose: eval/messages-answer
   source:
     type: uri
@@ -228,11 +222,6 @@ benchmarks:
   - basic::regex_parser_math_response
   metadata: {}
   benchmark_id: meta-reference-math-500
-- dataset_id: bfcl
-  scoring_functions:
-  - basic::bfcl
-  metadata: {}
-  benchmark_id: meta-reference-bfcl
 - dataset_id: ifeval
   scoring_functions:
   - basic::ifeval
@@ -89,28 +89,28 @@ providers:
     config:
       kvstore:
         type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/faiss_store.db
   - provider_id: sqlite-vec
     provider_type: inline::sqlite-vec
     config:
-      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db
+      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec.db
       kvstore:
         type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec_registry.db
   - provider_id: ${env.MILVUS_URL:+milvus}
     provider_type: inline::milvus
     config:
-      db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db
+      db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter-gpu}/milvus.db
       kvstore:
         type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/milvus_registry.db
   - provider_id: ${env.CHROMADB_URL:+chromadb}
     provider_type: remote::chromadb
     config:
       url: ${env.CHROMADB_URL:=}
       kvstore:
         type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu/}/chroma_remote_registry.db
   - provider_id: ${env.PGVECTOR_DB:+pgvector}
     provider_type: remote::pgvector
     config:
@@ -121,15 +121,15 @@ providers:
       password: ${env.PGVECTOR_PASSWORD:=}
       kvstore:
         type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/pgvector_registry.db
   files:
   - provider_id: meta-reference-files
     provider_type: inline::localfs
     config:
-      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter-gpu/files}
       metadata_store:
         type: sqlite
-        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/files_metadata.db
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
@@ -11,9 +11,7 @@ from ..starter.starter import get_distribution_template as get_starter_distribut
 
 
 def get_distribution_template() -> DistributionTemplate:
-    template = get_starter_distribution_template()
-    name = "starter-gpu"
-    template.name = name
+    template = get_starter_distribution_template(name="starter-gpu")
     template.description = "Quick start template for running Llama Stack with several popular providers. This distribution is intended for GPU-enabled environments."
 
     template.providers["post_training"] = [
@@ -99,9 +99,8 @@ def get_remote_inference_providers() -> list[Provider]:
     return inference_providers
 
 
-def get_distribution_template() -> DistributionTemplate:
+def get_distribution_template(name: str = "starter") -> DistributionTemplate:
     remote_inference_providers = get_remote_inference_providers()
-    name = "starter"
 
     providers = {
         "inference": [BuildProvider(provider_type=p.provider_type, module=p.module) for p in remote_inference_providers]
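Together, these two hunks replace post-construction mutation with a constructor parameter, so every name-derived artifact (run.yaml db_paths, storage dirs) is rendered consistently for the derived distribution. A sketch of the reuse pattern this enables, mirroring the names in the diff:

    # Sketch only: a derived distribution passes its own name up front
    # instead of patching template.name afterwards.
    def get_starter_gpu_template():
        template = get_distribution_template(name="starter-gpu")
        template.description = "GPU-enabled variant of the starter distribution."
        return template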
@@ -178,9 +178,9 @@ class ReferenceBatchesImpl(Batches):
 
         # TODO: set expiration time for garbage collection
 
-        if endpoint not in ["/v1/chat/completions"]:
+        if endpoint not in ["/v1/chat/completions", "/v1/completions"]:
             raise ValueError(
-                f"Invalid endpoint: {endpoint}. Supported values: /v1/chat/completions. Code: invalid_value. Param: endpoint",
+                f"Invalid endpoint: {endpoint}. Supported values: /v1/chat/completions, /v1/completions. Code: invalid_value. Param: endpoint",
             )
 
         if completion_window != "24h":
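With this change the reference batches provider accepts plain completions as well as chat completions. A hedged sketch of a batch input file: the field names (custom_id, method, url, body) follow the OpenAI-style batch line format that the provider's request objects expose further down, and the model name is a placeholder:

    import json

    requests = [
        {
            "custom_id": "req-1",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {"model": "llama3.2:3b", "messages": [{"role": "user", "content": "Hi"}]},
        },
        {
            "custom_id": "req-2",
            "method": "POST",
            "url": "/v1/completions",
            "body": {"model": "llama3.2:3b", "prompt": "Once upon a time"},
        },
    ]
    with open("batch_input.jsonl", "w") as f:
        for r in requests:
            f.write(json.dumps(r) + "\n")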
@@ -424,13 +424,21 @@ class ReferenceBatchesImpl(Batches):
                     )
                     valid = False
 
-                for param, expected_type, type_string in [
-                    ("model", str, "a string"),
-                    # messages is specific to /v1/chat/completions
-                    # we could skip validating messages here and let inference fail. however,
-                    # that would be a very expensive way to find out messages is wrong.
-                    ("messages", list, "an array"),  # TODO: allow messages to be a string?
-                ]:
+                if batch.endpoint == "/v1/chat/completions":
+                    required_params = [
+                        ("model", str, "a string"),
+                        # messages is specific to /v1/chat/completions
+                        # we could skip validating messages here and let inference fail. however,
+                        # that would be a very expensive way to find out messages is wrong.
+                        ("messages", list, "an array"),  # TODO: allow messages to be a string?
+                    ]
+                else:  # /v1/completions
+                    required_params = [
+                        ("model", str, "a string"),
+                        ("prompt", str, "a string"),  # TODO: allow prompt to be a list of strings??
+                    ]
+
+                for param, expected_type, type_string in required_params:
                     if param not in body:
                         errors.append(
                             BatchError(
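For reference, the validation introduced above expressed as data rather than provider code (a sketch; the tuples mirror the diff):

    REQUIRED_PARAMS = {
        "/v1/chat/completions": [("model", str, "a string"), ("messages", list, "an array")],
        "/v1/completions": [("model", str, "a string"), ("prompt", str, "a string")],
    }

    def missing_params(endpoint: str, body: dict) -> list[str]:
        return [name for name, _, _ in REQUIRED_PARAMS[endpoint] if name not in body]

    assert missing_params("/v1/completions", {"model": "m"}) == ["prompt"]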
@@ -591,20 +599,37 @@ class ReferenceBatchesImpl(Batches):
 
         try:
             # TODO(SECURITY): review body for security issues
-            request.body["messages"] = [convert_to_openai_message_param(msg) for msg in request.body["messages"]]
-            chat_response = await self.inference_api.openai_chat_completion(**request.body)
-
-            # this is for mypy, we don't allow streaming so we'll get the right type
-            assert hasattr(chat_response, "model_dump_json"), "Chat response must have model_dump_json method"
-            return {
-                "id": request_id,
-                "custom_id": request.custom_id,
-                "response": {
-                    "status_code": 200,
-                    "request_id": request_id,  # TODO: should this be different?
-                    "body": chat_response.model_dump_json(),
-                },
-            }
+            if request.url == "/v1/chat/completions":
+                request.body["messages"] = [convert_to_openai_message_param(msg) for msg in request.body["messages"]]
+                chat_response = await self.inference_api.openai_chat_completion(**request.body)
+
+                # this is for mypy, we don't allow streaming so we'll get the right type
+                assert hasattr(chat_response, "model_dump_json"), "Chat response must have model_dump_json method"
+                return {
+                    "id": request_id,
+                    "custom_id": request.custom_id,
+                    "response": {
+                        "status_code": 200,
+                        "request_id": request_id,  # TODO: should this be different?
+                        "body": chat_response.model_dump_json(),
+                    },
+                }
+            else:  # /v1/completions
+                completion_response = await self.inference_api.openai_completion(**request.body)
+
+                # this is for mypy, we don't allow streaming so we'll get the right type
+                assert hasattr(completion_response, "model_dump_json"), (
+                    "Completion response must have model_dump_json method"
+                )
+                return {
+                    "id": request_id,
+                    "custom_id": request.custom_id,
+                    "response": {
+                        "status_code": 200,
+                        "request_id": request_id,
+                        "body": completion_response.model_dump_json(),
+                    },
+                }
         except Exception as e:
             logger.info(f"Error processing request {request.custom_id} in batch {batch_id}: {e}")
             return {
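One consequence of the branch above worth noting for consumers: each per-request result embeds the inference response via model_dump_json(), so the "body" field of an output line is itself a JSON string and needs a second parse. A hedged sketch of reading such output (the file name is a placeholder):

    import json

    with open("batch_output.jsonl") as f:
        for line in f:
            record = json.loads(line)
            if record["response"]["status_code"] == 200:
                body = json.loads(record["response"]["body"])  # nested JSON string
                print(record["custom_id"], body.get("choices", [{}])[0])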
@@ -86,11 +86,16 @@ class LocalfsFilesImpl(Files):
         self,
         file: Annotated[UploadFile, File()],
         purpose: Annotated[OpenAIFilePurpose, Form()],
+        expires_after_anchor: Annotated[str | None, Form(alias="expires_after[anchor]")] = None,
+        expires_after_seconds: Annotated[int | None, Form(alias="expires_after[seconds]")] = None,
     ) -> OpenAIFileObject:
         """Upload a file that can be used across various endpoints."""
         if not self.sql_store:
             raise RuntimeError("Files provider not initialized")
 
+        if expires_after_anchor is not None or expires_after_seconds is not None:
+            raise NotImplementedError("File expiration is not supported by this provider")
+
         file_id = self._generate_file_id()
         file_path = self._get_file_path(file_id)
 
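The new form fields mirror OpenAI's bracketed multipart keys, so uploads that request expiration now fail fast instead of silently ignoring the fields. A hedged raw-HTTP sketch (the route prefix and anchor value are assumptions, not confirmed by this diff):

    import httpx

    resp = httpx.post(
        "http://localhost:8321/v1/openai/v1/files",  # hypothetical route prefix
        files={"file": ("notes.txt", b"hello")},
        data={
            "purpose": "assistants",
            "expires_after[anchor]": "created_at",
            "expires_after[seconds]": "3600",
        },
    )
    print(resp.status_code)  # expect an error: expiration is not implemented here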
@@ -22,7 +22,6 @@ from llama_stack.providers.utils.common.data_schema_validator import (
 )
 
 from .config import BasicScoringConfig
-from .scoring_fn.bfcl_scoring_fn import BFCLScoringFn
 from .scoring_fn.docvqa_scoring_fn import DocVQAScoringFn
 from .scoring_fn.equality_scoring_fn import EqualityScoringFn
 from .scoring_fn.ifeval_scoring_fn import IfEvalScoringFn
@@ -37,7 +36,6 @@ FIXED_FNS = [
     SubsetOfScoringFn,
     RegexParserScoringFn,
     RegexParserMathResponseScoringFn,
-    BFCLScoringFn,
     IfEvalScoringFn,
     DocVQAScoringFn,
 ]
@@ -1,93 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import json
-import re
-from typing import Any
-
-from llama_stack.apis.scoring import ScoringResultRow
-from llama_stack.apis.scoring_functions import ScoringFnParams
-from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
-
-from ..utils.bfcl.ast_parser import decode_ast
-from ..utils.bfcl.checker import ast_checker, is_empty_output
-from .fn_defs.bfcl import bfcl
-
-
-def postprocess(x: dict[str, Any], test_category: str) -> dict[str, Any]:
-    contain_func_call = False
-    error = None
-    error_type = None
-    checker_result = {}
-    try:
-        prediction = decode_ast(x["generated_answer"], x["language"]) or ""
-        contain_func_call = True
-        # if not is_function_calling_format_output(prediction):
-        if is_empty_output(prediction):
-            contain_func_call = False
-            error = "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability."
-            error_type = "ast_decoder:decoder_wrong_output_format"
-        else:
-            checker_result = ast_checker(
-                json.loads(x["function"]),
-                prediction,
-                json.loads(x["ground_truth"]),
-                x["language"],
-                test_category=test_category,
-                model_name="",
-            )
-    except Exception as e:
-        prediction = ""
-        error = f"Invalid syntax. Failed to decode AST. {str(e)}"
-        error_type = "ast_decoder:decoder_failed"
-    return {
-        "prediction": prediction,
-        "contain_func_call": contain_func_call,
-        "valid": checker_result.get("valid", False),
-        "error": error or checker_result.get("error", ""),
-        "error_type": error_type or checker_result.get("error_type", ""),
-    }
-
-
-def gen_valid(x: dict[str, Any]) -> dict[str, float]:
-    return {"valid": x["valid"]}
-
-
-def gen_relevance_acc(x: dict[str, Any]) -> dict[str, float]:
-    # This function serves for both relevance and irrelevance tests, which share the exact opposite logic.
-    # If `test_category` is "irrelevance", the model is expected to output no function call.
-    # No function call means either the AST decoding fails (a error message is generated) or the decoded AST does not contain any function call (such as a empty list, `[]`).
-    # If `test_category` is "relevance", the model is expected to output to a function call, and empty list doesn't count as a function call.
-    acc = not x["contain_func_call"] if "irrelevance" in x["id"] else x["contain_func_call"]
-    return {"valid": float(acc)}
-
-
-class BFCLScoringFn(RegisteredBaseScoringFn):
-    """
-    A scoring_fn for BFCL
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.supported_fn_defs_registry = {
-            bfcl.identifier: bfcl,
-        }
-
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = "bfcl",
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        test_category = re.sub(r"_[0-9_-]+$", "", input_row["id"])
-        score_result = postprocess(input_row, test_category)
-        if test_category in {"irrelevance", "live_relevance", "live_irrelevance"}:
-            score = gen_relevance_acc(score_result)["valid"]
-        else:
-            score = gen_valid(score_result)["valid"]
-        return {
-            "score": float(score),
-        }
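For the record, the category routing in the removed scorer hinged on stripping a trailing numeric suffix from the row id; a small worked example of that regex:

    import re

    for row_id in ["simple_123", "irrelevance_7", "live_irrelevance_42-1"]:
        print(row_id, "->", re.sub(r"_[0-9_-]+$", "", row_id))
    # simple_123 -> simple
    # irrelevance_7 -> irrelevance
    # live_irrelevance_42-1 -> live_irrelevance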
@@ -1,21 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack.apis.common.type_system import NumberType
-from llama_stack.apis.scoring_functions import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    ScoringFn,
-)
-
-bfcl = ScoringFn(
-    identifier="basic::bfcl",
-    description="BFCL complex scoring",
-    return_type=NumberType(),
-    provider_id="basic",
-    provider_resource_id="bfcl",
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.accuracy]),
-)
@@ -1,296 +0,0 @@
-# ruff: noqa
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import ast
-
-from .tree_sitter import get_parser
-
-
-def parse_java_function_call(source_code):
-    if not source_code.endswith(";"):
-        source_code += ";"  # Necessary for the parser not to register an error
-    parser = get_parser("java")
-    tree = parser.parse(bytes(source_code, "utf8"))
-    root_node = tree.root_node
-
-    if root_node.has_error:
-        raise Exception("Error parsing java the source code.")
-
-    def get_text(node):
-        """Returns the text represented by the node."""
-        return source_code[node.start_byte : node.end_byte]
-
-    def traverse_node(node, nested=False):
-        if node.type == "string_literal":
-            if nested:
-                return get_text(node)
-            # Strip surrounding quotes from string literals
-            return get_text(node)[1:-1]
-        elif node.type == "character_literal":
-            if nested:
-                return get_text(node)
-            # Strip surrounding single quotes from character literals
-            return get_text(node)[1:-1]
-        """Traverse the node to collect texts for complex structures."""
-        if node.type in [
-            "identifier",
-            "class_literal",
-            "type_identifier",
-            "method_invocation",
-        ]:
-            return get_text(node)
-        elif node.type == "array_creation_expression":
-            # Handle array creation expression specifically
-            type_node = node.child_by_field_name("type")
-            value_node = node.child_by_field_name("value")
-            type_text = traverse_node(type_node, True)
-            value_text = traverse_node(value_node, True)
-            return f"new {type_text}[]{value_text}"
-        elif node.type == "object_creation_expression":
-            # Handle object creation expression specifically
-            type_node = node.child_by_field_name("type")
-            arguments_node = node.child_by_field_name("arguments")
-            type_text = traverse_node(type_node, True)
-            if arguments_node:
-                # Process each argument carefully, avoiding unnecessary punctuation
-                argument_texts = []
-                for child in arguments_node.children:
-                    if child.type not in [
-                        ",",
-                        "(",
-                        ")",
-                    ]:  # Exclude commas and parentheses
-                        argument_text = traverse_node(child, True)
-                        argument_texts.append(argument_text)
-                arguments_text = ", ".join(argument_texts)
-                return f"new {type_text}({arguments_text})"
-            else:
-                return f"new {type_text}()"
-        elif node.type == "set":
-            # Handling sets specifically
-            items = [traverse_node(n, True) for n in node.children if n.type not in [",", "set"]]
-            return "{" + ", ".join(items) + "}"
-
-        elif node.child_count > 0:
-            return "".join(traverse_node(child, True) for child in node.children)
-        else:
-            return get_text(node)
-
-    def extract_arguments(args_node):
-        arguments = {}
-        for child in args_node.children:
-            if child.type == "assignment_expression":
-                # For named parameters
-                name_node, value_node = child.children[0], child.children[2]
-                name = get_text(name_node)
-                value = traverse_node(value_node)
-                if name in arguments:
-                    if not isinstance(arguments[name], list):
-                        arguments[name] = [arguments[name]]
-                    arguments[name].append(value)
-                else:
-                    arguments[name] = value
-                # arguments.append({'name': name, 'value': value})
-            elif child.type in ["identifier", "class_literal", "set"]:
-                # For unnamed parameters and handling sets
-                value = traverse_node(child)
-                if None in arguments:
-                    if not isinstance(arguments[None], list):
-                        arguments[None] = [arguments[None]]
-                    arguments[None].append(value)
-                else:
-                    arguments[None] = value
-        return arguments
-
-    def traverse(node):
-        if node.type == "method_invocation":
-            # Extract the function name and its arguments
-            method_name = get_text(node.child_by_field_name("name"))
-            class_name_node = node.child_by_field_name("object")
-            if class_name_node:
-                class_name = get_text(class_name_node)
-                function_name = f"{class_name}.{method_name}"
-            else:
-                function_name = method_name
-            arguments_node = node.child_by_field_name("arguments")
-            if arguments_node:
-                arguments = extract_arguments(arguments_node)
-                for key, value in arguments.items():
-                    if isinstance(value, list):
-                        raise Exception("Error: Multiple arguments with the same name are not supported.")
-                return [{function_name: arguments}]
-
-        else:
-            for child in node.children:
-                result = traverse(child)
-                if result:
-                    return result
-
-    result = traverse(root_node)
-    return result if result else {}
-
-
-def parse_javascript_function_call(source_code):
-    if not source_code.endswith(";"):
-        source_code += ";"  # Necessary for the parser not to register an error
-    parser = get_parser("javascript")
-    # Parse the source code
-    tree = parser.parse(bytes(source_code, "utf8"))
-    root_node = tree.root_node
-    if root_node.has_error:
-        raise Exception("Error js parsing the source code.")
-
-    # Function to recursively extract argument details
-    def extract_arguments(node):
-        args = {}
-        for child in node.children:
-            if child.type == "assignment_expression":
-                # Extract left (name) and right (value) parts of the assignment
-                name = child.children[0].text.decode("utf-8")
-                value = child.children[2].text.decode("utf-8")
-                if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")):
-                    value = value[1:-1]  # Trim the quotation marks
-                if name in args:
-                    if not isinstance(args[name], list):
-                        args[name] = [args[name]]
-                    args[name].append(value)
-                else:
-                    args[name] = value
-
-            elif child.type == "identifier" or child.type == "true":
-                # Handle non-named arguments and boolean values
-                value = child.text.decode("utf-8")
-                if None in args:
-                    if not isinstance(args[None], list):
-                        args[None] = [args[None]]
-                    args[None].append(value)
-                else:
-                    args[None] = value
-        return args
-
-    # Find the function call and extract its name and arguments
-    if root_node.type == "program":
-        for child in root_node.children:
-            if child.type == "expression_statement":
-                for sub_child in child.children:
-                    if sub_child.type == "call_expression":
-                        function_name = sub_child.children[0].text.decode("utf8")
-                        arguments_node = sub_child.children[1]
-                        parameters = extract_arguments(arguments_node)
-                        for key, value in parameters.items():
-                            if isinstance(value, list):
-                                raise Exception("Error: Multiple arguments with the same name are not supported.")
-                        result = [{function_name: parameters}]
-                        return result
-
-
-def ast_parse(input_str, language="Python"):
-    if language == "Python":
-        cleaned_input = input_str.strip("[]'")
-        parsed = ast.parse(cleaned_input, mode="eval")
-        extracted = []
-        if isinstance(parsed.body, ast.Call):
-            extracted.append(resolve_ast_call(parsed.body))
-        else:
-            for elem in parsed.body.elts:
-                extracted.append(resolve_ast_call(elem))
-        return extracted
-    elif language == "Java":
-        return parse_java_function_call(input_str[1:-1])  # Remove the [ and ] from the string
-    elif language == "JavaScript":
-        return parse_javascript_function_call(input_str[1:-1])
-    else:
-        raise NotImplementedError(f"Unsupported language: {language}")
-
-
-def resolve_ast_call(elem):
-    # Handle nested attributes for deeply nested module paths
-    func_parts = []
-    func_part = elem.func
-    while isinstance(func_part, ast.Attribute):
-        func_parts.append(func_part.attr)
-        func_part = func_part.value
-    if isinstance(func_part, ast.Name):
-        func_parts.append(func_part.id)
-    func_name = ".".join(reversed(func_parts))
-    args_dict = {}
-    # Parse when args are simply passed as an unnamed dictionary arg
-    for arg in elem.args:
-        if isinstance(arg, ast.Dict):
-            for key, value in zip(arg.keys, arg.values):
-                if isinstance(key, ast.Constant):
-                    arg_name = key.value
-                output = resolve_ast_by_type(value)
-                args_dict[arg_name] = output
-    for arg in elem.keywords:
-        output = resolve_ast_by_type(arg.value)
-        args_dict[arg.arg] = output
-    return {func_name: args_dict}
-
-
-def resolve_ast_by_type(value):
-    if isinstance(value, ast.Constant):
-        if value.value is Ellipsis:
-            output = "..."
-        else:
-            output = value.value
-    elif isinstance(value, ast.UnaryOp):
-        output = -value.operand.value
-    elif isinstance(value, ast.List):
-        output = [resolve_ast_by_type(v) for v in value.elts]
-    elif isinstance(value, ast.Dict):
-        output = {resolve_ast_by_type(k): resolve_ast_by_type(v) for k, v in zip(value.keys, value.values)}
-    elif isinstance(value, ast.NameConstant):  # Added this condition to handle boolean values
-        output = value.value
-    elif isinstance(value, ast.BinOp):  # Added this condition to handle function calls as arguments
-        output = eval(ast.unparse(value))
-    elif isinstance(value, ast.Name):
-        output = value.id
-    elif isinstance(value, ast.Call):
-        if len(value.keywords) == 0:
-            output = ast.unparse(value)
-        else:
-            output = resolve_ast_call(value)
-    elif isinstance(value, ast.Tuple):
-        output = tuple(resolve_ast_by_type(v) for v in value.elts)
-    elif isinstance(value, ast.Lambda):
-        output = eval(ast.unparse(value.body[0].value))
-    elif isinstance(value, ast.Ellipsis):
-        output = "..."
-    elif isinstance(value, ast.Subscript):
-        try:
-            output = ast.unparse(value.body[0].value)
-        except:
-            output = ast.unparse(value.value) + "[" + ast.unparse(value.slice) + "]"
-    else:
-        raise Exception(f"Unsupported AST type: {type(value)}")
-    return output
-
-
-def decode_ast(result, language="Python"):
-    func = result
-    func = func.replace("\n", "")  # remove new line characters
-    if not func.startswith("["):
-        func = "[" + func
-    if not func.endswith("]"):
-        func = func + "]"
-    decoded_output = ast_parse(func, language)
-    return decoded_output
-
-
-def decode_execute(result):
-    func = result
-    func = func.replace("\n", "")  # remove new line characters
-    if not func.startswith("["):
-        func = "[" + func
-    if not func.endswith("]"):
-        func = func + "]"
-    decode_output = ast_parse(func)
-    execution_list = []
-    for function_call in decode_output:
-        for key, value in function_call.items():
-            execution_list.append(f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})")
-    return execution_list
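For the record, the removed decoder lowered model output to {function_name: {arg: value}} mappings; a worked example of its Python path (runnable only against the removed module, shown here to document its behavior):

    decode_ast("calculate_triangle_area(base=10, height=5)")
    # -> [{'calculate_triangle_area': {'base': 10, 'height': 5}}]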
@ -1,989 +0,0 @@
|
||||||
# ruff: noqa
|
|
||||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
|
||||||
# the root directory of this source tree.
|
|
||||||
import json
|
|
||||||
import re
|
|
||||||
import time
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
# Comment out for now until we actually use the rest checker in evals
|
|
||||||
# import requests # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function.
|
|
||||||
|
|
||||||
|
|
||||||
class NoAPIKeyError(Exception):
|
|
||||||
def __init__(self):
|
|
||||||
self.message = "❗️Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate."
|
|
||||||
super().__init__(self.message)
|
|
||||||
|
|
||||||
|
|
||||||
REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2
|
|
||||||
|
|
||||||
|
|
||||||
JAVA_TYPE_CONVERSION = {
|
|
||||||
"byte": int,
|
|
||||||
"short": int,
|
|
||||||
"integer": int,
|
|
||||||
"float": float,
|
|
||||||
"double": float,
|
|
||||||
"long": int,
|
|
||||||
"boolean": bool,
|
|
||||||
"char": str,
|
|
||||||
"Array": list,
|
|
||||||
"ArrayList": list,
|
|
||||||
"Set": set,
|
|
||||||
"HashMap": dict,
|
|
||||||
"Hashtable": dict,
|
|
||||||
"Queue": list, # this can be `queue.Queue` as well, for simplicity we check with list
|
|
||||||
"Stack": list,
|
|
||||||
"String": str,
|
|
||||||
"any": str,
|
|
||||||
}
|
|
||||||
|
|
||||||
JS_TYPE_CONVERSION = {
|
|
||||||
"String": str,
|
|
||||||
"integer": int,
|
|
||||||
"float": float,
|
|
||||||
"Bigint": int,
|
|
||||||
"Boolean": bool,
|
|
||||||
"dict": dict,
|
|
||||||
"array": list,
|
|
||||||
"any": str,
|
|
||||||
}
|
|
||||||
|
|
||||||
# We switch to conditional import for the following two imports to avoid unnecessary installations.
|
|
||||||
# User doesn't need to setup the tree-sitter packages if they are not running the test for that language.
|
|
||||||
# from js_type_converter import js_type_converter
|
|
||||||
# from java_type_converter import java_type_converter
|
|
||||||
|
|
||||||
PYTHON_TYPE_MAPPING = {
|
|
||||||
"string": str,
|
|
||||||
"integer": int,
|
|
||||||
"float": float,
|
|
||||||
"boolean": bool,
|
|
||||||
"array": list,
|
|
||||||
"tuple": list,
|
|
||||||
"dict": dict,
|
|
||||||
"any": str,
|
|
||||||
}
|
|
||||||
|
|
||||||
# This is the list of types that we need to recursively check its values
|
|
||||||
PYTHON_NESTED_TYPE_CHECK_LIST = ["array", "tuple"]
|
|
||||||
|
|
||||||
|
|
||||||
NESTED_CONVERSION_TYPE_LIST = ["Array", "ArrayList", "array"]
|
|
||||||
|
|
||||||
|
|
||||||
#### Helper functions for AST ####
|
|
||||||
def find_description(func_descriptions, name):
|
|
||||||
if type(func_descriptions) == list:
|
|
||||||
for func_description in func_descriptions:
|
|
||||||
if func_description["name"] == name:
|
|
||||||
return func_description
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
# it is a dict, there is only one function
|
|
||||||
return func_descriptions
|
|
||||||
|
|
||||||
|
|
||||||
def get_possible_answer_type(possible_answer: list):
|
|
||||||
for answer in possible_answer:
|
|
||||||
if answer != "": # Optional parameter
|
|
||||||
return type(answer)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def type_checker(
|
|
||||||
param: str,
|
|
||||||
value,
|
|
||||||
possible_answer: list,
|
|
||||||
expected_type_description: str,
|
|
||||||
expected_type_converted,
|
|
||||||
nested_type_converted,
|
|
||||||
):
|
|
||||||
# NOTE: This type checker only supports nested type checking for one level deep.
|
|
||||||
# We didn't implement recursive type checking for nested types, as it's not needed for the current use case and it's very complex.
|
|
||||||
|
|
||||||
result: Any = {
|
|
||||||
"valid": True,
|
|
||||||
"error": [],
|
|
||||||
"is_variable": False,
|
|
||||||
"error_type": "type_error:simple",
|
|
||||||
}
|
|
||||||
|
|
||||||
is_variable = False
|
|
||||||
# check for the case where a variable is used instead of a actual value.
|
|
||||||
# use the type in possible_answer as the expected type
|
|
||||||
possible_answer_type = get_possible_answer_type(possible_answer)
|
|
||||||
# if possible_answer only contains optional parameters, we can't determine the type
|
|
||||||
if possible_answer_type != None:
|
|
||||||
# we are being precise here.
|
|
||||||
# in fact, possible_answer_type should always be string, as that's how we treat varibale in possible_answer
|
|
||||||
if possible_answer_type != expected_type_converted:
|
|
||||||
is_variable = True
|
|
||||||
|
|
||||||
# value is the same type as in function description
|
|
||||||
if type(value) == expected_type_converted:
|
|
||||||
# We don't need to do recursive check for simple types
|
|
||||||
if nested_type_converted == None:
|
|
||||||
result["is_variable"] = is_variable
|
|
||||||
return result
|
|
||||||
else:
|
|
||||||
for possible_answer_item in possible_answer:
|
|
||||||
flag = True # Each parameter should match to at least one possible answer type.
|
|
||||||
# Here, we assume that each item should be the same type. We could also relax it.
|
|
||||||
if type(possible_answer_item) == list:
|
|
||||||
for value_item in value:
|
|
||||||
checker_result = type_checker(
|
|
||||||
param,
|
|
||||||
value_item,
|
|
||||||
possible_answer_item,
|
|
||||||
str(nested_type_converted),
|
|
||||||
nested_type_converted,
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
if not checker_result["valid"]:
|
|
||||||
flag = False
|
|
||||||
break
|
|
||||||
|
|
||||||
if flag:
|
|
||||||
return {"valid": True, "error": [], "is_variable": is_variable}
|
|
||||||
|
|
||||||
result["valid"] = False
|
|
||||||
result["error"] = [
|
|
||||||
f"Nested type checking failed for parameter {repr(param)}. Expected outer type {expected_type_description} with inner type {str(nested_type_converted)}. Parameter value: {repr(value)}."
|
|
||||||
]
|
|
||||||
result["error_type"] = "type_error:nested"
|
|
||||||
|
|
||||||
# value is not as expected, check for the case where a variable is used instead of a actual value
|
|
||||||
# use the type in possible_answer as the expected type
|
|
||||||
possible_answer_type = get_possible_answer_type(possible_answer)
|
|
||||||
# if possible_answer only contains optional parameters, we can't determine the type
|
|
||||||
if possible_answer_type != None:
|
|
||||||
# we are being precise here.
|
|
||||||
# in fact, possible_answer_type should always be string, as that's how we treat varibale in possible_answer
|
|
||||||
if type(value) == possible_answer_type:
|
|
||||||
result["is_variable"] = True
|
|
||||||
return result
|
|
||||||
|
|
||||||
result["valid"] = False
|
|
||||||
result["error"].append(
|
|
||||||
f"Incorrect type for parameter {repr(param)}. Expected type {expected_type_description}, got {type(value).__name__}. Parameter value: {repr(value)}."
|
|
||||||
)
|
|
||||||
result["error_type"] = "type_error:simple"
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def standardize_string(input_string: str):
|
|
||||||
# This function standardizes the string by removing all the spaces, ",./-_*^" punctuation, and converting it to lowercase
|
|
||||||
# It will also convert all the single quotes to double quotes
|
|
||||||
# This is used to compare the model output with the possible answers
|
|
||||||
# We don't want to punish model for answer like April 1, 2024 vs April 1,2024, vs April 1 2024
|
|
||||||
regex_string = r"[ \,\.\/\-\_\*\^]"
|
|
||||||
return re.sub(regex_string, "", input_string).lower().replace("'", '"')
|
|
||||||
|
|
||||||
|
|
||||||
def string_checker(param: str, model_output: str, possible_answer: list):
|
|
||||||
standardize_possible_answer = []
|
|
||||||
standardize_model_output = standardize_string(model_output)
|
|
||||||
for i in range(len(possible_answer)):
|
|
||||||
if type(possible_answer[i]) == str:
|
|
||||||
standardize_possible_answer.append(standardize_string(possible_answer[i]))
|
|
||||||
|
|
||||||
if standardize_model_output not in standardize_possible_answer:
|
|
||||||
return {
|
|
||||||
"valid": False,
|
|
||||||
"error": [
|
|
||||||
f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}. Case insensitive."
|
|
||||||
],
|
|
||||||
"error_type": "value_error:string",
|
|
||||||
}
|
|
||||||
|
|
||||||
return {"valid": True, "error": []}
|
|
||||||
|
|
||||||
|
|
||||||
def list_checker(param: str, model_output: list, possible_answer: list):
|
|
||||||
# Convert the tuple to a list
|
|
||||||
|
|
||||||
standardize_model_output = list(model_output)
|
|
||||||
|
|
||||||
# If the element in the list is a string, we need to standardize it
|
|
||||||
for i in range(len(standardize_model_output)):
|
|
||||||
if type(standardize_model_output[i]) == str:
|
|
||||||
standardize_model_output[i] = standardize_string(model_output[i])
|
|
||||||
|
|
||||||
standardize_possible_answer: Any = []
|
|
||||||
# We also need to standardize the possible answers
|
|
||||||
for i in range(len(possible_answer)):
|
|
||||||
standardize_possible_answer.append([])
|
|
||||||
for j in range(len(possible_answer[i])):
|
|
||||||
if type(possible_answer[i][j]) == str:
|
|
||||||
standardize_possible_answer[i].append(standardize_string(possible_answer[i][j]))
|
|
||||||
else:
|
|
||||||
standardize_possible_answer[i].append(possible_answer[i][j])
|
|
||||||
|
|
||||||
if standardize_model_output not in standardize_possible_answer:
|
|
||||||
return {
|
|
||||||
"valid": False,
|
|
||||||
"error": [
|
|
||||||
f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}."
|
|
||||||
],
|
|
||||||
"error_type": "value_error:list/tuple",
|
|
||||||
}
|
|
||||||
|
|
||||||
return {"valid": True, "error": []}
|
|
||||||
|
|
||||||
|
|
||||||
def dict_checker(param: str, model_output: dict, possible_answers: list):
|
|
||||||
# This function works for simple dictionaries, but not dictionaries with nested dictionaries.
|
|
||||||
# The current dataset only contains simple dictionaries, so this is sufficient.
|
|
||||||
|
|
||||||
result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"}
|
|
||||||
for i in range(len(possible_answers)):
|
|
||||||
if possible_answers[i] == "":
|
|
||||||
continue
|
|
||||||
|
|
||||||
result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"}
|
|
||||||
|
|
||||||
flag = True
|
|
||||||
|
|
||||||
possible_answer = possible_answers[i]
|
|
||||||
# possible_anwer is a single dictionary
|
|
||||||
|
|
||||||
for key, value in model_output.items():
|
|
||||||
if key not in possible_answer:
|
|
||||||
result["valid"] = False
|
|
||||||
result["error"].append(f"Unexpected dict key parameter: '{key}'.") # type: ignore[attr-defined]
|
|
||||||
result["error_type"] = "value_error:dict_key"
|
|
||||||
flag = False
|
|
||||||
break
|
|
||||||
|
|
||||||
standardize_value = value
|
|
||||||
# If the value is a string, we need to standardize it
|
|
||||||
if type(value) == str:
|
|
||||||
standardize_value = standardize_string(value)
|
|
||||||
|
|
||||||
# We also need to standardize the possible answers if they are string
|
|
||||||
standardize_possible_answer = []
|
|
||||||
for i in range(len(possible_answer[key])):
|
|
||||||
if type(possible_answer[key][i]) == str:
|
|
||||||
standardize_possible_answer.append(standardize_string(possible_answer[key][i]))
|
|
||||||
else:
|
|
||||||
standardize_possible_answer.append(possible_answer[key][i])
|
|
||||||
|
|
||||||
if standardize_value not in standardize_possible_answer:
|
|
||||||
result["valid"] = False
|
|
||||||
result["error"].append( # type: ignore[attr-defined]
|
|
||||||
f"Invalid value for parameter {repr(key)}: {repr(value)}. Expected one of {standardize_possible_answer}."
|
|
||||||
)
|
|
||||||
result["error_type"] = "value_error:dict_value"
|
|
||||||
flag = False
|
|
||||||
break
|
|
||||||
|
|
||||||
for key, value in possible_answer.items():
|
|
||||||
if key not in model_output and "" not in value:
|
|
||||||
result["valid"] = False
|
|
||||||
result["error"].append(f"Missing dict key parameter: '{key}'.") # type: ignore[attr-defined]
|
|
||||||
result["error_type"] = "value_error:dict_key"
|
|
||||||
flag = False
|
|
||||||
break
|
|
||||||
|
|
||||||
if flag:
|
|
||||||
return {"valid": True, "error": []}
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def list_dict_checker(param: str, model_output: list, possible_answers: list):
|
|
||||||
# This function takes in a list of dictionaries and checks if each dictionary is valid
|
|
||||||
# The order of the dictionaries in the list must match the order of the possible answers
|
|
||||||
|
|
||||||
result = {"valid": False, "error": [], "error_type": "list_dict_checker:unclear"}
|
|
||||||
|
|
||||||
for answer_index in range(len(possible_answers)):
|
|
||||||
flag = True # True means so far, all dictionaries are valid
|
|
||||||
|
|
||||||
# Only proceed if the number of dictionaries in the list matches the number of dictionaries in the possible answers
|
|
||||||
if len(model_output) != len(possible_answers[answer_index]):
|
|
||||||
result["valid"] = False
|
|
||||||
result["error"] = ["Wrong number of dictionaries in the list."]
|
|
||||||
result["error_type"] = "value_error:list_dict_count"
|
|
||||||
flag = False
|
|
||||||
continue
|
|
||||||
|
|
||||||
for dict_index in range(len(model_output)):
|
|
||||||
result = dict_checker(
|
|
||||||
param,
|
|
||||||
model_output[dict_index],
|
|
||||||
[possible_answers[answer_index][dict_index]],
|
|
||||||
)
|
|
||||||
if not result["valid"]:
|
|
||||||
flag = False
|
|
||||||
break
|
|
||||||
if flag:
|
|
||||||
return {"valid": True, "error": []}
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def simple_function_checker(
|
|
||||||
func_description: dict,
|
|
||||||
model_output: dict,
|
|
||||||
possible_answer: dict,
|
|
||||||
language: str,
|
|
||||||
model_name: str,
|
|
||||||
):
|
|
||||||
possible_answer = list(possible_answer.values())[0]
|
|
||||||
# Extract function name and parameters details
|
|
||||||
func_name = func_description["name"]
|
|
||||||
param_details = func_description["parameters"]["properties"]
|
|
||||||
required_params = func_description["parameters"]["required"]
|
|
||||||
|
|
||||||
# Initialize a result dictionary
|
|
||||||
result = {
|
|
||||||
"valid": True,
|
|
||||||
"error": [],
|
|
||||||
"error_type": "simple_function_checker:unclear",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Check if function name matches
|
|
||||||
if func_name not in model_output:
|
|
||||||
result["valid"] = False
|
|
||||||
result["error"].append( # type: ignore[attr-defined]
|
|
||||||
f"Function name {repr(func_name)} not found in model output."
|
|
||||||
)
|
|
||||||
result["error_type"] = "simple_function_checker:wrong_func_name"
|
|
||||||
return result
|
|
||||||
|
|
||||||
model_params = model_output[func_name]
|
|
||||||
|
|
||||||
# Check for required parameters in model output
|
|
||||||
for param in required_params:
|
|
||||||
if param not in model_params:
|
|
||||||
result["valid"] = False
|
|
||||||
result["error"].append(f"Missing required parameter: {repr(param)}.") # type: ignore[attr-defined]
|
|
||||||
result["error_type"] = "simple_function_checker:missing_required"
|
|
||||||
return result
|
|
||||||
|
|
||||||
# Validate types and values for each parameter in model output
|
|
||||||
for param, value in model_params.items():
|
|
||||||
if param not in param_details or param not in possible_answer:
|
|
||||||
result["valid"] = False
|
|
||||||
result["error"].append(f"Unexpected parameter: {repr(param)}.") # type: ignore[attr-defined]
|
|
||||||
result["error_type"] = "simple_function_checker:unexpected_param"
|
|
||||||
return result
|
|
||||||
|
|
||||||
full_param_details = param_details[param]
|
|
||||||
expected_type_description = full_param_details["type"] # This is a string
|
|
||||||
is_variable = False
|
|
||||||
nested_type_converted = None
|
|
||||||
|
|
||||||
if language == "Java":
|
|
||||||
from evals.utils.bfcl.java_type_converter import java_type_converter
|
|
||||||
|
|
||||||
expected_type_converted = JAVA_TYPE_CONVERSION[expected_type_description]
|
|
||||||
|
|
||||||
if expected_type_description in JAVA_TYPE_CONVERSION:
|
|
||||||
if type(value) != str:
|
|
||||||
result["valid"] = False
|
|
||||||
result["error"].append( # type: ignore[attr-defined]
|
|
||||||
f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}."
|
|
||||||
)
|
|
||||||
result["error_type"] = "type_error:java"
|
|
||||||
return result
|
|
||||||
|
|
||||||
if expected_type_description in NESTED_CONVERSION_TYPE_LIST:
|
|
||||||
nested_type = param_details[param]["items"]["type"]
|
|
||||||
nested_type_converted = JAVA_TYPE_CONVERSION[nested_type]
|
|
||||||
value = java_type_converter(value, expected_type_description, nested_type)
|
|
||||||
else:
|
|
||||||
value = java_type_converter(value, expected_type_description)
|
|
||||||
|
|
||||||
elif language == "JavaScript":
|
|
||||||
from evals.utils.bfcl.js_type_converter import js_type_converter
|
|
||||||
|
|
||||||
expected_type_converted = JS_TYPE_CONVERSION[expected_type_description]
|
|
||||||
|
|
||||||
if expected_type_description in JS_TYPE_CONVERSION:
|
|
||||||
if type(value) != str:
|
|
||||||
result["valid"] = False
|
|
||||||
result["error"].append( # type: ignore[attr-defined]
|
|
||||||
f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}."
|
|
||||||
)
|
|
||||||
result["error_type"] = "type_error:js"
|
|
||||||
return result
|
|
||||||
|
|
||||||
if expected_type_description in NESTED_CONVERSION_TYPE_LIST:
|
|
||||||
nested_type = param_details[param]["items"]["type"]
|
|
||||||
nested_type_converted = JS_TYPE_CONVERSION[nested_type]
|
|
||||||
value = js_type_converter(value, expected_type_description, nested_type)
|
|
||||||
else:
|
|
||||||
value = js_type_converter(value, expected_type_description)
|
|
||||||
|
|
||||||
elif language == "Python":
|
|
||||||
expected_type_converted = PYTHON_TYPE_MAPPING[expected_type_description]
|
|
||||||
if expected_type_description in PYTHON_NESTED_TYPE_CHECK_LIST:
|
|
||||||
nested_type = param_details[param]["items"]["type"]
|
|
||||||
nested_type_converted = PYTHON_TYPE_MAPPING[nested_type]
|
|
||||||
|
|
||||||
# We convert all tuple value to list when the expected type is tuple.
|
|
||||||
# The conversion is necessary because any tuple in the possible answer would become a list after being processed through json.dump() and json.load().
|
|
||||||
# This does introduce some false positive (eg, when the model provides a list value instead of tuple). We hope to find a better solution in the future.
|
|
||||||
if expected_type_description == "tuple" and type(value) == tuple:
|
|
||||||
value = list(value)
|
|
||||||
|
|
||||||
# Allow python auto conversion from int to float
|
|
||||||
if language == "Python" and expected_type_description == "float" and type(value) == int:
|
|
||||||
value = float(value)
|
|
||||||
|
|
||||||
# Type checking
|
|
||||||
# In fact, we only check for Python here.
|
|
||||||
# Type check for other languages are handled by the type converter, and so their value (after conversion) is always correct.
|
|
||||||
type_check_result = type_checker(
|
|
||||||
param,
|
|
||||||
value,
|
|
||||||
possible_answer[param],
|
|
||||||
expected_type_description,
|
|
||||||
expected_type_converted,
|
|
||||||
nested_type_converted,
|
|
||||||
)
|
|
||||||
is_variable = type_check_result["is_variable"]
|
|
||||||
if not type_check_result["valid"]:
|
|
||||||
return type_check_result
|
|
||||||
|
|
||||||
# It doesn't make sense to special handle dictionaries and list of dictionaries if the value is a variable.
|
|
||||||
# We can just treat the variable as a string and use the normal flow.
|
|
||||||
if not is_variable:
|
|
||||||
# Special handle for dictionaries
|
|
||||||
if expected_type_converted == dict:
|
|
||||||
result = dict_checker(param, value, possible_answer[param])
|
|
||||||
if not result["valid"]:
|
|
||||||
return result
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Special handle for list of dictionaries
|
|
||||||
elif expected_type_converted == list and nested_type_converted == dict:
|
|
||||||
result = list_dict_checker(param, value, possible_answer[param])
|
|
||||||
if not result["valid"]:
|
|
||||||
return result
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Special handle for strings
|
|
||||||
elif expected_type_converted == str:
|
|
||||||
# We don't check for case sensitivity for string, as long as it's not a variable
|
|
||||||
result = string_checker(param, value, possible_answer[param])
|
|
||||||
if not result["valid"]:
|
|
||||||
return result
|
|
||||||
continue
|
|
||||||
|
|
||||||
elif expected_type_converted == list:
|
|
||||||
result = list_checker(param, value, possible_answer[param])
|
|
||||||
if not result["valid"]:
|
|
||||||
return result
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check if the value is within the possible answers
|
|
||||||
if value not in possible_answer[param]:
|
|
||||||
result["valid"] = False
|
|
||||||
result["error"].append( # type: ignore[attr-defined]
|
|
||||||
f"Invalid value for parameter {repr(param)}: {repr(value)}. Expected one of {possible_answer[param]}."
|
|
||||||
)
|
|
||||||
result["error_type"] = "value_error:others"
|
|
||||||
return result
|
|
||||||
|
|
||||||
# Check for optional parameters not provided but allowed
|
|
||||||
for param in possible_answer:
|
|
||||||
if param not in model_params and "" not in possible_answer[param]:
|
|
||||||
result["valid"] = False
|
|
||||||
result["error"].append( # type: ignore[attr-defined]
|
|
||||||
f"Optional parameter {repr(param)} not provided and not marked as optional."
|
|
||||||
)
|
|
||||||
result["error_type"] = "simple_function_checker:missing_optional"
|
|
||||||
return result
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def parallel_function_checker_enforce_order(
|
|
||||||
func_descriptions: list,
|
|
||||||
model_output: list,
|
|
||||||
possible_answers: dict,
|
|
||||||
language: str,
|
|
||||||
model_name: str,
|
|
||||||
):
|
|
||||||
if len(model_output) != len(possible_answers):
|
|
||||||
return {
|
|
||||||
"valid": False,
|
|
||||||
"error": ["Wrong number of functions."],
|
|
||||||
"error_type": "parallel_function_checker_enforce_order:wrong_count",
|
|
||||||
}
|
|
||||||
|
|
||||||
func_name_list = list(possible_answers.keys())
|
|
||||||
possible_answers_list = []
|
|
||||||
|
|
||||||
for key, value in possible_answers.items():
|
|
||||||
possible_answers_list.append({key: value})
|
|
||||||
|
|
||||||
for i in range(len(possible_answers_list)):
|
|
||||||
func_description = find_description(func_descriptions, func_name_list[i])
|
|
||||||
|
|
||||||
result = simple_function_checker(
|
|
||||||
func_description,
|
|
||||||
model_output[i],
|
|
||||||
possible_answers_list[i],
|
|
||||||
language,
|
|
||||||
model_name,
|
|
||||||
)
|
|
||||||
if not result["valid"]:
|
|
||||||
return result
|
|
||||||
|
|
||||||
return {"valid": True, "error": []}
|
|
||||||
|
|
||||||
|
|
||||||
def parallel_function_checker_no_order(
    func_descriptions: list,
    model_output: list,
    possible_answers: list,
    language: str,
    model_name: str,
):
    if len(model_output) != len(possible_answers):
        return {
            "valid": False,
            "error": ["Wrong number of functions."],
            "error_type": "parallel_function_checker_no_order:wrong_count",
        }

    matched_indices = []

    # We go through the possible answers one by one, and eliminate the model output that matches the possible answer
    # It must be this way because we need ground truth to fetch the correct function description
    for i in range(len(possible_answers)):
        # possible_answers[i] is a dictionary with only one key
        func_name_expected = list(possible_answers[i].keys())[0]
        func_description = find_description(func_descriptions, func_name_expected)

        all_errors = []

        for index in range(len(model_output)):
            if index in matched_indices:
                continue

            result = simple_function_checker(
                func_description,
                model_output[index],
                possible_answers[i],
                language,
                model_name,
            )

            if result["valid"]:
                matched_indices.append(index)
                break
            else:
                all_errors.append(
                    {
                        f"Model Result Index {index}": {
                            "sub_error": result["error"],
                            "sub_error_type": result["error_type"],
                            "model_output_item": model_output[index],
                            "possible_answer_item": possible_answers[i],
                        }
                    }
                )

        if not result["valid"]:
            considered_indices = [i for i in range(len(model_output)) if i not in matched_indices]
            all_errors.insert(
                0,
                f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.",  # type: ignore[arg-type]
            )
            return {
                "valid": False,
                "error": all_errors,
                "error_type": "parallel_function_checker_no_order:cannot_find_match",
            }

    return {"valid": True, "error": []}


def multiple_function_checker(
    func_descriptions: list,
    model_output: list,
    possible_answers: list,
    language: str,
    model_name: str,
):
    if len(model_output) != len(possible_answers):
        return {
            "valid": False,
            "error": ["Wrong number of functions."],
            "error_type": "multiple_function_checker:wrong_count",
        }

    # possible_answers is a list of only one dictionary with only one key
    func_name_expected = list(possible_answers[0].keys())[0]
    func_description = find_description(func_descriptions, func_name_expected)
    return simple_function_checker(
        func_description,
        model_output[0],
        possible_answers[0],
        language,
        model_name,
    )


def patten_matcher(exec_output, expected_result, function_call, is_sanity_check):
    result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"}

    if type(exec_output) != type(expected_result):
        return {
            "valid": False,
            "error": [
                f"Wrong execution result type for {repr(function_call)}. Expected type: {type(expected_result)}, but got: {type(exec_output)}."
            ],
            "error_type": "executable_checker:wrong_result_type",
            "model_executed_output": exec_output,
        }
    if type(exec_output) == dict:
        # We loosen the requirement for the sanity check as the expected result used in the sanity check might not be the most up-to-date one.
        # This happens when the key is a timestamp or a random number.
        if is_sanity_check:
            if len(exec_output) != len(expected_result):
                return {
                    "valid": False,
                    "error": [
                        f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}."
                    ],
                    "error_type": "executable_checker:wrong_result_type:dict_length",
                    "model_executed_output": exec_output,
                }
            else:
                return result

        for key, value in expected_result.items():
            if key not in exec_output:
                return {
                    "valid": False,
                    "error": [
                        f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not found in the model output."
                    ],
                    "error_type": "executable_checker:wrong_result_type:dict_key_not_found",
                    "model_executed_output": exec_output,
                }
        for key, value in exec_output.items():
            if key not in expected_result:
                return {
                    "valid": False,
                    "error": [
                        f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not expected in the model output."
                    ],
                    "error_type": "executable_checker:wrong_result_type:dict_extra_key",
                    "model_executed_output": exec_output,
                }
    if type(exec_output) == list:
        if len(exec_output) != len(expected_result):
            return {
                "valid": False,
                "error": [
                    f"Wrong execution result pattern for {repr(function_call)}. Expect type list, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}."
                ],
                "error_type": "executable_checker:wrong_result_type:list_length",
                "model_executed_output": exec_output,
            }
    return result


#### Helper functions for Exec ####
def executable_checker_simple(
    function_call: str,
    expected_result,
    expected_result_type: str,
    is_sanity_check=False,
):
    result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"}

    exec_dict: Any = {}

    try:
        exec(
            "from executable_python_function import *" + "\nresult=" + function_call,
            exec_dict,
        )
        exec_output = exec_dict["result"]
    except NoAPIKeyError as e:
        raise e
    except Exception as e:
        result["valid"] = False
        result["error"].append(  # type: ignore[attr-defined]
            f"Error in execution: {repr(function_call)}. Error: {str(e)}"
        )
        result["error_type"] = "executable_checker:execution_error"
        return result

    # We need to special handle the case where the execution result is a tuple and convert it to a list
    # Because when json is stored, the tuple is converted to a list, and so the expected result is a list when loaded from json
    if isinstance(exec_output, tuple):
        exec_output = list(exec_output)

    if expected_result_type == "exact_match":
        if exec_output != expected_result:
            result["valid"] = False
            result["error"].append(  # type: ignore[attr-defined]
                f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}."
            )
            result["error_type"] = "executable_checker:wrong_result"
            result["model_executed_output"] = exec_output
            return result

    elif expected_result_type == "real_time_match":
        # Allow for 5% difference
        if (type(expected_result) == float or type(expected_result) == int) and (
            type(exec_output) == float or type(exec_output) == int
        ):
            if not (
                expected_result * (1 - REAL_TIME_MATCH_ALLOWED_DIFFERENCE)
                <= exec_output
                <= expected_result * (1 + REAL_TIME_MATCH_ALLOWED_DIFFERENCE)
            ):
                result["valid"] = False
                result["error"].append(  # type: ignore[attr-defined]
                    f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. {REAL_TIME_MATCH_ALLOWED_DIFFERENCE * 100}% difference allowed."
                )
                result["error_type"] = "executable_checker:wrong_result_real_time"
                result["model_executed_output"] = exec_output
                return result
        else:
            result["valid"] = False
            result["error"].append(  # type: ignore[attr-defined]
                f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. Type needs to be float or int for real time match criteria."
            )
            result["error_type"] = "executable_checker:wrong_result_real_time"
            result["model_executed_output"] = exec_output
            return result

    else:
        # structural match
        pattern_match_result = patten_matcher(exec_output, expected_result, function_call, is_sanity_check)
        if not pattern_match_result["valid"]:
            return pattern_match_result

    return result


def executable_checker_parallel_no_order(
    decoded_result: list, expected_exec_result: list, expected_exec_result_type: list
):
    if len(decoded_result) != len(expected_exec_result):
        return {
            "valid": False,
            "error": [
                f"Wrong number of functions provided. Expected {len(expected_exec_result)}, but got {len(decoded_result)}."
            ],
            "error_type": "value_error:exec_result_count",
        }

    matched_indices = []
    for i in range(len(expected_exec_result)):
        all_errors = []
        for index in range(len(decoded_result)):
            if index in matched_indices:
                continue

            result = executable_checker_simple(
                decoded_result[index],
                expected_exec_result[i],
                expected_exec_result_type[i],
                False,
            )

            if result["valid"]:
                matched_indices.append(index)
                break
            else:
                all_errors.append(
                    {
                        f"Model Result Index {index}": {
                            "sub_error": result["error"],
                            "sub_error_type": result["error_type"],
                            "model_executed_output": (
                                result["model_executed_output"] if "model_executed_output" in result else None
                            ),
                        }
                    }
                )

        if not result["valid"]:
            considered_indices = [i for i in range(len(decoded_result)) if i not in matched_indices]
            all_errors.insert(
                0,
                f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.",  # type: ignore[arg-type]
            )
            return {
                "valid": False,
                "error": all_errors,
                "error_type": "executable_checker:cannot_find_match",
            }

    return {"valid": True, "error": [], "error_type": "executable_checker:unclear"}


#### Main function ####
def executable_checker_rest(func_call, idx):
    # Move this here for now to avoid needing to read this file / fix paths to be relative to dataset_dir. Fix when it's actually needed / used.
    EVAL_GROUND_TRUTH_PATH = "/mnt/wsfuse/fair_llm_v2/datasets/eval/bfcl/rest-eval-response_v5.jsonl"  # Ground truth file for v5 for rest execution
    with open(EVAL_GROUND_TRUTH_PATH, "r") as f:
        EVAL_GROUND_TRUTH = f.readlines()
    if "https://geocode.maps.co" in func_call:
        time.sleep(2)
    if "requests_get" in func_call:
        func_call = func_call.replace("requests_get", "requests.get")
    try:
        response = eval(func_call)
    except Exception as e:
        return {
            "valid": False,
            "error": [f"Execution failed. {str(e)}"],
            "error_type": "executable_checker_rest:execution_error",
        }

    try:
        if response.status_code == 200:
            eval_GT_json = json.loads(EVAL_GROUND_TRUTH[idx])
            try:
                if isinstance(eval_GT_json, dict):
                    if isinstance(response.json(), dict):
                        if set(eval_GT_json.keys()) == set(response.json().keys()):
                            return {"valid": True, "error": [], "error_type": ""}
                        return {
                            "valid": False,
                            "error": ["Key inconsistency"],
                            "error_type": "executable_checker_rest:wrong_key",
                        }
                    return {
                        "valid": False,
                        "error": [f"Expected dictionary, but got {type(response.json())}"],
                        "error_type": "executable_checker_rest:wrong_type",
                    }

                elif isinstance(eval_GT_json, list):
                    if isinstance(response.json(), list):
                        if len(eval_GT_json) != len(response.json()):
                            return {
                                "valid": False,
                                "error": ["Response list length inconsistency."],
                                "error_type": "value_error:exec_result_rest_count",
                            }

                        else:
                            for i in range(len(eval_GT_json)):
                                if set(eval_GT_json[i].keys()) != set(response.json()[i].keys()):
                                    return {
                                        "valid": False,
                                        "error": ["Key inconsistency"],
                                        "error_type": "executable_checker_rest:wrong_key",
                                    }

                            return {"valid": True, "error": []}
                    else:
                        return {
                            "valid": False,
                            "error": [f"Expected list, but got {type(response.json())}"],
                            "error_type": "executable_checker_rest:wrong_type",
                        }
                return {
                    "valid": False,
                    "error": [f"Expected dict or list, but got {type(response.json())}"],
                    "error_type": "executable_checker_rest:wrong_type",
                }
            except Exception as e:
                return {
                    "valid": False,
                    "error": [
                        f"Error in execution and type checking. Status code: {response.status_code}. Error: {str(e)}"
                    ],
                    "error_type": "executable_checker_rest:response_format_error",
                }
        else:
            return {
                "valid": False,
                "error": [f"Execution result status code is not 200, got {response.status_code}"],
                "error_type": "executable_checker_rest:wrong_status_code",
            }
    except Exception as e:
        return {
            "valid": False,
            "error": [f"Cannot get status code of the response. Error: {str(e)}"],
            "error_type": "executable_checker_rest:cannot_get_status_code",
        }


def ast_checker(func_description, model_output, possible_answer, language, test_category, model_name):
    if "parallel" in test_category:
        return parallel_function_checker_no_order(func_description, model_output, possible_answer, language, model_name)

    elif "multiple" in test_category:
        return multiple_function_checker(func_description, model_output, possible_answer, language, model_name)

    else:
        if len(model_output) != 1:
            return {
                "valid": False,
                "error": ["Wrong number of functions."],
                "error_type": "simple_function_checker:wrong_count",
            }

        return simple_function_checker(
            func_description[0],
            model_output[0],
            possible_answer[0],
            language,
            model_name,
        )


def exec_checker(decoded_result: list, func_description: dict, test_category: str):
    if "multiple" in test_category or "parallel" in test_category:
        return executable_checker_parallel_no_order(
            decoded_result,
            func_description["execution_result"],
            func_description["execution_result_type"],
        )

    else:
        if len(decoded_result) != 1:
            return {
                "valid": False,
                "error": ["Wrong number of functions."],
                "error_type": "simple_exec_checker:wrong_count",
            }
        return executable_checker_simple(
            decoded_result[0],
            func_description["execution_result"][0],
            func_description["execution_result_type"][0],
            False,
        )


def is_empty_output(decoded_output):
    # This function is a patch to the ast decoder for relevance detection
    # Sometimes the ast decoder will parse successfully, but the input doesn't really have a function call
    # [], [{}], and anything that is not in function calling format is considered empty (and thus should be marked as correct)
    if not is_function_calling_format_output(decoded_output):
        return True
    if len(decoded_output) == 0:
        return True
    if len(decoded_output) == 1 and len(decoded_output[0]) == 0:
        return True


def is_function_calling_format_output(decoded_output):
    # Ensure the output is a list of dictionaries
    if type(decoded_output) == list:
        for item in decoded_output:
            if type(item) != dict:
                return False
        return True
    return False
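
For orientation, here is a minimal sketch of how the checkers above might be driven. The function description, model output, and possible-answer shapes are illustrative assumptions, not taken from any dataset:

```python
# Hypothetical inputs, shaped after what the checkers above expect.
func_descriptions = [
    {
        "name": "get_weather",
        "parameters": {
            "type": "dict",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    }
]
model_output = [{"get_weather": {"city": "Paris"}}]       # one decoded call
possible_answer = [{"get_weather": {"city": ["Paris"]}}]  # allowed values per param

verdict = ast_checker(
    func_descriptions,
    model_output,
    possible_answer,
    language="Python",
    test_category="simple",
    model_name="demo-model",
)
print(verdict["valid"], verdict.get("error"))
```
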
@ -1,40 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
Tree-sitter changes its API with unfortunate frequency. Modules that need it should
import it from here so that we can centrally manage things as necessary.
"""

# These currently work with tree-sitter 0.23.0
# NOTE: Don't import tree-sitter or any of the language modules in the main module
# because not all environments have them. Import lazily inside functions where needed.

import importlib
import typing

if typing.TYPE_CHECKING:
    import tree_sitter


def get_language(language: str) -> "tree_sitter.Language":
    import tree_sitter

    language_module_name = f"tree_sitter_{language}"
    try:
        language_module = importlib.import_module(language_module_name)
    except ModuleNotFoundError as exc:
        raise ValueError(
            f"Language {language} is not found. Please install the tree-sitter-{language} package."
        ) from exc
    return tree_sitter.Language(language_module.language())


def get_parser(language: str, **kwargs) -> "tree_sitter.Parser":
    import tree_sitter

    lang = get_language(language)
    return tree_sitter.Parser(lang, **kwargs)
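
For reference, a minimal usage sketch of the deleted helpers (assuming the tree-sitter and tree-sitter-python packages are installed):

```python
# Parse a small Python snippet through the central helpers above.
parser = get_parser("python")
tree = parser.parse(b"def f(x):\n    return x + 1\n")
print(tree.root_node.type)  # -> "module"
```
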
@ -14,6 +14,6 @@ from .config import RagToolRuntimeConfig
 async def get_provider_impl(config: RagToolRuntimeConfig, deps: dict[Api, Any]):
     from .memory import MemoryToolRuntimeImpl

-    impl = MemoryToolRuntimeImpl(config, deps[Api.vector_io], deps[Api.inference])
+    impl = MemoryToolRuntimeImpl(config, deps[Api.vector_io], deps[Api.inference], deps[Api.files])
     await impl.initialize()
     return impl
@ -5,10 +5,15 @@
 # the root directory of this source tree.

 import asyncio
+import base64
+import io
+import mimetypes
 import secrets
 import string
 from typing import Any

+import httpx
+from fastapi import UploadFile
 from pydantic import TypeAdapter

 from llama_stack.apis.common.content_types import (
@ -17,6 +22,7 @@ from llama_stack.apis.common.content_types import (
     InterleavedContentItem,
     TextContentItem,
 )
+from llama_stack.apis.files import Files, OpenAIFilePurpose
 from llama_stack.apis.inference import Inference
 from llama_stack.apis.tools import (
     ListToolDefsResponse,
@ -30,13 +36,18 @@ from llama_stack.apis.tools import (
     ToolParameter,
     ToolRuntime,
 )
-from llama_stack.apis.vector_io import QueryChunksResponse, VectorIO
+from llama_stack.apis.vector_io import (
+    QueryChunksResponse,
+    VectorIO,
+    VectorStoreChunkingStrategyStatic,
+    VectorStoreChunkingStrategyStaticConfig,
+)
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
 from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
 from llama_stack.providers.utils.memory.vector_store import (
     content_from_doc,
-    make_overlapped_chunks,
+    parse_data_url,
 )

 from .config import RagToolRuntimeConfig
@ -55,10 +66,12 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
         config: RagToolRuntimeConfig,
         vector_io_api: VectorIO,
         inference_api: Inference,
+        files_api: Files,
     ):
         self.config = config
         self.vector_io_api = vector_io_api
         self.inference_api = inference_api
+        self.files_api = files_api

     async def initialize(self):
         pass
@ -78,27 +91,50 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
         vector_db_id: str,
         chunk_size_in_tokens: int = 512,
     ) -> None:
-        chunks = []
+        if not documents:
+            return
+
         for doc in documents:
-            content = await content_from_doc(doc)
-            # TODO: we should add enrichment here as URLs won't be added to the metadata by default
-            chunks.extend(
-                make_overlapped_chunks(
-                    doc.document_id,
-                    content,
-                    chunk_size_in_tokens,
-                    chunk_size_in_tokens // 4,
-                    doc.metadata,
+            if isinstance(doc.content, URL):
+                if doc.content.uri.startswith("data:"):
+                    parts = parse_data_url(doc.content.uri)
+                    file_data = base64.b64decode(parts["data"]) if parts["is_base64"] else parts["data"].encode()
+                    mime_type = parts["mimetype"]
+                else:
+                    async with httpx.AsyncClient() as client:
+                        response = await client.get(doc.content.uri)
+                    file_data = response.content
+                    mime_type = doc.mime_type or response.headers.get("content-type", "application/octet-stream")
+            else:
+                content_str = await content_from_doc(doc)
+                file_data = content_str.encode("utf-8")
+                mime_type = doc.mime_type or "text/plain"
+
+            file_extension = mimetypes.guess_extension(mime_type) or ".txt"
+            filename = doc.metadata.get("filename", f"{doc.document_id}{file_extension}")
+
+            file_obj = io.BytesIO(file_data)
+            file_obj.name = filename
+
+            upload_file = UploadFile(file=file_obj, filename=filename)
+
+            created_file = await self.files_api.openai_upload_file(
+                file=upload_file, purpose=OpenAIFilePurpose.ASSISTANTS
+            )
+
+            chunking_strategy = VectorStoreChunkingStrategyStatic(
+                static=VectorStoreChunkingStrategyStaticConfig(
+                    max_chunk_size_tokens=chunk_size_in_tokens,
+                    chunk_overlap_tokens=chunk_size_in_tokens // 4,
                 )
             )

-        if not chunks:
-            return
-
-        await self.vector_io_api.insert_chunks(
-            chunks=chunks,
-            vector_db_id=vector_db_id,
-        )
+            await self.vector_io_api.openai_attach_file_to_vector_store(
+                vector_store_id=vector_db_id,
+                file_id=created_file.id,
+                attributes=doc.metadata,
+                chunking_strategy=chunking_strategy,
+            )

     async def query(
         self,
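
The `data:` URI branch of the new ingestion path is easy to exercise in isolation. A rough stand-in for `parse_data_url` using only the standard library (the helper below is a sketch, not the actual implementation):

```python
import base64
import mimetypes


def parse_data_url_sketch(uri: str) -> tuple[bytes, str]:
    # "data:<mimetype>[;base64],<payload>" -> (raw bytes, mime type)
    header, _, payload = uri.partition(",")
    meta = header[len("data:"):]
    is_base64 = meta.endswith(";base64")
    mime_type = meta.removesuffix(";base64") or "text/plain"
    data = base64.b64decode(payload) if is_base64 else payload.encode()
    return data, mime_type


file_data, mime_type = parse_data_url_sketch("data:text/plain;base64,aGVsbG8=")
print(file_data, mimetypes.guess_extension(mime_type))  # b'hello' .txt
```
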
@ -30,11 +30,11 @@ from llama_stack.providers.utils.kvstore.api import KVStore
 from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
 from llama_stack.providers.utils.memory.vector_store import (
     RERANKER_TYPE_RRF,
-    RERANKER_TYPE_WEIGHTED,
     ChunkForDeletion,
     EmbeddingIndex,
     VectorDBWithIndex,
 )
+from llama_stack.providers.utils.vector_io.vector_utils import WeightedInMemoryAggregator

 logger = get_logger(name=__name__, category="vector_io")

@ -66,59 +66,6 @@ def _create_sqlite_connection(db_path):
     return connection


-def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
-    """Normalize scores to [0,1] range using min-max normalization."""
-    if not scores:
-        return {}
-    min_score = min(scores.values())
-    max_score = max(scores.values())
-    score_range = max_score - min_score
-    if score_range > 0:
-        return {doc_id: (score - min_score) / score_range for doc_id, score in scores.items()}
-    return dict.fromkeys(scores, 1.0)
-
-
-def _weighted_rerank(
-    vector_scores: dict[str, float],
-    keyword_scores: dict[str, float],
-    alpha: float = 0.5,
-) -> dict[str, float]:
-    """ReRanker that uses weighted average of scores."""
-    all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
-    normalized_vector_scores = _normalize_scores(vector_scores)
-    normalized_keyword_scores = _normalize_scores(keyword_scores)
-
-    return {
-        doc_id: (alpha * normalized_keyword_scores.get(doc_id, 0.0))
-        + ((1 - alpha) * normalized_vector_scores.get(doc_id, 0.0))
-        for doc_id in all_ids
-    }
-
-
-def _rrf_rerank(
-    vector_scores: dict[str, float],
-    keyword_scores: dict[str, float],
-    impact_factor: float = 60.0,
-) -> dict[str, float]:
-    """ReRanker that uses Reciprocal Rank Fusion."""
-    # Convert scores to ranks
-    vector_ranks = {
-        doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(vector_scores.items(), key=lambda x: x[1], reverse=True))
-    }
-    keyword_ranks = {
-        doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True))
-    }
-
-    all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
-    rrf_scores = {}
-    for doc_id in all_ids:
-        vector_rank = vector_ranks.get(doc_id, float("inf"))
-        keyword_rank = keyword_ranks.get(doc_id, float("inf"))
-        # RRF formula: score = 1/(k + r) where k is impact_factor and r is the rank
-        rrf_scores[doc_id] = (1.0 / (impact_factor + vector_rank)) + (1.0 / (impact_factor + keyword_rank))
-    return rrf_scores
-
-
 def _make_sql_identifier(name: str) -> str:
     return re.sub(r"[^a-zA-Z0-9_]", "_", name)

@ -398,14 +345,10 @@ class SQLiteVecIndex(EmbeddingIndex):
             for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False)
         }

-        # Combine scores using the specified reranker
-        if reranker_type == RERANKER_TYPE_WEIGHTED:
-            alpha = reranker_params.get("alpha", 0.5)
-            combined_scores = _weighted_rerank(vector_scores, keyword_scores, alpha)
-        else:
-            # Default to RRF for None, RRF, or any unknown types
-            impact_factor = reranker_params.get("impact_factor", 60.0)
-            combined_scores = _rrf_rerank(vector_scores, keyword_scores, impact_factor)
+        # Combine scores using the reranking utility
+        combined_scores = WeightedInMemoryAggregator.combine_search_results(
+            vector_scores, keyword_scores, reranker_type, reranker_params
+        )

         # Sort by combined score and get top k results
         sorted_items = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
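
The deleted helpers make the reranking math easy to reproduce. A self-contained sketch of Reciprocal Rank Fusion over two score maps (the document IDs and scores are made up for illustration):

```python
def rrf_combine(vector_scores: dict, keyword_scores: dict, impact_factor: float = 60.0) -> dict:
    """Combine two score maps with RRF: score = sum over sources of 1 / (k + rank)."""
    def ranks(scores: dict) -> dict:
        ordered = sorted(scores, key=scores.get, reverse=True)
        return {doc_id: i + 1 for i, doc_id in enumerate(ordered)}

    v_ranks, k_ranks = ranks(vector_scores), ranks(keyword_scores)
    combined = {}
    for doc_id in set(vector_scores) | set(keyword_scores):
        v = v_ranks.get(doc_id, float("inf"))  # missing from a source -> contributes ~0
        k = k_ranks.get(doc_id, float("inf"))
        combined[doc_id] = 1.0 / (impact_factor + v) + 1.0 / (impact_factor + k)
    return combined


scores = rrf_combine({"a": 0.9, "b": 0.4}, {"b": 7.0, "c": 3.0})
print(sorted(scores, key=scores.get, reverse=True))  # ['b', 'a', 'c'] -- 'b' appears in both
```

The `WeightedInMemoryAggregator.combine_search_results` call that replaces these helpers presumably centralizes the same logic so every vector-io backend shares one implementation.
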
@ -116,7 +116,7 @@ def available_providers() -> list[ProviderSpec]:
         adapter=AdapterSpec(
             adapter_type="fireworks",
             pip_packages=[
-                "fireworks-ai",
+                "fireworks-ai<=0.17.16",
             ],
             module="llama_stack.providers.remote.inference.fireworks",
             config_class="llama_stack.providers.remote.inference.fireworks.FireworksImplConfig",
@ -207,7 +207,7 @@ def available_providers() -> list[ProviderSpec]:
         api=Api.inference,
         adapter=AdapterSpec(
             adapter_type="gemini",
-            pip_packages=["litellm"],
+            pip_packages=["litellm", "openai"],
             module="llama_stack.providers.remote.inference.gemini",
             config_class="llama_stack.providers.remote.inference.gemini.GeminiConfig",
             provider_data_validator="llama_stack.providers.remote.inference.gemini.config.GeminiProviderDataValidator",
@ -248,7 +248,7 @@ Available Models:
         api=Api.inference,
         adapter=AdapterSpec(
             adapter_type="groq",
-            pip_packages=["litellm"],
+            pip_packages=["litellm", "openai"],
             module="llama_stack.providers.remote.inference.groq",
             config_class="llama_stack.providers.remote.inference.groq.GroqConfig",
             provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator",
@ -270,7 +270,7 @@ Available Models:
         api=Api.inference,
         adapter=AdapterSpec(
             adapter_type="sambanova",
-            pip_packages=["litellm"],
+            pip_packages=["litellm", "openai"],
             module="llama_stack.providers.remote.inference.sambanova",
             config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig",
             provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator",
@ -292,7 +292,7 @@ Available Models:
         api=Api.inference,
         adapter=AdapterSpec(
             adapter_type="watsonx",
-            pip_packages=["ibm_watson_machine_learning"],
+            pip_packages=["ibm_watsonx_ai"],
             module="llama_stack.providers.remote.inference.watsonx",
             config_class="llama_stack.providers.remote.inference.watsonx.WatsonXConfig",
             provider_data_validator="llama_stack.providers.remote.inference.watsonx.WatsonXProviderDataValidator",

@ -32,7 +32,7 @@ def available_providers() -> list[ProviderSpec]:
         ],
         module="llama_stack.providers.inline.tool_runtime.rag",
         config_class="llama_stack.providers.inline.tool_runtime.rag.config.RagToolRuntimeConfig",
-        api_dependencies=[Api.vector_io, Api.inference],
+        api_dependencies=[Api.vector_io, Api.inference, Api.files],
         description="RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search.",
     ),
     remote_provider_spec(
@ -404,6 +404,60 @@ That means you'll get fast and efficient vector retrieval.
 - Easy to use
 - Fully integrated with Llama Stack

+There are three implementations of search for PGVectorIndex available:
+
+1. Vector Search:
+- How it works:
+  - Uses PostgreSQL's vector extension (pgvector) to perform similarity search
+  - Compares query embeddings against stored embeddings using Cosine distance or other distance metrics
+  - Eg. SQL query: SELECT document, embedding <=> %s::vector AS distance FROM table ORDER BY distance
+
+- Characteristics:
+  - Semantic understanding - finds documents similar in meaning even if they don't share keywords
+  - Works with high-dimensional vector embeddings (typically 768, 1024, or higher dimensions)
+  - Best for: Finding conceptually related content, handling synonyms, cross-language search
+
+2. Keyword Search
+- How it works:
+  - Uses PostgreSQL's full-text search capabilities with tsvector and ts_rank
+  - Converts text to searchable tokens using to_tsvector('english', text). Default language is English.
+  - Eg. SQL query: SELECT document, ts_rank(tokenized_content, plainto_tsquery('english', %s)) AS score
+
+- Characteristics:
+  - Lexical matching - finds exact keyword matches and variations
+  - Uses GIN (Generalized Inverted Index) for fast text search performance
+  - Scoring: Uses PostgreSQL's ts_rank function for relevance scoring
+  - Best for: Exact term matching, proper names, technical terms, Boolean-style queries
+
+3. Hybrid Search
+- How it works:
+  - Combines both vector and keyword search results
+  - Runs both searches independently, then merges results using configurable reranking
+
+- Two reranking strategies available:
+  - Reciprocal Rank Fusion (RRF) - (default: 60.0)
+  - Weighted Average - (default: 0.5)
+
+- Characteristics:
+  - Best of both worlds: semantic understanding + exact matching
+  - Documents appearing in both searches get boosted scores
+  - Configurable balance between semantic and lexical matching
+  - Best for: General-purpose search where you want both precision and recall
+
+4. Database Schema
+The PGVector implementation stores data optimized for all three search types:
+CREATE TABLE vector_store_xxx (
+    id TEXT PRIMARY KEY,
+    document JSONB,               -- Original document
+    embedding vector(dimension),  -- For vector search
+    content_text TEXT,            -- Raw text content
+    tokenized_content TSVECTOR    -- For keyword search
+);
+
+-- Indexes for performance
+CREATE INDEX content_gin_idx ON table USING GIN(tokenized_content);  -- Keyword search
+-- Vector index created automatically by pgvector
+
 ## Usage

 To use PGVector in your Llama Stack project, follow these steps:
@ -412,6 +466,25 @@ To use PGVector in your Llama Stack project, follow these steps:
 2. Configure your Llama Stack project to use pgvector. (e.g. remote::pgvector).
 3. Start storing and querying vectors.

+## This is an example of how you can set up your environment for using PGVector
+
+1. Export env vars:
+```bash
+export ENABLE_PGVECTOR=true
+export PGVECTOR_HOST=localhost
+export PGVECTOR_PORT=5432
+export PGVECTOR_DB=llamastack
+export PGVECTOR_USER=llamastack
+export PGVECTOR_PASSWORD=llamastack
+```
+
+2. Create DB:
+```bash
+psql -h localhost -U postgres -c "CREATE ROLE llamastack LOGIN PASSWORD 'llamastack';"
+psql -h localhost -U postgres -c "CREATE DATABASE llamastack OWNER llamastack;"
+psql -h localhost -U llamastack -d llamastack -c "CREATE EXTENSION IF NOT EXISTS vector;"
+```
+
 ## Installation

 You can install PGVector using docker:
@ -449,6 +522,7 @@ Weaviate supports:
 - Metadata filtering
 - Multi-modal retrieval
+
 ## Usage

 To use Weaviate in your Llama Stack project, follow these steps:
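
To make the keyword-search description above concrete, here is a small sketch using psycopg2. The table and column names follow the schema sketch in the docs and are assumptions, not a guaranteed layout:

```python
import psycopg2

conn = psycopg2.connect(
    host="localhost", dbname="llamastack", user="llamastack", password="llamastack"
)
query = "retrieval augmented generation"
with conn, conn.cursor() as cur:
    # Full-text keyword search: tsvector match plus ts_rank scoring, as described above.
    cur.execute(
        """
        SELECT id, ts_rank(tokenized_content, plainto_tsquery('english', %s)) AS score
        FROM vector_store_xxx
        WHERE tokenized_content @@ plainto_tsquery('english', %s)
        ORDER BY score DESC
        LIMIT 5
        """,
        (query, query),
    )
    for doc_id, score in cur.fetchall():
        print(doc_id, score)
```
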
@ -6,15 +6,14 @@
 from typing import Any

-from llama_stack.core.datatypes import Api
+from llama_stack.core.datatypes import AccessRule, Api

 from .config import S3FilesImplConfig


-async def get_adapter_impl(config: S3FilesImplConfig, deps: dict[Api, Any]):
+async def get_adapter_impl(config: S3FilesImplConfig, deps: dict[Api, Any], policy: list[AccessRule] | None = None):
     from .files import S3FilesImpl

-    # TODO: authorization policies and user separation
-    impl = S3FilesImpl(config)
+    impl = S3FilesImpl(config, policy or [])
     await impl.initialize()
     return impl
@ -4,9 +4,9 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-import time
 import uuid
-from typing import Annotated
+from datetime import UTC, datetime
+from typing import Annotated, Any

 import boto3
 from botocore.exceptions import BotoCoreError, ClientError, NoCredentialsError
@ -15,14 +15,17 @@ from fastapi import File, Form, Response, UploadFile
 from llama_stack.apis.common.errors import ResourceNotFoundError
 from llama_stack.apis.common.responses import Order
 from llama_stack.apis.files import (
+    ExpiresAfter,
     Files,
     ListOpenAIFileResponse,
     OpenAIFileDeleteResponse,
     OpenAIFileObject,
     OpenAIFilePurpose,
 )
+from llama_stack.core.datatypes import AccessRule
 from llama_stack.providers.utils.sqlstore.api import ColumnDefinition, ColumnType
-from llama_stack.providers.utils.sqlstore.sqlstore import SqlStore, sqlstore_impl
+from llama_stack.providers.utils.sqlstore.authorized_sqlstore import AuthorizedSqlStore
+from llama_stack.providers.utils.sqlstore.sqlstore import sqlstore_impl

 from .config import S3FilesImplConfig

@ -83,22 +86,85 @@ async def _create_bucket_if_not_exists(client: boto3.client, config: S3FilesImpl
         raise RuntimeError(f"Failed to access S3 bucket '{config.bucket_name}': {e}") from e


+def _make_file_object(
+    *,
+    id: str,
+    filename: str,
+    purpose: str,
+    bytes: int,
+    created_at: int,
+    expires_at: int,
+    **kwargs: Any,  # here to ignore any additional fields, e.g. extra fields from AuthorizedSqlStore
+) -> OpenAIFileObject:
+    """
+    Construct an OpenAIFileObject and normalize expires_at.
+
+    If expires_at is greater than the max we treat it as no-expiration and
+    return None for expires_at.
+
+    The OpenAI spec says expires_at type is Integer, but the implementation
+    will return None for no expiration.
+    """
+    obj = OpenAIFileObject(
+        id=id,
+        filename=filename,
+        purpose=OpenAIFilePurpose(purpose),
+        bytes=bytes,
+        created_at=created_at,
+        expires_at=expires_at,
+    )
+
+    if obj.expires_at is not None and obj.expires_at > (obj.created_at + ExpiresAfter.MAX):
+        obj.expires_at = None  # type: ignore
+
+    return obj
+
+
 class S3FilesImpl(Files):
     """S3-based implementation of the Files API."""

-    # TODO: implement expiration, for now a silly offset
-    _SILLY_EXPIRATION_OFFSET = 100 * 365 * 24 * 60 * 60
-
-    def __init__(self, config: S3FilesImplConfig) -> None:
+    def __init__(self, config: S3FilesImplConfig, policy: list[AccessRule]) -> None:
         self._config = config
+        self.policy = policy
         self._client: boto3.client | None = None
-        self._sql_store: SqlStore | None = None
+        self._sql_store: AuthorizedSqlStore | None = None
+
+    def _now(self) -> int:
+        """Return current UTC timestamp as int seconds."""
+        return int(datetime.now(UTC).timestamp())
+
+    async def _get_file(self, file_id: str, return_expired: bool = False) -> dict[str, Any]:
+        where: dict[str, str | dict] = {"id": file_id}
+        if not return_expired:
+            where["expires_at"] = {">": self._now()}
+        if not (row := await self.sql_store.fetch_one("openai_files", policy=self.policy, where=where)):
+            raise ResourceNotFoundError(file_id, "File", "files.list()")
+        return row
+
+    async def _delete_file(self, file_id: str) -> None:
+        """Delete a file from S3 and the database."""
+        try:
+            self.client.delete_object(
+                Bucket=self._config.bucket_name,
+                Key=file_id,
+            )
+        except ClientError as e:
+            if e.response["Error"]["Code"] != "NoSuchKey":
+                raise RuntimeError(f"Failed to delete file from S3: {e}") from e
+
+        await self.sql_store.delete("openai_files", where={"id": file_id})
+
+    async def _delete_if_expired(self, file_id: str) -> None:
+        """If the file exists and is expired, delete it."""
+        if row := await self._get_file(file_id, return_expired=True):
+            if (expires_at := row.get("expires_at")) and expires_at <= self._now():
+                await self._delete_file(file_id)

     async def initialize(self) -> None:
         self._client = _create_s3_client(self._config)
         await _create_bucket_if_not_exists(self._client, self._config)

-        self._sql_store = sqlstore_impl(self._config.metadata_store)
+        self._sql_store = AuthorizedSqlStore(sqlstore_impl(self._config.metadata_store))
         await self._sql_store.create_table(
             "openai_files",
             {
@ -121,7 +187,7 @@ class S3FilesImpl(Files):
         return self._client

     @property
-    def sql_store(self) -> SqlStore:
+    def sql_store(self) -> AuthorizedSqlStore:
         assert self._sql_store is not None, "Provider not initialized"
         return self._sql_store

@ -129,27 +195,47 @@
         self,
         file: Annotated[UploadFile, File()],
         purpose: Annotated[OpenAIFilePurpose, Form()],
+        expires_after_anchor: Annotated[str | None, Form(alias="expires_after[anchor]")] = None,
+        expires_after_seconds: Annotated[int | None, Form(alias="expires_after[seconds]")] = None,
     ) -> OpenAIFileObject:
         file_id = f"file-{uuid.uuid4().hex}"

         filename = getattr(file, "filename", None) or "uploaded_file"

-        created_at = int(time.time())
-        expires_at = created_at + self._SILLY_EXPIRATION_OFFSET
+        created_at = self._now()
+
+        expires_after = None
+        if expires_after_anchor is not None or expires_after_seconds is not None:
+            # we use ExpiresAfter to validate input
+            expires_after = ExpiresAfter(
+                anchor=expires_after_anchor,  # type: ignore[arg-type]
+                seconds=expires_after_seconds,  # type: ignore[arg-type]
+            )
+
+        # the default is no expiration.
+        # to implement no expiration we set an expiration beyond the max.
+        # we'll hide this fact from users when returning the file object.
+        expires_at = created_at + ExpiresAfter.MAX * 42
+        # the default for BATCH files is 30 days, which happens to be the expiration max.
+        if purpose == OpenAIFilePurpose.BATCH:
+            expires_at = created_at + ExpiresAfter.MAX
+
+        if expires_after is not None:
+            expires_at = created_at + expires_after.seconds

         content = await file.read()
         file_size = len(content)

-        await self.sql_store.insert(
-            "openai_files",
-            {
-                "id": file_id,
-                "filename": filename,
-                "purpose": purpose.value,
-                "bytes": file_size,
-                "created_at": created_at,
-                "expires_at": expires_at,
-            },
-        )
+        entry: dict[str, Any] = {
+            "id": file_id,
+            "filename": filename,
+            "purpose": purpose.value,
+            "bytes": file_size,
+            "created_at": created_at,
+            "expires_at": expires_at,
+        }
+
+        await self.sql_store.insert("openai_files", entry)

         try:
             self.client.put_object(
@ -163,14 +249,7 @@
             raise RuntimeError(f"Failed to upload file to S3: {e}") from e

-        return OpenAIFileObject(
-            id=file_id,
-            filename=filename,
-            purpose=purpose,
-            bytes=file_size,
-            created_at=created_at,
-            expires_at=expires_at,
-        )
+        return _make_file_object(**entry)

     async def openai_list_files(
         self,
@ -183,29 +262,20 @@
         if not order:
             order = Order.desc

-        where_conditions = {}
+        where_conditions: dict[str, Any] = {"expires_at": {">": self._now()}}
         if purpose:
             where_conditions["purpose"] = purpose.value

         paginated_result = await self.sql_store.fetch_all(
             table="openai_files",
-            where=where_conditions if where_conditions else None,
+            policy=self.policy,
+            where=where_conditions,
             order_by=[("created_at", order.value)],
             cursor=("id", after) if after else None,
             limit=limit,
         )

-        files = [
-            OpenAIFileObject(
-                id=row["id"],
-                filename=row["filename"],
-                purpose=OpenAIFilePurpose(row["purpose"]),
-                bytes=row["bytes"],
-                created_at=row["created_at"],
-                expires_at=row["expires_at"],
-            )
-            for row in paginated_result.data
-        ]
+        files = [_make_file_object(**row) for row in paginated_result.data]

         return ListOpenAIFileResponse(
             data=files,
@ -216,41 +286,20 @@
         )

     async def openai_retrieve_file(self, file_id: str) -> OpenAIFileObject:
-        row = await self.sql_store.fetch_one("openai_files", where={"id": file_id})
-        if not row:
-            raise ResourceNotFoundError(file_id, "File", "files.list()")
-
-        return OpenAIFileObject(
-            id=row["id"],
-            filename=row["filename"],
-            purpose=OpenAIFilePurpose(row["purpose"]),
-            bytes=row["bytes"],
-            created_at=row["created_at"],
-            expires_at=row["expires_at"],
-        )
+        await self._delete_if_expired(file_id)
+        row = await self._get_file(file_id)
+        return _make_file_object(**row)

     async def openai_delete_file(self, file_id: str) -> OpenAIFileDeleteResponse:
-        row = await self.sql_store.fetch_one("openai_files", where={"id": file_id})
-        if not row:
-            raise ResourceNotFoundError(file_id, "File", "files.list()")
-
-        try:
-            self.client.delete_object(
-                Bucket=self._config.bucket_name,
-                Key=row["id"],
-            )
-        except ClientError as e:
-            if e.response["Error"]["Code"] != "NoSuchKey":
-                raise RuntimeError(f"Failed to delete file from S3: {e}") from e
-
-        await self.sql_store.delete("openai_files", where={"id": file_id})
-
+        await self._delete_if_expired(file_id)
+        _ = await self._get_file(file_id)  # raises if not found
+        await self._delete_file(file_id)
         return OpenAIFileDeleteResponse(id=file_id, deleted=True)

     async def openai_retrieve_file_content(self, file_id: str) -> Response:
-        row = await self.sql_store.fetch_one("openai_files", where={"id": file_id})
-        if not row:
-            raise ResourceNotFoundError(file_id, "File", "files.list()")
+        await self._delete_if_expired(file_id)
+        row = await self._get_file(file_id)

         try:
             response = self.client.get_object(
@ -261,7 +310,7 @@ class S3FilesImpl(Files):
             content = response["Body"].read()
         except ClientError as e:
             if e.response["Error"]["Code"] == "NoSuchKey":
-                await self.sql_store.delete("openai_files", where={"id": file_id})
+                await self._delete_file(file_id)
                 raise ResourceNotFoundError(file_id, "File", "files.list()") from e
             raise RuntimeError(f"Failed to download file from S3: {e}") from e
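
Since the new `expires_after[anchor]` / `expires_after[seconds]` form aliases follow OpenAI's Files API encoding, an upload with an expiration can be sketched with plain multipart form fields. The endpoint path and port are assumptions for a local deployment:

```python
import requests

resp = requests.post(
    "http://localhost:8321/v1/openai/v1/files",  # hypothetical local endpoint
    files={"file": ("notes.txt", b"hello", "text/plain")},
    data={
        "purpose": "assistants",
        "expires_after[anchor]": "created_at",  # the anchor the spec supports
        "expires_after[seconds]": "3600",       # expire one hour after creation
    },
)
print(resp.status_code, resp.json())
```
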
@ -5,12 +5,13 @@
|
||||||
# the root directory of this source tree.
|
# the root directory of this source tree.
|
||||||
|
|
||||||
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
|
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
|
||||||
|
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
|
||||||
|
|
||||||
from .config import GeminiConfig
|
from .config import GeminiConfig
|
||||||
from .models import MODEL_ENTRIES
|
from .models import MODEL_ENTRIES
|
||||||
|
|
||||||
|
|
||||||
class GeminiInferenceAdapter(LiteLLMOpenAIMixin):
|
class GeminiInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
|
||||||
def __init__(self, config: GeminiConfig) -> None:
|
def __init__(self, config: GeminiConfig) -> None:
|
||||||
LiteLLMOpenAIMixin.__init__(
|
LiteLLMOpenAIMixin.__init__(
|
||||||
self,
|
self,
|
||||||
|
@ -21,6 +22,11 @@ class GeminiInferenceAdapter(LiteLLMOpenAIMixin):
|
||||||
)
|
)
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
|
get_api_key = LiteLLMOpenAIMixin.get_api_key
|
||||||
|
|
||||||
|
def get_base_url(self):
|
||||||
|
return "https://generativelanguage.googleapis.com/v1beta/openai/"
|
||||||
|
|
||||||
async def initialize(self) -> None:
|
async def initialize(self) -> None:
|
||||||
await super().initialize()
|
await super().initialize()
|
||||||
|
|
||||||
|
|
|
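
For context on the mixin wiring: `OpenAIMixin` builds its OpenAI-compatible client from the two hooks the adapter now supplies. A rough sketch of how such a mixin consumes them, simplified and not the real `OpenAIMixin` implementation:

```python
from openai import AsyncOpenAI


class SketchOpenAIMixin:
    """Simplified stand-in for OpenAIMixin: subclasses supply credentials and endpoint."""

    def get_api_key(self) -> str:
        raise NotImplementedError

    def get_base_url(self) -> str:
        raise NotImplementedError

    @property
    def client(self) -> AsyncOpenAI:
        # Any OpenAI-compatible endpoint works, e.g. Gemini's
        # https://generativelanguage.googleapis.com/v1beta/openai/
        return AsyncOpenAI(api_key=self.get_api_key(), base_url=self.get_base_url())
```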

@@ -4,30 +4,15 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from collections.abc import AsyncIterator
-from typing import Any
-
-from openai import AsyncOpenAI
-
-from llama_stack.apis.inference import (
-    OpenAIChatCompletion,
-    OpenAIChatCompletionChunk,
-    OpenAIChoiceDelta,
-    OpenAIChunkChoice,
-    OpenAIMessageParam,
-    OpenAIResponseFormatParam,
-    OpenAISystemMessageParam,
-)
 from llama_stack.providers.remote.inference.groq.config import GroqConfig
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
-from llama_stack.providers.utils.inference.openai_compat import (
-    prepare_openai_completion_params,
-)
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

 from .models import MODEL_ENTRIES


-class GroqInferenceAdapter(LiteLLMOpenAIMixin):
+class GroqInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
     _config: GroqConfig

     def __init__(self, config: GroqConfig):
@@ -40,122 +25,14 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
         )
         self.config = config

+    # Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
+    get_api_key = LiteLLMOpenAIMixin.get_api_key
+
+    def get_base_url(self) -> str:
+        return f"{self.config.url}/openai/v1"
+
     async def initialize(self):
         await super().initialize()

     async def shutdown(self):
         await super().shutdown()

-    def _get_openai_client(self) -> AsyncOpenAI:
-        return AsyncOpenAI(
-            base_url=f"{self.config.url}/openai/v1",
-            api_key=self.get_api_key(),
-        )
-
-    async def openai_chat_completion(
-        self,
-        model: str,
-        messages: list[OpenAIMessageParam],
-        frequency_penalty: float | None = None,
-        function_call: str | dict[str, Any] | None = None,
-        functions: list[dict[str, Any]] | None = None,
-        logit_bias: dict[str, float] | None = None,
-        logprobs: bool | None = None,
-        max_completion_tokens: int | None = None,
-        max_tokens: int | None = None,
-        n: int | None = None,
-        parallel_tool_calls: bool | None = None,
-        presence_penalty: float | None = None,
-        response_format: OpenAIResponseFormatParam | None = None,
-        seed: int | None = None,
-        stop: str | list[str] | None = None,
-        stream: bool | None = None,
-        stream_options: dict[str, Any] | None = None,
-        temperature: float | None = None,
-        tool_choice: str | dict[str, Any] | None = None,
-        tools: list[dict[str, Any]] | None = None,
-        top_logprobs: int | None = None,
-        top_p: float | None = None,
-        user: str | None = None,
-    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        model_obj = await self.model_store.get_model(model)
-
-        # Groq does not support json_schema response format, so we need to convert it to json_object
-        if response_format and response_format.type == "json_schema":
-            response_format.type = "json_object"
-            schema = response_format.json_schema.get("schema", {})
-            response_format.json_schema = None
-            json_instructions = f"\nYour response should be a JSON object that matches the following schema: {schema}"
-            if messages and messages[0].role == "system":
-                messages[0].content = messages[0].content + json_instructions
-            else:
-                messages.insert(0, OpenAISystemMessageParam(content=json_instructions))
-
-        # Groq returns a 400 error if tools are provided but none are called
-        # So, set tool_choice to "required" to attempt to force a call
-        if tools and (not tool_choice or tool_choice == "auto"):
-            tool_choice = "required"
-
-        params = await prepare_openai_completion_params(
-            model=model_obj.provider_resource_id,
-            messages=messages,
-            frequency_penalty=frequency_penalty,
-            function_call=function_call,
-            functions=functions,
-            logit_bias=logit_bias,
-            logprobs=logprobs,
-            max_completion_tokens=max_completion_tokens,
-            max_tokens=max_tokens,
-            n=n,
-            parallel_tool_calls=parallel_tool_calls,
-            presence_penalty=presence_penalty,
-            response_format=response_format,
-            seed=seed,
-            stop=stop,
-            stream=stream,
-            stream_options=stream_options,
-            temperature=temperature,
-            tool_choice=tool_choice,
-            tools=tools,
-            top_logprobs=top_logprobs,
-            top_p=top_p,
-            user=user,
-        )
-
-        # Groq does not support streaming requests that set response_format
-        fake_stream = False
-        if stream and response_format:
-            params["stream"] = False
-            fake_stream = True
-
-        response = await self._get_openai_client().chat.completions.create(**params)
-
-        if fake_stream:
-            chunk_choices = []
-            for choice in response.choices:
-                delta = OpenAIChoiceDelta(
-                    content=choice.message.content,
-                    role=choice.message.role,
-                    tool_calls=choice.message.tool_calls,
-                )
-                chunk_choice = OpenAIChunkChoice(
-                    delta=delta,
-                    finish_reason=choice.finish_reason,
-                    index=choice.index,
-                    logprobs=None,
-                )
-                chunk_choices.append(chunk_choice)
-            chunk = OpenAIChatCompletionChunk(
-                id=response.id,
-                choices=chunk_choices,
-                object="chat.completion.chunk",
-                created=response.created,
-                model=response.model,
-            )
-
-            async def _fake_stream_generator():
-                yield chunk
-
-            return _fake_stream_generator()
-        else:
-            return response

@@ -41,10 +41,10 @@ client.initialize()

 ### Create Completion

-> Note on Completion API
->
-> The hosted NVIDIA Llama NIMs (e.g., `meta-llama/Llama-3.1-8B-Instruct`) with ```NVIDIA_BASE_URL="https://integrate.api.nvidia.com"``` does not support the ```completion``` method, while the locally deployed NIM does.
+The following example shows how to create a completion for an NVIDIA NIM.
+
+> [!NOTE]
+> The hosted NVIDIA Llama NIMs (for example ```meta-llama/Llama-3.1-8B-Instruct```) that have ```NVIDIA_BASE_URL="https://integrate.api.nvidia.com"``` do not support the ```completion``` method, while locally deployed NIMs do.

 ```python
 response = client.inference.completion(
@@ -60,6 +60,8 @@ print(f"Response: {response.content}")

 ### Create Chat Completion

+The following example shows how to create a chat completion for an NVIDIA NIM.
+
 ```python
 response = client.inference.chat_completion(
     model_id="meta-llama/Llama-3.1-8B-Instruct",
@@ -82,6 +84,9 @@ print(f"Response: {response.completion_message.content}")
 ```

 ### Tool Calling Example ###

+The following example shows how to do tool calling for an NVIDIA NIM.
+
 ```python
 from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition

@@ -117,6 +122,9 @@ if tool_response.completion_message.tool_calls:
 ```

 ### Structured Output Example

+The following example shows how to do structured output for an NVIDIA NIM.
+
 ```python
 from llama_stack.apis.inference import JsonSchemaResponseFormat, ResponseFormatType

@@ -149,8 +157,10 @@ print(f"Structured Response: {structured_response.completion_message.content}")
 ```

 ### Create Embeddings
-> Note on OpenAI embeddings compatibility
->
+
+The following example shows how to create embeddings for an NVIDIA NIM.
+
+> [!NOTE]
 > NVIDIA asymmetric embedding models (e.g., `nvidia/llama-3.2-nv-embedqa-1b-v2`) require an `input_type` parameter not present in the standard OpenAI embeddings API. The NVIDIA Inference Adapter automatically sets `input_type="query"` when using the OpenAI-compatible embeddings endpoint for NVIDIA. For passage embeddings, use the `embeddings` API with `task_type="document"`.

 ```python
@@ -161,3 +171,41 @@ response = client.inference.embeddings(
 )
 print(f"Embeddings: {response.embeddings}")
 ```
+
+### Vision Language Models Example
+
+The following example shows how to run vision inference by using an NVIDIA NIM.
+
+```python
+def load_image_as_base64(image_path):
+    with open(image_path, "rb") as image_file:
+        img_bytes = image_file.read()
+        return base64.b64encode(img_bytes).decode("utf-8")
+
+
+image_path = {path_to_the_image}
+demo_image_b64 = load_image_as_base64(image_path)
+
+vlm_response = client.inference.chat_completion(
+    model_id="nvidia/vila",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": {
+                        "data": demo_image_b64,
+                    },
+                },
+                {
+                    "type": "text",
+                    "text": "Please describe what you see in this image in detail.",
+                },
+            ],
+        }
+    ],
+)
+
+print(f"VLM Response: {vlm_response.completion_message.content}")
+```

@@ -55,6 +55,10 @@ MODEL_ENTRIES = [
         "meta/llama-3.3-70b-instruct",
         CoreModelId.llama3_3_70b_instruct.value,
     ),
+    ProviderModelEntry(
+        provider_model_id="nvidia/vila",
+        model_type=ModelType.llm,
+    ),
     # NeMo Retriever Text Embedding models -
     #
     # https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html

@@ -118,10 +118,10 @@ class OllamaInferenceAdapter(

     async def initialize(self) -> None:
         logger.info(f"checking connectivity to Ollama at `{self.config.url}`...")
-        health_response = await self.health()
-        if health_response["status"] == HealthStatus.ERROR:
+        r = await self.health()
+        if r["status"] == HealthStatus.ERROR:
             logger.warning(
-                "Ollama Server is not running, make sure to start it using `ollama serve` in a separate terminal"
+                f"Ollama Server is not running (message: {r['message']}). Make sure to start it using `ollama serve` in a separate terminal"
             )

     async def should_refresh_models(self) -> bool:
@@ -156,7 +156,7 @@ class OllamaInferenceAdapter(
             ),
             Model(
                 identifier="nomic-embed-text",
-                provider_resource_id="nomic-embed-text",
+                provider_resource_id="nomic-embed-text:latest",
                 provider_id=provider_id,
                 metadata={
                     "embedding_dimension": 768,

@@ -4,13 +4,26 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

 from .config import SambaNovaImplConfig
 from .models import MODEL_ENTRIES


-class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
+class SambaNovaInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
+    """
+    SambaNova Inference Adapter for Llama Stack.
+
+    Note: The inheritance order is important here. OpenAIMixin must come before
+    LiteLLMOpenAIMixin to ensure that OpenAIMixin.check_model_availability()
+    is used instead of LiteLLMOpenAIMixin.check_model_availability().
+
+    - OpenAIMixin.check_model_availability() queries the /v1/models to check if a model exists
+    - LiteLLMOpenAIMixin.check_model_availability() checks the static registry within LiteLLM
+    """
+
     def __init__(self, config: SambaNovaImplConfig):
         self.config = config
         self.environment_available_models = []
@@ -24,3 +37,14 @@ class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
             download_images=True,  # SambaNova requires base64 image encoding
             json_schema_strict=False,  # SambaNova doesn't support strict=True yet
         )
+
+    # Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
+    get_api_key = LiteLLMOpenAIMixin.get_api_key
+
+    def get_base_url(self) -> str:
+        """
+        Get the base URL for OpenAI mixin.
+
+        :return: The SambaNova base URL
+        """
+        return self.config.url
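
The docstring's point about inheritance order comes straight from Python's C3 linearization: with `class Adapter(OpenAIMixin, LiteLLMOpenAIMixin)`, attribute lookup walks the MRO left to right. A standalone illustration with dummy classes (not the real mixins):

```python
class LiteLLMStyle:
    def check_model_availability(self, model: str) -> bool:
        return False  # e.g. consult a static registry


class OpenAIStyle:
    def check_model_availability(self, model: str) -> bool:
        return True  # e.g. query the provider's /v1/models endpoint


class Adapter(OpenAIStyle, LiteLLMStyle):
    pass


# MRO: Adapter -> OpenAIStyle -> LiteLLMStyle -> object
print([c.__name__ for c in Adapter.__mro__])
print(Adapter().check_model_availability("some-model"))  # True: OpenAIStyle wins
```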

@@ -7,8 +7,8 @@
 from collections.abc import AsyncGenerator, AsyncIterator
 from typing import Any

-from ibm_watson_machine_learning.foundation_models import Model
-from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
+from ibm_watsonx_ai.foundation_models import Model
+from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
 from openai import AsyncOpenAI

 from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem

@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import heapq
 from typing import Any

 import psycopg2
@@ -23,6 +24,9 @@ from llama_stack.apis.vector_io import (
 )
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
+from llama_stack.providers.utils.inference.prompt_adapter import (
+    interleaved_content_as_str,
+)
 from llama_stack.providers.utils.kvstore import kvstore_impl
 from llama_stack.providers.utils.kvstore.api import KVStore
 from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
@@ -31,6 +35,7 @@ from llama_stack.providers.utils.memory.vector_store import (
     EmbeddingIndex,
     VectorDBWithIndex,
 )
+from llama_stack.providers.utils.vector_io.vector_utils import WeightedInMemoryAggregator, sanitize_collection_name

 from .config import PGVectorVectorIOConfig

@@ -72,25 +77,63 @@ def load_models(cur, cls):


 class PGVectorIndex(EmbeddingIndex):
-    def __init__(self, vector_db: VectorDB, dimension: int, conn, kvstore: KVStore | None = None):
-        self.conn = conn
-        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-            # Sanitize the table name by replacing hyphens with underscores
-            # SQL doesn't allow hyphens in table names, and vector_db.identifier may contain hyphens
-            # when created with patterns like "test-vector-db-{uuid4()}"
-            sanitized_identifier = vector_db.identifier.replace("-", "_")
-            self.table_name = f"vector_store_{sanitized_identifier}"
-            self.kvstore = kvstore
-
-            cur.execute(
-                f"""
-                CREATE TABLE IF NOT EXISTS {self.table_name} (
-                    id TEXT PRIMARY KEY,
-                    document JSONB,
-                    embedding vector({dimension})
-                )
-            """
-            )
+    # reference: https://github.com/pgvector/pgvector?tab=readme-ov-file#querying
+    PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION: dict[str, str] = {
+        "L2": "<->",
+        "L1": "<+>",
+        "COSINE": "<=>",
+        "INNER_PRODUCT": "<#>",
+        "HAMMING": "<~>",
+        "JACCARD": "<%>",
+    }
+
+    def __init__(
+        self,
+        vector_db: VectorDB,
+        dimension: int,
+        conn: psycopg2.extensions.connection,
+        kvstore: KVStore | None = None,
+        distance_metric: str = "COSINE",
+    ):
+        self.vector_db = vector_db
+        self.dimension = dimension
+        self.conn = conn
+        self.kvstore = kvstore
+        self.check_distance_metric_availability(distance_metric)
+        self.distance_metric = distance_metric
+        self.table_name = None
+
+    async def initialize(self) -> None:
+        try:
+            with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+                # Sanitize the table name by replacing hyphens with underscores
+                # SQL doesn't allow hyphens in table names, and vector_db.identifier may contain hyphens
+                # when created with patterns like "test-vector-db-{uuid4()}"
+                sanitized_identifier = sanitize_collection_name(self.vector_db.identifier)
+                self.table_name = f"vs_{sanitized_identifier}"
+
+                cur.execute(
+                    f"""
+                    CREATE TABLE IF NOT EXISTS {self.table_name} (
+                        id TEXT PRIMARY KEY,
+                        document JSONB,
+                        embedding vector({self.dimension}),
+                        content_text TEXT,
+                        tokenized_content TSVECTOR
+                    )
+                """
+                )
+
+                # Create GIN index for full-text search performance
+                cur.execute(
+                    f"""
+                    CREATE INDEX IF NOT EXISTS {self.table_name}_content_gin_idx
+                    ON {self.table_name} USING GIN(tokenized_content)
+                """
+                )
+        except Exception as e:
+            log.exception(f"Error creating PGVectorIndex for vector_db: {self.vector_db.identifier}")
+            raise RuntimeError(f"Error creating PGVectorIndex for vector_db: {self.vector_db.identifier}") from e
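
The operator table maps directly onto pgvector's query syntax: with the default `COSINE` metric, queries order by the `<=>` operator. A small sketch of the SQL that `query_vector` ends up issuing (table name is illustrative; the operator is interpolated while the embedding and limit remain bound parameters):

```python
distance_metric_ops = {"L2": "<->", "COSINE": "<=>", "INNER_PRODUCT": "<#>"}


def build_vector_query(table_name: str, distance_metric: str = "COSINE") -> str:
    # Mirrors the f-string in query_vector below: only the operator is
    # interpolated; the embedding and LIMIT stay as %s placeholders.
    op = distance_metric_ops[distance_metric]
    return (
        f"SELECT document, embedding {op} %s::vector AS distance "
        f"FROM {table_name} ORDER BY distance LIMIT %s"
    )


print(build_vector_query("vs_my_store"))
```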
     async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
         assert len(chunks) == len(embeddings), (
@@ -99,29 +142,49 @@ class PGVectorIndex(EmbeddingIndex):

         values = []
         for i, chunk in enumerate(chunks):
+            content_text = interleaved_content_as_str(chunk.content)
             values.append(
                 (
                     f"{chunk.chunk_id}",
                     Json(chunk.model_dump()),
                     embeddings[i].tolist(),
+                    content_text,
+                    content_text,  # Pass content_text twice - once for content_text column, once for to_tsvector function. Eg. to_tsvector(content_text) = tokenized_content
                 )
             )

         query = sql.SQL(
             f"""
-            INSERT INTO {self.table_name} (id, document, embedding)
+            INSERT INTO {self.table_name} (id, document, embedding, content_text, tokenized_content)
             VALUES %s
-            ON CONFLICT (id) DO UPDATE SET embedding = EXCLUDED.embedding, document = EXCLUDED.document
+            ON CONFLICT (id) DO UPDATE SET
+                embedding = EXCLUDED.embedding,
+                document = EXCLUDED.document,
+                content_text = EXCLUDED.content_text,
+                tokenized_content = EXCLUDED.tokenized_content
         """
         )
         with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-            execute_values(cur, query, values, template="(%s, %s, %s::vector)")
+            execute_values(cur, query, values, template="(%s, %s, %s::vector, %s, to_tsvector('english', %s))")

     async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
+        """
+        Performs vector similarity search using PostgreSQL's search function. Default distance metric is COSINE.
+
+        Args:
+            embedding: The query embedding vector
+            k: Number of results to return
+            score_threshold: Minimum similarity score threshold
+
+        Returns:
+            QueryChunksResponse with combined results
+        """
+        pgvector_search_function = self.get_pgvector_search_function()
+
         with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
             cur.execute(
                 f"""
-                SELECT document, embedding <-> %s::vector AS distance
+                SELECT document, embedding {pgvector_search_function} %s::vector AS distance
                 FROM {self.table_name}
                 ORDER BY distance
                 LIMIT %s
@@ -147,7 +210,40 @@ class PGVectorIndex(EmbeddingIndex):
         k: int,
         score_threshold: float,
     ) -> QueryChunksResponse:
-        raise NotImplementedError("Keyword search is not supported in PGVector")
+        """
+        Performs keyword-based search using PostgreSQL's full-text search with ts_rank scoring.
+
+        Args:
+            query_string: The text query for keyword search
+            k: Number of results to return
+            score_threshold: Minimum similarity score threshold
+
+        Returns:
+            QueryChunksResponse with combined results
+        """
+        with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+            # Use plainto_tsquery to handle user input safely and ts_rank for relevance scoring
+            cur.execute(
+                f"""
+                SELECT document, ts_rank(tokenized_content, plainto_tsquery('english', %s)) AS score
+                FROM {self.table_name}
+                WHERE tokenized_content @@ plainto_tsquery('english', %s)
+                ORDER BY score DESC
+                LIMIT %s
+                """,
+                (query_string, query_string, k),
+            )
+            results = cur.fetchall()
+
+            chunks = []
+            scores = []
+            for doc, score in results:
+                if score < score_threshold:
+                    continue
+                chunks.append(Chunk(**doc))
+                scores.append(float(score))
+
+            return QueryChunksResponse(chunks=chunks, scores=scores)

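
As a quick illustration of the Postgres primitives used above: `to_tsvector` normalizes the stored text, `plainto_tsquery` safely parses the user's query, and `ts_rank` scores the match. A minimal psycopg2 session demonstrating the same pattern against a throwaway table (assumes a reachable Postgres instance with the connection string shown):

```python
import psycopg2

# Purely illustrative; adjust the DSN for your environment.
conn = psycopg2.connect("dbname=test user=postgres")
with conn.cursor() as cur:
    cur.execute("CREATE TEMP TABLE demo (doc TEXT, tokens TSVECTOR)")
    cur.execute(
        "INSERT INTO demo VALUES (%s, to_tsvector('english', %s))",
        ("the quick brown fox", "the quick brown fox"),
    )
    cur.execute(
        """
        SELECT doc, ts_rank(tokens, plainto_tsquery('english', %s)) AS score
        FROM demo
        WHERE tokens @@ plainto_tsquery('english', %s)
        ORDER BY score DESC
        """,
        ("quick fox", "quick fox"),
    )
    print(cur.fetchall())
```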
     async def query_hybrid(
         self,
@@ -158,7 +254,59 @@ class PGVectorIndex(EmbeddingIndex):
         reranker_type: str,
         reranker_params: dict[str, Any] | None = None,
     ) -> QueryChunksResponse:
-        raise NotImplementedError("Hybrid search is not supported in PGVector")
+        """
+        Hybrid search combining vector similarity and keyword search using configurable reranking.
+
+        Args:
+            embedding: The query embedding vector
+            query_string: The text query for keyword search
+            k: Number of results to return
+            score_threshold: Minimum similarity score threshold
+            reranker_type: Type of reranker to use ("rrf" or "weighted")
+            reranker_params: Parameters for the reranker
+
+        Returns:
+            QueryChunksResponse with combined results
+        """
+        if reranker_params is None:
+            reranker_params = {}
+
+        # Get results from both search methods
+        vector_response = await self.query_vector(embedding, k, score_threshold)
+        keyword_response = await self.query_keyword(query_string, k, score_threshold)
+
+        # Convert responses to score dictionaries using chunk_id
+        vector_scores = {
+            chunk.chunk_id: score for chunk, score in zip(vector_response.chunks, vector_response.scores, strict=False)
+        }
+        keyword_scores = {
+            chunk.chunk_id: score
+            for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False)
+        }
+
+        # Combine scores using the reranking utility
+        combined_scores = WeightedInMemoryAggregator.combine_search_results(
+            vector_scores, keyword_scores, reranker_type, reranker_params
+        )
+
+        # Efficient top-k selection because it only tracks the k best candidates it's seen so far
+        top_k_items = heapq.nlargest(k, combined_scores.items(), key=lambda x: x[1])
+
+        # Filter by score threshold
+        filtered_items = [(doc_id, score) for doc_id, score in top_k_items if score >= score_threshold]
+
+        # Create a map of chunk_id to chunk for both responses
+        chunk_map = {c.chunk_id: c for c in vector_response.chunks + keyword_response.chunks}
+
+        # Use the map to look up chunks by their IDs
+        chunks = []
+        scores = []
+        for doc_id, score in filtered_items:
+            if doc_id in chunk_map:
+                chunks.append(chunk_map[doc_id])
+                scores.append(score)
+
+        return QueryChunksResponse(chunks=chunks, scores=scores)

     async def delete(self):
         with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
@@ -170,6 +318,25 @@ class PGVectorIndex(EmbeddingIndex):
         with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
             cur.execute(f"DELETE FROM {self.table_name} WHERE id = ANY(%s)", (chunk_ids,))

+    def get_pgvector_search_function(self) -> str:
+        return self.PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION[self.distance_metric]
+
+    def check_distance_metric_availability(self, distance_metric: str) -> None:
+        """Check if the distance metric is supported by PGVector.
+
+        Args:
+            distance_metric: The distance metric to check
+
+        Raises:
+            ValueError: If the distance metric is not supported
+        """
+        if distance_metric not in self.PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION:
+            supported_metrics = list(self.PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION.keys())
+            raise ValueError(
+                f"Distance metric '{distance_metric}' is not supported by PGVector. "
+                f"Supported metrics are: {', '.join(supported_metrics)}"
+            )
+

 class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
     def __init__(
@@ -185,8 +352,8 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoco
         self.files_api = files_api
         self.kvstore: KVStore | None = None
         self.vector_db_store = None
-        self.openai_vector_store: dict[str, dict[str, Any]] = {}
-        self.metadatadata_collection_name = "openai_vector_stores_metadata"
+        self.openai_vector_stores: dict[str, dict[str, Any]] = {}
+        self.metadata_collection_name = "openai_vector_stores_metadata"

     async def initialize(self) -> None:
         log.info(f"Initializing PGVector memory adapter with config: {self.config}")
@@ -233,9 +400,13 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoco
         upsert_models(self.conn, [(vector_db.identifier, vector_db)])

         # Create and cache the PGVector index table for the vector DB
+        pgvector_index = PGVectorIndex(
+            vector_db=vector_db, dimension=vector_db.embedding_dimension, conn=self.conn, kvstore=self.kvstore
+        )
+        await pgvector_index.initialize()
         index = VectorDBWithIndex(
             vector_db,
-            index=PGVectorIndex(vector_db, vector_db.embedding_dimension, self.conn, kvstore=self.kvstore),
+            index=pgvector_index,
             inference_api=self.inference_api,
         )
         self.cache[vector_db.identifier] = index
@@ -272,8 +443,15 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoco
         if vector_db_id in self.cache:
             return self.cache[vector_db_id]

+        if self.vector_db_store is None:
+            raise VectorStoreNotFoundError(vector_db_id)
+
         vector_db = await self.vector_db_store.get_vector_db(vector_db_id)
+        if not vector_db:
+            raise VectorStoreNotFoundError(vector_db_id)
+
         index = PGVectorIndex(vector_db, vector_db.embedding_dimension, self.conn)
+        await index.initialize()
         self.cache[vector_db_id] = VectorDBWithIndex(vector_db, index, self.inference_api)
         return self.cache[vector_db_id]

@@ -4,53 +4,55 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import os
+
 from pydantic import BaseModel, Field


 class BedrockBaseConfig(BaseModel):
     aws_access_key_id: str | None = Field(
-        default=None,
+        default_factory=lambda: os.getenv("AWS_ACCESS_KEY_ID"),
         description="The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID",
     )
     aws_secret_access_key: str | None = Field(
-        default=None,
+        default_factory=lambda: os.getenv("AWS_SECRET_ACCESS_KEY"),
         description="The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY",
     )
     aws_session_token: str | None = Field(
-        default=None,
+        default_factory=lambda: os.getenv("AWS_SESSION_TOKEN"),
         description="The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN",
     )
     region_name: str | None = Field(
-        default=None,
+        default_factory=lambda: os.getenv("AWS_DEFAULT_REGION"),
         description="The default AWS Region to use, for example, us-west-1 or us-west-2."
        "Default use environment variable: AWS_DEFAULT_REGION",
     )
     profile_name: str | None = Field(
-        default=None,
+        default_factory=lambda: os.getenv("AWS_PROFILE"),
         description="The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE",
     )
     total_max_attempts: int | None = Field(
-        default=None,
+        default_factory=lambda: int(val) if (val := os.getenv("AWS_MAX_ATTEMPTS")) else None,
         description="An integer representing the maximum number of attempts that will be made for a single request, "
         "including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS",
     )
     retry_mode: str | None = Field(
-        default=None,
+        default_factory=lambda: os.getenv("AWS_RETRY_MODE"),
         description="A string representing the type of retries Boto3 will perform."
         "Default use environment variable: AWS_RETRY_MODE",
     )
     connect_timeout: float | None = Field(
-        default=60,
+        default_factory=lambda: float(os.getenv("AWS_CONNECT_TIMEOUT", "60")),
         description="The time in seconds till a timeout exception is thrown when attempting to make a connection. "
         "The default is 60 seconds.",
     )
     read_timeout: float | None = Field(
-        default=60,
+        default_factory=lambda: float(os.getenv("AWS_READ_TIMEOUT", "60")),
         description="The time in seconds till a timeout exception is thrown when attempting to read from a connection."
         "The default is 60 seconds.",
     )
     session_ttl: int | None = Field(
-        default=3600,
+        default_factory=lambda: int(os.getenv("AWS_SESSION_TTL", "3600")),
         description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).",
     )
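
The switch from `default=None` to `default_factory` matters because a plain `default` is evaluated once at class-definition time, while a `default_factory` runs at each instantiation, so later changes to the environment are picked up. A small self-contained demonstration of the difference:

```python
import os

from pydantic import BaseModel, Field


class Config(BaseModel):
    # Read the environment at instantiation time, not at import time.
    region: str | None = Field(default_factory=lambda: os.getenv("AWS_DEFAULT_REGION"))


os.environ["AWS_DEFAULT_REGION"] = "us-west-2"
print(Config().region)  # "us-west-2" - reflects the current environment

del os.environ["AWS_DEFAULT_REGION"]
print(Config().region)  # None
```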

@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+import asyncio
 import base64
 import struct
 from typing import TYPE_CHECKING
@@ -43,9 +44,11 @@ class SentenceTransformerEmbeddingMixin:
         task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         model = await self.model_store.get_model(model_id)
-        embedding_model = self._load_sentence_transformer_model(model.provider_resource_id)
-        embeddings = embedding_model.encode(
-            [interleaved_content_as_str(content) for content in contents], show_progress_bar=False
+        embedding_model = await self._load_sentence_transformer_model(model.provider_resource_id)
+        embeddings = await asyncio.to_thread(
+            embedding_model.encode,
+            [interleaved_content_as_str(content) for content in contents],
+            show_progress_bar=False,
         )
         return EmbeddingsResponse(embeddings=embeddings)

@@ -64,8 +67,8 @@ class SentenceTransformerEmbeddingMixin:

         # Get the model and generate embeddings
         model_obj = await self.model_store.get_model(model)
-        embedding_model = self._load_sentence_transformer_model(model_obj.provider_resource_id)
-        embeddings = embedding_model.encode(input_list, show_progress_bar=False)
+        embedding_model = await self._load_sentence_transformer_model(model_obj.provider_resource_id)
+        embeddings = await asyncio.to_thread(embedding_model.encode, input_list, show_progress_bar=False)

         # Convert embeddings to the requested format
         data = []
@@ -93,7 +96,7 @@ class SentenceTransformerEmbeddingMixin:
             usage=usage,
         )

-    def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer":
+    async def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer":
         global EMBEDDING_MODELS

         loaded_model = EMBEDDING_MODELS.get(model)
@@ -101,8 +104,12 @@ class SentenceTransformerEmbeddingMixin:
             return loaded_model

         log.info(f"Loading sentence transformer for {model}...")
-        from sentence_transformers import SentenceTransformer

-        loaded_model = SentenceTransformer(model)
+        def _load_model():
+            from sentence_transformers import SentenceTransformer
+
+            return SentenceTransformer(model)
+
+        loaded_model = await asyncio.to_thread(_load_model)
         EMBEDDING_MODELS[model] = loaded_model
         return loaded_model
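
`asyncio.to_thread` runs a blocking callable in the default thread-pool executor, so the event loop stays responsive while the model loads or encodes. A minimal sketch of the pattern, with `time.sleep` standing in for the blocking SentenceTransformer work:

```python
import asyncio
import time


def blocking_encode(texts: list[str]) -> list[str]:
    time.sleep(1)  # stand-in for CPU-bound embedding work
    return [t.upper() for t in texts]


async def main() -> None:
    # The event loop can keep serving other tasks during the sleep.
    result = await asyncio.to_thread(blocking_encode, ["hello", "world"])
    print(result)


asyncio.run(main())
```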

@@ -294,12 +294,12 @@ class VectorDBWithIndex:
             _validate_embedding(c.embedding, i, self.vector_db.embedding_dimension)

         if chunks_to_embed:
-            resp = await self.inference_api.embeddings(
+            resp = await self.inference_api.openai_embeddings(
                 self.vector_db.embedding_model,
                 [c.content for c in chunks_to_embed],
             )
-            for c, embedding in zip(chunks_to_embed, resp.embeddings, strict=False):
-                c.embedding = embedding
+            for c, data in zip(chunks_to_embed, resp.data, strict=False):
+                c.embedding = data.embedding

         embeddings = np.array([c.embedding for c in chunks], dtype=np.float32)
         await self.index.add_chunks(chunks, embeddings)
@@ -334,8 +334,8 @@ class VectorDBWithIndex:
         if mode == "keyword":
             return await self.index.query_keyword(query_string, k, score_threshold)

-        embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string])
-        query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
+        embeddings_response = await self.inference_api.openai_embeddings(self.vector_db.embedding_model, [query_string])
+        query_vector = np.array(embeddings_response.data[0].embedding, dtype=np.float32)
         if mode == "hybrid":
             return await self.index.query_hybrid(
                 query_vector, query_string, k, score_threshold, reranker_type, reranker_params
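
The call-site change reflects a response-shape difference: the legacy `embeddings` API returned a bare `embeddings` list, while the OpenAI-style response nests each vector under `data[i].embedding`. Roughly:

```python
# Legacy response shape (llama-stack embeddings API):
#   resp.embeddings[0]      -> [0.1, 0.2, ...]
# OpenAI-compatible response shape:
#   resp.data[0].embedding  -> [0.1, 0.2, ...]


def first_vector(resp) -> list[float]:
    # Works for the OpenAI-style response object used after this change.
    return resp.data[0].embedding
```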

@@ -23,6 +23,7 @@ from sqlalchemy import (
 )
 from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
 from sqlalchemy.ext.asyncio.engine import AsyncEngine
+from sqlalchemy.sql.elements import ColumnElement

 from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.log import get_logger
@@ -43,6 +44,30 @@ TYPE_MAPPING: dict[ColumnType, Any] = {
 }


+def _build_where_expr(column: ColumnElement, value: Any) -> ColumnElement:
+    """Return a SQLAlchemy expression for a where condition.
+
+    `value` may be a simple scalar (equality) or a mapping like {">": 123}.
+    The returned expression is a SQLAlchemy ColumnElement usable in query.where(...).
+    """
+    if isinstance(value, Mapping):
+        if len(value) != 1:
+            raise ValueError(f"Operator mapping must have a single operator, got: {value}")
+        op, operand = next(iter(value.items()))
+        if op == "==" or op == "=":
+            return column == operand
+        if op == ">":
+            return column > operand
+        if op == "<":
+            return column < operand
+        if op == ">=":
+            return column >= operand
+        if op == "<=":
+            return column <= operand
+        raise ValueError(f"Unsupported operator '{op}' in where mapping")
+    return column == value
+
+
 class SqlAlchemySqlStoreImpl(SqlStore):
     def __init__(self, config: SqlAlchemySqlStoreConfig):
         self.config = config
@@ -111,7 +136,7 @@ class SqlAlchemySqlStoreImpl(SqlStore):

         if where:
             for key, value in where.items():
-                query = query.where(table_obj.c[key] == value)
+                query = query.where(_build_where_expr(table_obj.c[key], value))

         if where_sql:
             query = query.where(text(where_sql))
@@ -222,7 +247,7 @@ class SqlAlchemySqlStoreImpl(SqlStore):
         async with self.async_session() as session:
             stmt = self.metadata.tables[table].update()
             for key, value in where.items():
-                stmt = stmt.where(self.metadata.tables[table].c[key] == value)
+                stmt = stmt.where(_build_where_expr(self.metadata.tables[table].c[key], value))
             await session.execute(stmt, data)
             await session.commit()

@@ -233,7 +258,7 @@ class SqlAlchemySqlStoreImpl(SqlStore):
         async with self.async_session() as session:
             stmt = self.metadata.tables[table].delete()
             for key, value in where.items():
-                stmt = stmt.where(self.metadata.tables[table].c[key] == value)
+                stmt = stmt.where(_build_where_expr(self.metadata.tables[table].c[key], value))
             await session.execute(stmt)
             await session.commit()
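
With the operator mapping in place, callers can express range conditions as plain dicts. A hedged usage sketch against a SQLAlchemy table, assuming `_build_where_expr` from the hunk above is in scope (the table definition here is illustrative):

```python
import time

from sqlalchemy import Column, Integer, MetaData, String, Table, select

metadata = MetaData()
files = Table(
    "openai_files",
    metadata,
    Column("id", String, primary_key=True),
    Column("expires_at", Integer),
)

# Scalar -> equality; mapping -> comparison operator.
eq_expr = _build_where_expr(files.c.id, "file-123")
lt_expr = _build_where_expr(files.c.expires_at, {"<": int(time.time())})

query = select(files).where(eq_expr).where(lt_expr)
print(query)  # SELECT ... WHERE id = :id_1 AND expires_at < :expires_at_1
```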

@@ -67,6 +67,38 @@ async def client_wrapper(endpoint: str, headers: dict[str, str]) -> AsyncGenerat
                 raise AuthenticationRequiredError(exc) from exc
             if i == len(connection_strategies) - 1:
                 raise
+        except* httpx.ConnectError as eg:
+            # Connection refused, server down, network unreachable
+            if i == len(connection_strategies) - 1:
+                error_msg = f"Failed to connect to MCP server at {endpoint}: Connection refused"
+                logger.error(f"MCP connection error: {error_msg}")
+                raise ConnectionError(error_msg) from eg
+            else:
+                logger.warning(
+                    f"failed to connect to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
+                )
+        except* httpx.TimeoutException as eg:
+            # Request timeout, server too slow
+            if i == len(connection_strategies) - 1:
+                error_msg = f"MCP server at {endpoint} timed out"
+                logger.error(f"MCP timeout error: {error_msg}")
+                raise TimeoutError(error_msg) from eg
+            else:
+                logger.warning(
+                    f"MCP server at {endpoint} timed out via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
+                )
+        except* httpx.RequestError as eg:
+            # DNS resolution failures, network errors, invalid URLs
+            if i == len(connection_strategies) - 1:
+                # Get the first exception's message for the error string
+                exc_msg = str(eg.exceptions[0]) if eg.exceptions else "Unknown error"
+                error_msg = f"Network error connecting to MCP server at {endpoint}: {exc_msg}"
+                logger.error(f"MCP network error: {error_msg}")
+                raise ConnectionError(error_msg) from eg
+            else:
+                logger.warning(
+                    f"network error connecting to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
+                )
         except* McpError:
             if i < len(connection_strategies) - 1:
                 logger.warning(
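
The `except*` syntax (Python 3.11+) matches exceptions inside an `ExceptionGroup`, which is what structured-concurrency task groups raise when connection attempts fail; each handler receives the matching sub-group as `eg`. A standalone illustration:

```python
# Requires Python 3.11+ for except* / ExceptionGroup.
def run() -> None:
    try:
        raise ExceptionGroup(
            "connection attempt failed",
            [ConnectionError("refused"), TimeoutError("too slow")],
        )
    except* ConnectionError as eg:
        # Receives only the ConnectionError slice of the group.
        print("connection errors:", [str(e) for e in eg.exceptions])
    except* TimeoutError as eg:
        # Both handlers can fire for the same group, each on its slice.
        print("timeouts:", [str(e) for e in eg.exceptions])


run()
```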

@@ -37,3 +37,122 @@ def sanitize_collection_name(name: str, weaviate_format=False) -> str:
     else:
         s = proper_case(re.sub(r"[^a-zA-Z0-9]", "", name))
     return s
+
+
+class WeightedInMemoryAggregator:
+    @staticmethod
+    def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
+        """
+        Normalize scores to 0-1 range using min-max normalization.
+
+        Args:
+            scores: dictionary of scores with document IDs as keys and scores as values
+
+        Returns:
+            Normalized scores with document IDs as keys and normalized scores as values
+        """
+        if not scores:
+            return {}
+        min_score, max_score = min(scores.values()), max(scores.values())
+        score_range = max_score - min_score
+        if score_range > 0:
+            return {doc_id: (score - min_score) / score_range for doc_id, score in scores.items()}
+        return dict.fromkeys(scores, 1.0)
+
+    @staticmethod
+    def weighted_rerank(
+        vector_scores: dict[str, float],
+        keyword_scores: dict[str, float],
+        alpha: float = 0.5,
+    ) -> dict[str, float]:
+        """
+        Rerank via weighted average of scores.
+
+        Args:
+            vector_scores: scores from vector search
+            keyword_scores: scores from keyword search
+            alpha: weight factor between 0 and 1 (default: 0.5)
+                   0 = keyword only, 1 = vector only, 0.5 = equal weight
+
+        Returns:
+            All unique document IDs with weighted combined scores
+        """
+        all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
+        normalized_vector_scores = WeightedInMemoryAggregator._normalize_scores(vector_scores)
+        normalized_keyword_scores = WeightedInMemoryAggregator._normalize_scores(keyword_scores)
+
+        # Weighted formula: score = (1-alpha) * keyword_score + alpha * vector_score
+        # alpha=0 means keyword only, alpha=1 means vector only
+        return {
+            doc_id: ((1 - alpha) * normalized_keyword_scores.get(doc_id, 0.0))
+            + (alpha * normalized_vector_scores.get(doc_id, 0.0))
+            for doc_id in all_ids
+        }
+
+    @staticmethod
+    def rrf_rerank(
+        vector_scores: dict[str, float],
+        keyword_scores: dict[str, float],
+        impact_factor: float = 60.0,
+    ) -> dict[str, float]:
+        """
+        Rerank via Reciprocal Rank Fusion.
+
+        Args:
+            vector_scores: scores from vector search
+            keyword_scores: scores from keyword search
+            impact_factor: impact factor for RRF (default: 60.0)
+
+        Returns:
+            All unique document IDs with RRF combined scores
+        """
+        # Convert scores to ranks
+        vector_ranks = {
+            doc_id: i + 1
+            for i, (doc_id, _) in enumerate(sorted(vector_scores.items(), key=lambda x: x[1], reverse=True))
+        }
+        keyword_ranks = {
+            doc_id: i + 1
+            for i, (doc_id, _) in enumerate(sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True))
+        }
+
+        all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
+        rrf_scores = {}
+        for doc_id in all_ids:
+            vector_rank = vector_ranks.get(doc_id, float("inf"))
+            keyword_rank = keyword_ranks.get(doc_id, float("inf"))
+
+            # RRF formula: score = 1/(k + r) where k is impact_factor (default: 60.0) and r is the rank
+            rrf_scores[doc_id] = (1.0 / (impact_factor + vector_rank)) + (1.0 / (impact_factor + keyword_rank))
+        return rrf_scores
+
+    @staticmethod
+    def combine_search_results(
+        vector_scores: dict[str, float],
+        keyword_scores: dict[str, float],
+        reranker_type: str = "rrf",
+        reranker_params: dict[str, float] | None = None,
+    ) -> dict[str, float]:
+        """
+        Combine vector and keyword search results using specified reranking strategy.
+
+        Args:
+            vector_scores: scores from vector search
+            keyword_scores: scores from keyword search
+            reranker_type: type of reranker to use (default: RERANKER_TYPE_RRF)
+            reranker_params: parameters for the reranker
+
+        Returns:
+            All unique document IDs with combined scores
+        """
+        if reranker_params is None:
+            reranker_params = {}
+
+        if reranker_type == "weighted":
+            alpha = reranker_params.get("alpha", 0.5)
+            return WeightedInMemoryAggregator.weighted_rerank(vector_scores, keyword_scores, alpha)
+        else:
+            # Default to RRF for None, RRF, or any unknown types
+            impact_factor = reranker_params.get("impact_factor", 60.0)
+            return WeightedInMemoryAggregator.rrf_rerank(vector_scores, keyword_scores, impact_factor)
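
To make the RRF formula concrete: a document ranked 1st by vector search and 3rd by keyword search scores `1/(60+1) + 1/(60+3) ≈ 0.0323`, while a document found by only one method gets an infinite rank on the other side, so that term vanishes. A small worked example using the class from the hunk above:

```python
vector_scores = {"doc-a": 0.92, "doc-b": 0.55}
keyword_scores = {"doc-a": 1.4, "doc-c": 0.9}

rrf = WeightedInMemoryAggregator.rrf_rerank(vector_scores, keyword_scores)
# doc-a: 1/(60+1) + 1/(60+1) ~= 0.0328  (ranked #1 by both searches)
# doc-b: 1/(60+2) + 0                    (keyword rank is inf -> term vanishes)
print(sorted(rrf.items(), key=lambda x: x[1], reverse=True))

weighted = WeightedInMemoryAggregator.weighted_rerank(vector_scores, keyword_scores, alpha=0.5)
print(weighted["doc-a"])  # 1.0: doc-a is max-normalized to 1 on both sides
```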
@ -30,6 +30,9 @@ from openai.types.completion_choice import CompletionChoice
|
||||||
CompletionChoice.model_fields["finish_reason"].annotation = Literal["stop", "length", "content_filter"] | None
|
CompletionChoice.model_fields["finish_reason"].annotation = Literal["stop", "length", "content_filter"] | None
|
||||||
CompletionChoice.model_rebuild()
|
CompletionChoice.model_rebuild()
|
||||||
|
|
||||||
|
REPO_ROOT = Path(__file__).parent.parent.parent
|
||||||
|
DEFAULT_STORAGE_DIR = REPO_ROOT / "tests/integration/recordings"
|
||||||
|
|
||||||
|
|
||||||
class InferenceMode(StrEnum):
|
class InferenceMode(StrEnum):
|
||||||
LIVE = "live"
|
LIVE = "live"
|
||||||
|
@ -51,7 +54,7 @@ def normalize_request(method: str, url: str, headers: dict[str, Any], body: dict
|
||||||
|
|
||||||
|
|
||||||
def get_inference_mode() -> InferenceMode:
|
def get_inference_mode() -> InferenceMode:
|
||||||
return InferenceMode(os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "live").lower())
|
return InferenceMode(os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "replay").lower())
|
||||||
|
|
||||||
|
|
||||||
def setup_inference_recording():
|
def setup_inference_recording():
|
||||||
|
@ -60,28 +63,18 @@ def setup_inference_recording():
|
||||||
to increase their reliability and reduce reliance on expensive, external services.
|
to increase their reliability and reduce reliance on expensive, external services.
|
||||||
|
|
||||||
Currently, this is only supported for OpenAI and Ollama clients. These should cover the vast majority of use cases.
|
Currently, this is only supported for OpenAI and Ollama clients. These should cover the vast majority of use cases.
|
||||||
Calls to the /models endpoint are not currently trapped. We probably need to add support for this.
|
|
||||||
|
|
||||||
Two environment variables are required:
|
Two environment variables are supported:
|
||||||
- LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', or 'replay'.
|
- LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', or 'replay'. Default is 'replay'.
|
||||||
- LLAMA_STACK_TEST_RECORDING_DIR: The directory to store the recordings in.
|
- LLAMA_STACK_TEST_RECORDING_DIR: The directory to store the recordings in. Default is 'tests/integration/recordings'.
|
||||||
|
|
||||||
The recordings are stored in a SQLite database and a JSON file for each request. The SQLite database is used to
|
The recordings are stored as JSON files.
|
||||||
quickly find the correct recording for a given request. The JSON files are used to store the request and response
|
|
||||||
bodies.
|
|
||||||
"""
|
"""
|
||||||
mode = get_inference_mode()
|
mode = get_inference_mode()
|
||||||
|
|
||||||
if mode not in InferenceMode:
|
|
||||||
raise ValueError(f"Invalid LLAMA_STACK_TEST_INFERENCE_MODE: {mode}. Must be 'live', 'record', or 'replay'")
|
|
||||||
|
|
||||||
if mode == InferenceMode.LIVE:
|
if mode == InferenceMode.LIVE:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if "LLAMA_STACK_TEST_RECORDING_DIR" not in os.environ:
|
storage_dir = os.environ.get("LLAMA_STACK_TEST_RECORDING_DIR", DEFAULT_STORAGE_DIR)
|
||||||
raise ValueError("LLAMA_STACK_TEST_RECORDING_DIR must be set for recording or replaying")
|
|
||||||
storage_dir = os.environ["LLAMA_STACK_TEST_RECORDING_DIR"]
|
|
||||||
|
|
||||||
return inference_recording(mode=mode, storage_dir=storage_dir)
|
return inference_recording(mode=mode, storage_dir=storage_dir)
|
||||||
|
|
||||||
|
|
||||||
|
@@ -134,8 +127,8 @@ class ResponseStorage:
    def store_recording(self, request_hash: str, request: dict[str, Any], response: dict[str, Any]):
        """Store a request/response pair."""
        # Generate unique response filename
        response_file = f"{request_hash[:12]}.json"
        response_path = self.responses_dir / response_file
        short_hash = request_hash[:12]
        response_file = f"{short_hash}.json"

        # Serialize response body if needed
        serialized_response = dict(response)
@@ -147,6 +140,14 @@ class ResponseStorage:
            # Handle single response
            serialized_response["body"] = _serialize_response(serialized_response["body"])

        # If this is an Ollama /api/tags recording, include models digest in filename to distinguish variants
        endpoint = request.get("endpoint")
        if endpoint in ("/api/tags", "/v1/models"):
            digest = _model_identifiers_digest(endpoint, response)
            response_file = f"models-{short_hash}-{digest}.json"

        response_path = self.responses_dir / response_file

        # Save response to JSON file
        with open(response_path, "w") as f:
            json.dump({"request": request, "response": serialized_response}, f, indent=2)
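A hedged sketch of driving these recording modes from a test process; the env var names and defaults come from the docstring above, while the test body itself is invented:

    import os

    # Both lines are optional now: "replay" is the default mode and the
    # recording dir defaults to tests/integration/recordings.
    os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = "record"  # or "live" / "replay"
    os.environ["LLAMA_STACK_TEST_RECORDING_DIR"] = "tests/integration/recordings"

    ctx = setup_inference_recording()  # returns None in "live" mode
    if ctx is not None:
        with ctx:
            pass  # OpenAI/Ollama calls made here are recorded (or replayed)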
@@ -161,19 +162,85 @@ class ResponseStorage:
        if not response_path.exists():
            return None

        with open(response_path) as f:
            data = json.load(f)

        # Deserialize response body if needed
        if "response" in data and "body" in data["response"]:
            if isinstance(data["response"]["body"], list):
                # Handle streaming responses
                data["response"]["body"] = [_deserialize_response(chunk) for chunk in data["response"]["body"]]
            else:
                # Handle single response
                data["response"]["body"] = _deserialize_response(data["response"]["body"])

        return cast(dict[str, Any], data)
        return _recording_from_file(response_path)

    def _model_list_responses(self, short_hash: str) -> list[dict[str, Any]]:
        results: list[dict[str, Any]] = []
        for path in self.responses_dir.glob(f"models-{short_hash}-*.json"):
            data = _recording_from_file(path)
            results.append(data)
        return results


def _recording_from_file(response_path) -> dict[str, Any]:
    with open(response_path) as f:
        data = json.load(f)

    # Deserialize response body if needed
    if "response" in data and "body" in data["response"]:
        if isinstance(data["response"]["body"], list):
            # Handle streaming responses
            data["response"]["body"] = [_deserialize_response(chunk) for chunk in data["response"]["body"]]
        else:
            # Handle single response
            data["response"]["body"] = _deserialize_response(data["response"]["body"])

    return cast(dict[str, Any], data)


def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str:
    def _extract_model_identifiers():
        """Extract a stable set of identifiers for model-list endpoints.

        Supported endpoints:
        - '/api/tags' (Ollama): response body has 'models': [ { name/model/digest/id/... }, ... ]
        - '/v1/models' (OpenAI): response body has 'data': [ { id: ... }, ... ]
        Returns a list of unique identifiers or None if structure doesn't match.
        """
        body = response["body"]
        if endpoint == "/api/tags":
            items = body.get("models")
            idents = [m.model for m in items]
        else:
            items = body.get("data")
            idents = [m.id for m in items]
        return sorted(set(idents))

    identifiers = _extract_model_identifiers()
    return hashlib.sha1(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8]


def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Return a single, unioned recording for supported model-list endpoints."""
    seen: dict[str, dict[str, Any]] = {}
    for rec in records:
        body = rec["response"]["body"]
        if endpoint == "/api/tags":
            items = body.models
        elif endpoint == "/v1/models":
            items = body.data
        else:
            items = []

        for m in items:
            if endpoint == "/v1/models":
                key = m.id
            else:
                key = m.model
            seen[key] = m

    ordered = [seen[k] for k in sorted(seen.keys())]
    canonical = records[0]
    canonical_req = canonical.get("request", {})
    if isinstance(canonical_req, dict):
        canonical_req["endpoint"] = endpoint
    if endpoint == "/v1/models":
        body = {"data": ordered, "object": "list"}
    else:
        from ollama import ListResponse

        body = ListResponse(models=ordered)
    return {"request": canonical_req, "response": {"body": body, "is_streaming": False}}
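To make the digest scheme above concrete, a small sketch (the model IDs are invented; the hashing mirrors _model_identifiers_digest from this diff):

    import hashlib

    # The digest is a short, order-insensitive fingerprint of the model IDs.
    idents = sorted({"all-minilm:l6-v2", "llama3.2:3b"})
    digest = hashlib.sha1("|".join(idents).encode("utf-8")).hexdigest()[:8]
    # Recordings that saw the same model set share one "models-<hash>-<digest>.json"
    # file; different sets get distinct files, and replay unions them back
    # together via _combine_model_list_responses().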
async def _patched_inference_method(original_method, self, client_type, endpoint, *args, **kwargs):
@@ -195,8 +262,6 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
        raise ValueError(f"Unknown client type: {client_type}")

    url = base_url.rstrip("/") + endpoint

    # Normalize request for matching
    method = "POST"
    headers = {}
    body = kwargs
@@ -204,7 +269,12 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
    request_hash = normalize_request(method, url, headers, body)

    if _current_mode == InferenceMode.REPLAY:
        recording = _current_storage.find_recording(request_hash)
        # Special handling for model-list endpoints: return union of all responses
        if endpoint in ("/api/tags", "/v1/models"):
            records = _current_storage._model_list_responses(request_hash[:12])
            recording = _combine_model_list_responses(endpoint, records)
        else:
            recording = _current_storage.find_recording(request_hash)
        if recording:
            response_body = recording["response"]["body"]

@@ -274,12 +344,14 @@ def patch_inference_clients():
    from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
    from openai.resources.completions import AsyncCompletions
    from openai.resources.embeddings import AsyncEmbeddings
    from openai.resources.models import AsyncModels

    # Store original methods for both OpenAI and Ollama clients
    _original_methods = {
        "chat_completions_create": AsyncChatCompletions.create,
        "completions_create": AsyncCompletions.create,
        "embeddings_create": AsyncEmbeddings.create,
        "models_list": AsyncModels.list,
        "ollama_generate": OllamaAsyncClient.generate,
        "ollama_chat": OllamaAsyncClient.chat,
        "ollama_embed": OllamaAsyncClient.embed,
@@ -304,10 +376,16 @@ def patch_inference_clients():
            _original_methods["embeddings_create"], self, "openai", "/v1/embeddings", *args, **kwargs
        )

    async def patched_models_list(self, *args, **kwargs):
        return await _patched_inference_method(
            _original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs
        )

    # Apply OpenAI patches
    AsyncChatCompletions.create = patched_chat_completions_create
    AsyncCompletions.create = patched_completions_create
    AsyncEmbeddings.create = patched_embeddings_create
    AsyncModels.list = patched_models_list

    # Create patched methods for Ollama client
    async def patched_ollama_generate(self, *args, **kwargs):
@@ -361,11 +439,13 @@ def unpatch_inference_clients():
    from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
    from openai.resources.completions import AsyncCompletions
    from openai.resources.embeddings import AsyncEmbeddings
    from openai.resources.models import AsyncModels

    # Restore OpenAI client methods
    AsyncChatCompletions.create = _original_methods["chat_completions_create"]
    AsyncCompletions.create = _original_methods["completions_create"]
    AsyncEmbeddings.create = _original_methods["embeddings_create"]
    AsyncModels.list = _original_methods["models_list"]

    # Restore Ollama client methods if they were patched
    OllamaAsyncClient.generate = _original_methods["ollama_generate"]
@@ -379,16 +459,10 @@ def unpatch_inference_clients():


@contextmanager
def inference_recording(mode: str = "live", storage_dir: str | Path | None = None) -> Generator[None, None, None]:
def inference_recording(mode: str, storage_dir: str | Path | None = None) -> Generator[None, None, None]:
    """Context manager for inference recording/replaying."""
    global _current_mode, _current_storage

    # Set defaults
    if storage_dir is None:
        storage_dir_path = Path.home() / ".llama" / "recordings"
    else:
        storage_dir_path = Path(storage_dir)

    # Store previous state
    prev_mode = _current_mode
    prev_storage = _current_storage
@@ -397,7 +471,9 @@ def inference_recording(mode: str = "live", storage_dir: str | Path | None = Non
    _current_mode = mode

    if mode in ["record", "replay"]:
        _current_storage = ResponseStorage(storage_dir_path)
        if storage_dir is None:
            raise ValueError("storage_dir is required for record and replay modes")
        _current_storage = ResponseStorage(Path(storage_dir))
        patch_inference_clients()

    yield
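A direct-use sketch of the tightened signature above (the path is invented): mode no longer defaults to "live", and storage_dir is now mandatory for record/replay:

    from pathlib import Path

    # Raises ValueError if storage_dir is omitted in "record" or "replay" mode.
    with inference_recording(mode="replay", storage_dir=Path("tests/integration/recordings")):
        pass  # patched OpenAI/Ollama clients serve responses from the recordings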
305
llama_stack/ui/package-lock.json
generated
@@ -14,11 +14,11 @@
        "@radix-ui/react-select": "^2.2.5",
        "@radix-ui/react-separator": "^1.1.7",
        "@radix-ui/react-slot": "^1.2.3",
        "@radix-ui/react-tooltip": "^1.2.6",
        "@radix-ui/react-tooltip": "^1.2.8",
        "class-variance-authority": "^0.7.1",
        "clsx": "^2.1.1",
        "framer-motion": "^11.18.2",
        "framer-motion": "^12.23.12",
        "llama-stack-client": "^0.2.19",
        "llama-stack-client": "^0.2.20",
        "lucide-react": "^0.510.0",
        "next": "15.3.3",
        "next-auth": "^4.24.11",
@@ -39,16 +39,16 @@
        "@testing-library/jest-dom": "^6.8.0",
        "@testing-library/react": "^16.3.0",
        "@types/jest": "^29.5.14",
        "@types/node": "^20",
        "@types/node": "^24",
        "@types/react": "^19",
        "@types/react-dom": "^19",
        "eslint": "^9",
        "eslint-config-next": "15.3.2",
        "eslint-config-next": "15.5.2",
        "eslint-config-prettier": "^10.1.8",
        "eslint-plugin-prettier": "^5.5.4",
        "jest": "^29.7.0",
        "jest-environment-jsdom": "^29.7.0",
        "prettier": "3.5.3",
        "prettier": "3.6.2",
        "tailwindcss": "^4",
        "ts-node": "^10.9.2",
        "tw-animate-css": "^1.2.9",
@@ -1854,9 +1854,9 @@
      "integrity": "sha512-OdiMrzCl2Xi0VTjiQQUK0Xh7bJHnOuET2s+3V+Y40WJBAXrJeGA3f+I8MZJ/YQ3mVGi5XGR1L66oFlgqXhQ4Vw=="
    },
    "node_modules/@next/eslint-plugin-next": {
      "version": "15.3.2",
      "version": "15.5.2",
      "resolved": "https://registry.npmjs.org/@next/eslint-plugin-next/-/eslint-plugin-next-15.3.2.tgz",
      "resolved": "https://registry.npmjs.org/@next/eslint-plugin-next/-/eslint-plugin-next-15.5.2.tgz",
      "integrity": "sha512-ijVRTXBgnHT33aWnDtmlG+LJD+5vhc9AKTJPquGG5NKXjpKNjc62woIhFtrAcWdBobt8kqjCoaJ0q6sDQoX7aQ==",
      "integrity": "sha512-lkLrRVxcftuOsJNhWatf1P2hNVfh98k/omQHrCEPPriUypR6RcS13IvLdIrEvkm9AH2Nu2YpR5vLqBuy6twH3Q==",
      "dev": true,
      "license": "MIT",
      "dependencies": {
@@ -2861,29 +2861,6 @@
        }
      }
    },
    "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-visually-hidden": {
      "version": "1.2.3",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-visually-hidden/-/react-visually-hidden-1.2.3.tgz",
      "integrity": "sha512-pzJq12tEaaIhqjbzpCuv/OypJY/BPavOofm+dbab+MHLajy277+1lLm6JFcGgF5eskJ6mquGirhXY2GD/8u8Ug==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-primitive": "2.1.3"
      },
      "peerDependencies": {
        "@types/react": "*",
        "@types/react-dom": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        },
        "@types/react-dom": {
          "optional": true
        }
      }
    },
    "node_modules/@radix-ui/react-separator": {
      "version": "1.1.7",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-separator/-/react-separator-1.1.7.tgz",
@@ -2949,23 +2926,23 @@
        }
      }
    },
    "node_modules/@radix-ui/react-tooltip": {
      "version": "1.2.6",
      "version": "1.2.8",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-tooltip/-/react-tooltip-1.2.6.tgz",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-tooltip/-/react-tooltip-1.2.8.tgz",
      "integrity": "sha512-zYb+9dc9tkoN2JjBDIIPLQtk3gGyz8FMKoqYTb8EMVQ5a5hBcdHPECrsZVI4NpPAUOixhkoqg7Hj5ry5USowfA==",
      "integrity": "sha512-tY7sVt1yL9ozIxvmbtN5qtmH2krXcBCfjEiCgKGLqunJHvgvZG2Pcl2oQ3kbcZARb1BGEHdkLzcYGO8ynVlieg==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/primitive": "1.1.2",
        "@radix-ui/primitive": "1.1.3",
        "@radix-ui/react-compose-refs": "1.1.2",
        "@radix-ui/react-context": "1.1.2",
        "@radix-ui/react-dismissable-layer": "1.1.9",
        "@radix-ui/react-dismissable-layer": "1.1.11",
        "@radix-ui/react-id": "1.1.1",
        "@radix-ui/react-popper": "1.2.6",
        "@radix-ui/react-popper": "1.2.8",
        "@radix-ui/react-portal": "1.1.8",
        "@radix-ui/react-portal": "1.1.9",
        "@radix-ui/react-presence": "1.1.4",
        "@radix-ui/react-presence": "1.1.5",
        "@radix-ui/react-primitive": "2.1.2",
        "@radix-ui/react-primitive": "2.1.3",
        "@radix-ui/react-slot": "1.2.2",
        "@radix-ui/react-slot": "1.2.3",
        "@radix-ui/react-use-controllable-state": "1.2.2",
        "@radix-ui/react-visually-hidden": "1.2.2"
        "@radix-ui/react-visually-hidden": "1.2.3"
      },
      "peerDependencies": {
        "@types/react": "*",
@@ -2982,21 +2959,162 @@
        }
      }
    },
    "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-slot": {
      "version": "1.2.2",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.2.tgz",
      "integrity": "sha512-y7TBO4xN4Y94FvcWIOIh18fM4R1A8S4q1jhoz4PNzOoHsFcN8pogcFmZrTYAm4F9VRUrWP/Mw7xSKybIeRI+CQ==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-compose-refs": "1.1.2"
      },
      "peerDependencies": {
        "@types/react": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        }
      }
    },
    "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/primitive": {
      "version": "1.1.3",
      "resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz",
      "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
      "license": "MIT"
    },
    "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-arrow": {
      "version": "1.1.7",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.7.tgz",
      "integrity": "sha512-F+M1tLhO+mlQaOWspE8Wstg+z6PwxwRd8oQ8IXceWz92kfAmalTRf0EjrouQeo7QssEPfCn05B4Ihs1K9WQ/7w==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-primitive": "2.1.3"
      },
      "peerDependencies": {
        "@types/react": "*",
        "@types/react-dom": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        },
        "@types/react-dom": {
          "optional": true
        }
      }
    },
    "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-dismissable-layer": {
      "version": "1.1.11",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz",
      "integrity": "sha512-Nqcp+t5cTB8BinFkZgXiMJniQH0PsUt2k51FUhbdfeKvc4ACcG2uQniY/8+h1Yv6Kza4Q7lD7PQV0z0oicE0Mg==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/primitive": "1.1.3",
        "@radix-ui/react-compose-refs": "1.1.2",
        "@radix-ui/react-primitive": "2.1.3",
        "@radix-ui/react-use-callback-ref": "1.1.1",
        "@radix-ui/react-use-escape-keydown": "1.1.1"
      },
      "peerDependencies": {
        "@types/react": "*",
        "@types/react-dom": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        },
        "@types/react-dom": {
          "optional": true
        }
      }
    },
    "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-popper": {
      "version": "1.2.8",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.8.tgz",
      "integrity": "sha512-0NJQ4LFFUuWkE7Oxf0htBKS6zLkkjBH+hM1uk7Ng705ReR8m/uelduy1DBo0PyBXPKVnBA6YBlU94MBGXrSBCw==",
      "license": "MIT",
      "dependencies": {
        "@floating-ui/react-dom": "^2.0.0",
        "@radix-ui/react-arrow": "1.1.7",
        "@radix-ui/react-compose-refs": "1.1.2",
        "@radix-ui/react-context": "1.1.2",
        "@radix-ui/react-primitive": "2.1.3",
        "@radix-ui/react-use-callback-ref": "1.1.1",
        "@radix-ui/react-use-layout-effect": "1.1.1",
        "@radix-ui/react-use-rect": "1.1.1",
        "@radix-ui/react-use-size": "1.1.1",
        "@radix-ui/rect": "1.1.1"
      },
      "peerDependencies": {
        "@types/react": "*",
        "@types/react-dom": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        },
        "@types/react-dom": {
          "optional": true
        }
      }
    },
    "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-portal": {
      "version": "1.1.9",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz",
      "integrity": "sha512-bpIxvq03if6UNwXZ+HTK71JLh4APvnXntDc6XOX8UVq4XQOVl7lwok0AvIl+b8zgCw3fSaVTZMpAPPagXbKmHQ==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-primitive": "2.1.3",
        "@radix-ui/react-use-layout-effect": "1.1.1"
      },
      "peerDependencies": {
        "@types/react": "*",
        "@types/react-dom": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        },
        "@types/react-dom": {
          "optional": true
        }
      }
    },
    "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-presence": {
      "version": "1.1.5",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-presence/-/react-presence-1.1.5.tgz",
      "integrity": "sha512-/jfEwNDdQVBCNvjkGit4h6pMOzq8bHkopq458dPt2lMjx+eBQUohZNG9A7DtO/O5ukSbxuaNGXMjHicgwy6rQQ==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-compose-refs": "1.1.2",
        "@radix-ui/react-use-layout-effect": "1.1.1"
      },
      "peerDependencies": {
        "@types/react": "*",
        "@types/react-dom": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        },
        "@types/react-dom": {
          "optional": true
        }
      }
    },
    "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-primitive": {
      "version": "2.1.3",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
      "integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-slot": "1.2.3"
      },
      "peerDependencies": {
        "@types/react": "*",
        "@types/react-dom": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        },
        "@types/react-dom": {
          "optional": true
        }
      }
    },
@@ -3137,12 +3255,35 @@
      }
    },
    "node_modules/@radix-ui/react-visually-hidden": {
      "version": "1.2.2",
      "version": "1.2.3",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-visually-hidden/-/react-visually-hidden-1.2.2.tgz",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-visually-hidden/-/react-visually-hidden-1.2.3.tgz",
      "integrity": "sha512-ORCmRUbNiZIv6uV5mhFrhsIKw4UX/N3syZtyqvry61tbGm4JlgQuSn0hk5TwCARsCjkcnuRkSdCE3xfb+ADHew==",
      "integrity": "sha512-pzJq12tEaaIhqjbzpCuv/OypJY/BPavOofm+dbab+MHLajy277+1lLm6JFcGgF5eskJ6mquGirhXY2GD/8u8Ug==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-primitive": "2.1.2"
        "@radix-ui/react-primitive": "2.1.3"
      },
      "peerDependencies": {
        "@types/react": "*",
        "@types/react-dom": "*",
        "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
        "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
      },
      "peerDependenciesMeta": {
        "@types/react": {
          "optional": true
        },
        "@types/react-dom": {
          "optional": true
        }
      }
    },
    "node_modules/@radix-ui/react-visually-hidden/node_modules/@radix-ui/react-primitive": {
      "version": "2.1.3",
      "resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
      "integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
      "license": "MIT",
      "dependencies": {
        "@radix-ui/react-slot": "1.2.3"
      },
      "peerDependencies": {
        "@types/react": "*",
@@ -3910,12 +4051,12 @@
      "license": "MIT"
    },
    "node_modules/@types/node": {
      "version": "20.17.47",
      "version": "24.3.0",
      "resolved": "https://registry.npmjs.org/@types/node/-/node-20.17.47.tgz",
      "resolved": "https://registry.npmjs.org/@types/node/-/node-24.3.0.tgz",
      "integrity": "sha512-3dLX0Upo1v7RvUimvxLeXqwrfyKxUINk0EAM83swP2mlSUcwV73sZy8XhNz8bcZ3VbsfQyC/y6jRdL5tgCNpDQ==",
      "integrity": "sha512-aPTXCrfwnDLj4VvXrm+UUCQjNEvJgNA8s5F1cvwQU+3KNltTOkBm1j30uNLyqqPNe7gE3KFzImYoZEfLhp4Yow==",
      "license": "MIT",
      "dependencies": {
        "undici-types": "~6.19.2"
        "undici-types": "~7.10.0"
      }
    },
    "node_modules/@types/node-fetch": {
@@ -6433,13 +6574,13 @@
      }
    },
    "node_modules/eslint-config-next": {
      "version": "15.3.2",
      "version": "15.5.2",
      "resolved": "https://registry.npmjs.org/eslint-config-next/-/eslint-config-next-15.3.2.tgz",
      "resolved": "https://registry.npmjs.org/eslint-config-next/-/eslint-config-next-15.5.2.tgz",
      "integrity": "sha512-FerU4DYccO4FgeYFFglz0SnaKRe1ejXQrDb8kWUkTAg036YWi+jUsgg4sIGNCDhAsDITsZaL4MzBWKB6f4G1Dg==",
      "integrity": "sha512-3hPZghsLupMxxZ2ggjIIrat/bPniM2yRpsVPVM40rp8ZMzKWOJp2CGWn7+EzoV2ddkUr5fxNfHpF+wU1hGt/3g==",
      "dev": true,
      "license": "MIT",
      "dependencies": {
        "@next/eslint-plugin-next": "15.3.2",
        "@next/eslint-plugin-next": "15.5.2",
        "@rushstack/eslint-patch": "^1.10.3",
        "@typescript-eslint/eslint-plugin": "^5.4.2 || ^6.0.0 || ^7.0.0 || ^8.0.0",
        "@typescript-eslint/parser": "^5.4.2 || ^6.0.0 || ^7.0.0 || ^8.0.0",
@@ -7268,13 +7409,13 @@
      }
    },
    "node_modules/framer-motion": {
      "version": "11.18.2",
      "version": "12.23.12",
      "resolved": "https://registry.npmjs.org/framer-motion/-/framer-motion-11.18.2.tgz",
      "resolved": "https://registry.npmjs.org/framer-motion/-/framer-motion-12.23.12.tgz",
      "integrity": "sha512-5F5Och7wrvtLVElIpclDT0CBzMVg3dL22B64aZwHtsIY8RB4mXICLrkajK4G9R+ieSAGcgrLeae2SeUTg2pr6w==",
      "integrity": "sha512-6e78rdVtnBvlEVgu6eFEAgG9v3wLnYEboM8I5O5EXvfKC8gxGQB8wXJdhkMy10iVcn05jl6CNw7/HTsTCfwcWg==",
      "license": "MIT",
      "dependencies": {
        "motion-dom": "^11.18.1",
        "motion-dom": "^12.23.12",
        "motion-utils": "^11.18.1",
        "motion-utils": "^12.23.6",
        "tslib": "^2.4.0"
      },
      "peerDependencies": {
@@ -10006,9 +10147,9 @@
      "license": "MIT"
    },
    "node_modules/llama-stack-client": {
      "version": "0.2.19",
      "version": "0.2.20",
      "resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.19.tgz",
      "resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.20.tgz",
      "integrity": "sha512-sDuAhUdEGlERZ3jlMUzPXcQTgMv/pGbDrPX0ifbE5S+gr7Q+7ohuQYrIXe+hXgIipFjq+y4b2c5laZ76tmAyEA==",
      "integrity": "sha512-1vD5nizTX5JEW8TADxKgy/P1W8YZoPSpdnmfxbdYbWgpQ3BWtbvLS6jmDk7VwVA5fRC4895VfHsRDfS1liHarw==",
      "license": "MIT",
      "dependencies": {
        "@types/node": "^18.11.18",
@@ -11184,18 +11325,18 @@
      }
    },
    "node_modules/motion-dom": {
      "version": "11.18.1",
      "version": "12.23.12",
      "resolved": "https://registry.npmjs.org/motion-dom/-/motion-dom-11.18.1.tgz",
      "resolved": "https://registry.npmjs.org/motion-dom/-/motion-dom-12.23.12.tgz",
      "integrity": "sha512-g76KvA001z+atjfxczdRtw/RXOM3OMSdd1f4DL77qCTF/+avrRJiawSG4yDibEQ215sr9kpinSlX2pCTJ9zbhw==",
      "integrity": "sha512-RcR4fvMCTESQBD/uKQe49D5RUeDOokkGRmz4ceaJKDBgHYtZtntC/s2vLvY38gqGaytinij/yi3hMcWVcEF5Kw==",
      "license": "MIT",
      "dependencies": {
        "motion-utils": "^11.18.1"
        "motion-utils": "^12.23.6"
      }
    },
    "node_modules/motion-utils": {
      "version": "11.18.1",
      "version": "12.23.6",
      "resolved": "https://registry.npmjs.org/motion-utils/-/motion-utils-11.18.1.tgz",
      "resolved": "https://registry.npmjs.org/motion-utils/-/motion-utils-12.23.6.tgz",
      "integrity": "sha512-49Kt+HKjtbJKLtgO/LKj9Ld+6vw9BjH5d9sc40R/kVyH8GLAXgT42M2NnuPcJNuA3s9ZfZBUcwIgpmZWGEE+hA==",
      "integrity": "sha512-eAWoPgr4eFEOFfg2WjIsMoqJTW6Z8MTUCgn/GZ3VRpClWBdnbjryiA3ZSNLyxCTmCQx4RmYX6jX1iWHbenUPNQ==",
      "license": "MIT"
    },
    "node_modules/ms": {
@@ -12083,9 +12224,9 @@
      }
    },
    "node_modules/prettier": {
      "version": "3.5.3",
      "version": "3.6.2",
      "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.5.3.tgz",
      "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.6.2.tgz",
      "integrity": "sha512-QQtaxnoDJeAkDvDKWCLiwIXkTgRhwYDEQCghU9Z6q03iyek/rxRh/2lC3HB7P8sWT2xC/y5JDctPLBIGzHKbhw==",
      "integrity": "sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ==",
      "dev": true,
      "license": "MIT",
      "bin": {
@@ -13986,9 +14127,9 @@
      }
    },
    "node_modules/undici-types": {
      "version": "6.19.8",
      "version": "7.10.0",
      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.19.8.tgz",
      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.10.0.tgz",
      "integrity": "sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==",
      "integrity": "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag==",
      "license": "MIT"
    },
    "node_modules/unified": {
@@ -19,11 +19,11 @@
    "@radix-ui/react-select": "^2.2.5",
    "@radix-ui/react-separator": "^1.1.7",
    "@radix-ui/react-slot": "^1.2.3",
    "@radix-ui/react-tooltip": "^1.2.6",
    "@radix-ui/react-tooltip": "^1.2.8",
    "class-variance-authority": "^0.7.1",
    "clsx": "^2.1.1",
    "framer-motion": "^11.18.2",
    "framer-motion": "^12.23.12",
    "llama-stack-client": "^0.2.19",
    "llama-stack-client": "^0.2.20",
    "lucide-react": "^0.510.0",
    "next": "15.3.3",
    "next-auth": "^4.24.11",
@@ -44,16 +44,16 @@
    "@testing-library/jest-dom": "^6.8.0",
    "@testing-library/react": "^16.3.0",
    "@types/jest": "^29.5.14",
    "@types/node": "^20",
    "@types/node": "^24",
    "@types/react": "^19",
    "@types/react-dom": "^19",
    "eslint": "^9",
    "eslint-config-next": "15.3.2",
    "eslint-config-next": "15.5.2",
    "eslint-config-prettier": "^10.1.8",
    "eslint-plugin-prettier": "^5.5.4",
    "jest": "^29.7.0",
    "jest-environment-jsdom": "^29.7.0",
    "prettier": "3.5.3",
    "prettier": "3.6.2",
    "tailwindcss": "^4",
    "ts-node": "^10.9.2",
    "tw-animate-css": "^1.2.9",
@@ -7,7 +7,7 @@ required-version = ">=0.7.0"

[project]
name = "llama_stack"
version = "0.2.19"
version = "0.2.20"
authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
description = "Llama Stack"
readme = "README.md"
@@ -31,9 +31,8 @@ dependencies = [
    "huggingface-hub>=0.34.0,<1.0",
    "jinja2>=3.1.6",
    "jsonschema",
    "llama-stack-client>=0.2.19",
    "llama-stack-client>=0.2.20",
    "llama-api-client>=0.1.2",
    "openai>=1.99.6,<1.100.0",
    "openai>=1.99.6",
    "prompt-toolkit",
    "python-dotenv",
    "python-jose[cryptography]",
@@ -56,7 +55,7 @@ dependencies = [
ui = [
    "streamlit",
    "pandas",
    "llama-stack-client>=0.2.19",
    "llama-stack-client>=0.2.20",
    "streamlit-option-menu",
]

@@ -84,6 +83,7 @@ unit = [
    "openai",
    "aiosqlite",
    "aiohttp",
    "psycopg2-binary>=2.9.0",
    "pypdf",
    "mcp",
    "chardet",
@@ -92,7 +92,7 @@ unit = [
    "sqlalchemy[asyncio]>=2.0.41",
    "blobfile",
    "faiss-cpu",
    "pymilvus>=2.5.12",
    "pymilvus>=2.6.1",
    "milvus-lite>=2.5.0",
    "litellm",
    "together",
@@ -105,12 +105,13 @@ unit = [
# separately. If you are using "uv" to execute your tests, you can use the "--group" flag to specify extra
# dependencies.
test = [
    "openai",
    "openai>=1.100.0", # for expires_after support
    "aiosqlite",
    "aiohttp",
    "torch>=2.6.0",
    "torchvision>=0.21.0",
    "chardet",
    "psycopg2-binary>=2.9.0",
    "pypdf",
    "mcp",
    "datasets",
@@ -119,7 +120,7 @@ test = [
    "sqlalchemy",
    "sqlalchemy[asyncio]>=2.0.41",
    "requests",
    "pymilvus>=2.5.12",
    "pymilvus>=2.6.1",
    "milvus-lite>=2.5.0",
    "weaviate-client>=4.16.4",
]
@@ -144,7 +145,7 @@ docs = [
]
codegen = ["rich", "pydantic", "jinja2>=3.1.6"]
benchmark = [
    "locust>=2.37.14",
    "locust>=2.39.1",
]

[project.urls]
@@ -15,7 +15,7 @@ set -euo pipefail
BRANCH=""
TEST_SUBDIRS=""
TEST_PROVIDER="ollama"
RUN_VISION_TESTS=false
TEST_SUITE="base"
TEST_PATTERN=""

# Help function
@@ -27,9 +27,9 @@ Trigger the integration test recording workflow remotely. This way you do not ne

OPTIONS:
    -b, --branch BRANCH            Branch to run the workflow on (defaults to current branch)
    -s, --test-subdirs DIRS        Comma-separated list of test subdirectories to run (REQUIRED)
    -p, --test-provider PROVIDER   Test provider to use: vllm or ollama (default: ollama)
    -v, --run-vision-tests         Include vision tests in the recording
    -t, --test-suite SUITE         Test suite to use: base, responses, vision, etc. (default: base)
    -s, --test-subdirs DIRS        Comma-separated list of test subdirectories to run (overrides suite)
    -k, --test-pattern PATTERN     Regex pattern to pass to pytest -k
    -h, --help                     Show this help message

@@ -38,7 +38,7 @@ EXAMPLES:
    $0 --test-subdirs "agents"

    # Record tests for specific branch with vision tests
    $0 -b my-feature-branch --test-subdirs "inference" --run-vision-tests
    $0 -b my-feature-branch --test-suite vision

    # Record multiple test subdirectories with specific provider
    $0 --test-subdirs "agents,inference" --test-provider vllm
@@ -71,9 +71,9 @@ while [[ $# -gt 0 ]]; do
            TEST_PROVIDER="$2"
            shift 2
            ;;
        -v|--run-vision-tests)
        -t|--test-suite)
            RUN_VISION_TESTS=true
            TEST_SUITE="$2"
            shift
            shift 2
            ;;
        -k|--test-pattern)
            TEST_PATTERN="$2"
@@ -92,11 +92,11 @@ while [[ $# -gt 0 ]]; do
done

# Validate required parameters
if [[ -z "$TEST_SUBDIRS" ]]; then
if [[ -z "$TEST_SUBDIRS" && -z "$TEST_SUITE" ]]; then
    echo "Error: --test-subdirs is required"
    echo "Error: --test-subdirs or --test-suite is required"
    echo "Please specify which test subdirectories to run, e.g.:"
    echo "Please specify which test subdirectories to run or test suite to use, e.g.:"
    echo "  $0 --test-subdirs \"agents,inference\""
    echo "  $0 --test-subdirs \"inference\" --run-vision-tests"
    echo "  $0 --test-suite vision"
    echo ""
    exit 1
fi
@@ -239,17 +239,19 @@ echo "Triggering integration test recording workflow..."
echo "Branch: $BRANCH"
echo "Test provider: $TEST_PROVIDER"
echo "Test subdirs: $TEST_SUBDIRS"
echo "Run vision tests: $RUN_VISION_TESTS"
echo "Test suite: $TEST_SUITE"
echo "Test pattern: ${TEST_PATTERN:-"(none)"}"
echo ""

# Prepare inputs for gh workflow run
INPUTS="-f test-subdirs='$TEST_SUBDIRS'"
if [[ -n "$TEST_SUBDIRS" ]]; then
    INPUTS="-f test-subdirs='$TEST_SUBDIRS'"
fi
if [[ -n "$TEST_PROVIDER" ]]; then
    INPUTS="$INPUTS -f test-provider='$TEST_PROVIDER'"
fi
if [[ "$RUN_VISION_TESTS" == "true" ]]; then
if [[ -n "$TEST_SUITE" ]]; then
    INPUTS="$INPUTS -f run-vision-tests=true"
    INPUTS="$INPUTS -f test-suite='$TEST_SUITE'"
fi
if [[ -n "$TEST_PATTERN" ]]; then
    INPUTS="$INPUTS -f test-pattern='$TEST_PATTERN'"
@ -16,7 +16,7 @@ STACK_CONFIG=""
|
||||||
PROVIDER=""
|
PROVIDER=""
|
||||||
TEST_SUBDIRS=""
|
TEST_SUBDIRS=""
|
||||||
TEST_PATTERN=""
|
TEST_PATTERN=""
|
||||||
RUN_VISION_TESTS="false"
|
TEST_SUITE="base"
|
||||||
INFERENCE_MODE="replay"
|
INFERENCE_MODE="replay"
|
||||||
EXTRA_PARAMS=""
|
EXTRA_PARAMS=""
|
||||||
|
|
||||||
|
@ -28,12 +28,16 @@ Usage: $0 [OPTIONS]
|
||||||
Options:
|
Options:
|
||||||
--stack-config STRING Stack configuration to use (required)
|
--stack-config STRING Stack configuration to use (required)
|
||||||
--provider STRING Provider to use (ollama, vllm, etc.) (required)
|
--provider STRING Provider to use (ollama, vllm, etc.) (required)
|
||||||
--test-subdirs STRING Comma-separated list of test subdirectories to run (default: 'inference')
|
--test-suite STRING Comma-separated list of test suites to run (default: 'base')
|
||||||
--run-vision-tests Run vision tests instead of regular tests
|
|
||||||
--inference-mode STRING Inference mode: record or replay (default: replay)
|
--inference-mode STRING Inference mode: record or replay (default: replay)
|
||||||
|
--test-subdirs STRING Comma-separated list of test subdirectories to run (overrides suite)
|
||||||
--test-pattern STRING Regex pattern to pass to pytest -k
|
--test-pattern STRING Regex pattern to pass to pytest -k
|
||||||
--help Show this help message
|
--help Show this help message
|
||||||
|
|
||||||
|
Suites are defined in tests/integration/suites.py. They are used to narrow the collection of tests and provide default model options.
|
||||||
|
|
||||||
|
You can also specify subdirectories (of tests/integration) to select tests from, which will override the suite.
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
# Basic inference tests with ollama
|
# Basic inference tests with ollama
|
||||||
$0 --stack-config server:ci-tests --provider ollama
|
$0 --stack-config server:ci-tests --provider ollama
|
||||||
|
@ -42,7 +46,7 @@ Examples:
|
||||||
$0 --stack-config server:ci-tests --provider vllm --test-subdirs 'inference,agents'
|
$0 --stack-config server:ci-tests --provider vllm --test-subdirs 'inference,agents'
|
||||||
|
|
||||||
# Vision tests with ollama
|
# Vision tests with ollama
|
||||||
$0 --stack-config server:ci-tests --provider ollama --run-vision-tests
|
$0 --stack-config server:ci-tests --provider ollama --test-suite vision
|
||||||
|
|
||||||
# Record mode for updating test recordings
|
# Record mode for updating test recordings
|
||||||
$0 --stack-config server:ci-tests --provider ollama --inference-mode record
|
$0 --stack-config server:ci-tests --provider ollama --inference-mode record
|
||||||
|
@ -64,9 +68,9 @@ while [[ $# -gt 0 ]]; do
|
||||||
TEST_SUBDIRS="$2"
|
TEST_SUBDIRS="$2"
|
||||||
shift 2
|
shift 2
|
||||||
;;
|
;;
|
||||||
--run-vision-tests)
|
--test-suite)
|
||||||
RUN_VISION_TESTS="true"
|
TEST_SUITE="$2"
|
||||||
shift
|
shift 2
|
||||||
;;
|
;;
|
||||||
--inference-mode)
|
--inference-mode)
|
||||||
INFERENCE_MODE="$2"
|
INFERENCE_MODE="$2"
|
||||||
|
@ -92,22 +96,25 @@ done
|
||||||
# Validate required parameters
|
# Validate required parameters
|
||||||
if [[ -z "$STACK_CONFIG" ]]; then
|
if [[ -z "$STACK_CONFIG" ]]; then
|
||||||
echo "Error: --stack-config is required"
|
echo "Error: --stack-config is required"
|
||||||
usage
|
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ -z "$PROVIDER" ]]; then
|
if [[ -z "$PROVIDER" ]]; then
|
||||||
echo "Error: --provider is required"
|
echo "Error: --provider is required"
|
||||||
usage
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -z "$TEST_SUITE" && -z "$TEST_SUBDIRS" ]]; then
|
||||||
|
echo "Error: --test-suite or --test-subdirs is required"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== Llama Stack Integration Test Runner ==="
|
echo "=== Llama Stack Integration Test Runner ==="
|
||||||
echo "Stack Config: $STACK_CONFIG"
|
echo "Stack Config: $STACK_CONFIG"
|
||||||
echo "Provider: $PROVIDER"
|
echo "Provider: $PROVIDER"
|
||||||
echo "Test Subdirs: $TEST_SUBDIRS"
|
|
||||||
echo "Vision Tests: $RUN_VISION_TESTS"
|
|
||||||
echo "Inference Mode: $INFERENCE_MODE"
|
echo "Inference Mode: $INFERENCE_MODE"
|
||||||
|
echo "Test Suite: $TEST_SUITE"
|
||||||
|
echo "Test Subdirs: $TEST_SUBDIRS"
|
||||||
echo "Test Pattern: $TEST_PATTERN"
|
echo "Test Pattern: $TEST_PATTERN"
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
|
@ -140,13 +147,6 @@ THIS_DIR=$(dirname "$0")
|
||||||
ROOT_DIR="$THIS_DIR/.."
|
ROOT_DIR="$THIS_DIR/.."
|
||||||
cd $ROOT_DIR
|
cd $ROOT_DIR
|
||||||
|
|
||||||
# Set recording directory
|
|
||||||
if [[ "$RUN_VISION_TESTS" == "true" ]]; then
|
|
||||||
export LLAMA_STACK_TEST_RECORDING_DIR="tests/integration/recordings/vision"
|
|
||||||
else
|
|
||||||
export LLAMA_STACK_TEST_RECORDING_DIR="tests/integration/recordings"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# check if "llama" and "pytest" are available. this script does not use `uv run` given
|
# check if "llama" and "pytest" are available. this script does not use `uv run` given
|
||||||
# it can be used in a pre-release environment where we have not been able to tell
|
# it can be used in a pre-release environment where we have not been able to tell
|
||||||
# uv about pre-release dependencies properly (yet).
|
# uv about pre-release dependencies properly (yet).
|
||||||
|
@@ -201,84 +201,46 @@ if [[ -n "$TEST_PATTERN" ]]; then
     PYTEST_PATTERN="${PYTEST_PATTERN} and $TEST_PATTERN"
 fi

-# Run vision tests if specified
-if [[ "$RUN_VISION_TESTS" == "true" ]]; then
-    echo "Running vision tests..."
-    set +e
-    pytest -s -v tests/integration/inference/test_vision_inference.py \
-        --stack-config="$STACK_CONFIG" \
-        -k "$PYTEST_PATTERN" \
-        --vision-model=ollama/llama3.2-vision:11b \
-        --embedding-model=sentence-transformers/all-MiniLM-L6-v2 \
-        --color=yes $EXTRA_PARAMS \
-        --capture=tee-sys
-    exit_code=$?
-    set -e
-
-    if [ $exit_code -eq 0 ]; then
-        echo "✅ Vision tests completed successfully"
-    elif [ $exit_code -eq 5 ]; then
-        echo "⚠️ No vision tests collected (pattern matched no tests)"
-    else
-        echo "❌ Vision tests failed"
-        exit 1
-    fi
-    exit 0
-fi
-
-# Run regular tests
-if [[ -z "$TEST_SUBDIRS" ]]; then
-    TEST_SUBDIRS=$(find tests/integration -maxdepth 1 -mindepth 1 -type d |
-        sed 's|tests/integration/||' |
-        grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" |
-        sort)
-fi
 echo "Test subdirs to run: $TEST_SUBDIRS"

-# Collect all test files for the specified test types
-TEST_FILES=""
-for test_subdir in $(echo "$TEST_SUBDIRS" | tr ',' '\n'); do
-    # Skip certain test types for vllm provider
-    if [[ "$PROVIDER" == "vllm" ]]; then
-        if [[ "$test_subdir" == "safety" ]] || [[ "$test_subdir" == "post_training" ]] || [[ "$test_subdir" == "tool_runtime" ]]; then
-            echo "Skipping $test_subdir for vllm provider"
-            continue
-        fi
-    fi
-
-    if [[ "$STACK_CONFIG" != *"server:"* ]] && [[ "$test_subdir" == "batches" ]]; then
-        echo "Skipping $test_subdir for library client until types are supported"
-        continue
-    fi
-
-    if [[ -d "tests/integration/$test_subdir" ]]; then
-        # Find all Python test files in this directory
-        test_files=$(find tests/integration/$test_subdir -name "test_*.py" -o -name "*_test.py")
-        if [[ -n "$test_files" ]]; then
-            TEST_FILES="$TEST_FILES $test_files"
-            echo "Added test files from $test_subdir: $(echo $test_files | wc -w) files"
-        fi
-    else
-        echo "Warning: Directory tests/integration/$test_subdir does not exist"
-    fi
-done
-
-if [[ -z "$TEST_FILES" ]]; then
-    echo "No test files found for the specified test types"
-    exit 1
-fi
-
-echo ""
-echo "=== Running all collected tests in a single pytest command ==="
-echo "Total test files: $(echo $TEST_FILES | wc -w)"
+if [[ -n "$TEST_SUBDIRS" ]]; then
+    # Collect all test files for the specified test types
+    TEST_FILES=""
+    for test_subdir in $(echo "$TEST_SUBDIRS" | tr ',' '\n'); do
+        if [[ -d "tests/integration/$test_subdir" ]]; then
+            # Find all Python test files in this directory
+            test_files=$(find tests/integration/$test_subdir -name "test_*.py" -o -name "*_test.py")
+            if [[ -n "$test_files" ]]; then
+                TEST_FILES="$TEST_FILES $test_files"
+                echo "Added test files from $test_subdir: $(echo $test_files | wc -w) files"
+            fi
+        else
+            echo "Warning: Directory tests/integration/$test_subdir does not exist"
+        fi
+    done
+
+    if [[ -z "$TEST_FILES" ]]; then
+        echo "No test files found for the specified test types"
+        exit 1
+    fi
+
+    echo ""
+    echo "=== Running all collected tests in a single pytest command ==="
+    echo "Total test files: $(echo $TEST_FILES | wc -w)"
+
+    PYTEST_TARGET="$TEST_FILES"
+    EXTRA_PARAMS="$EXTRA_PARAMS --text-model=$TEXT_MODEL --embedding-model=sentence-transformers/all-MiniLM-L6-v2"
+else
+    PYTEST_TARGET="tests/integration/"
+    EXTRA_PARAMS="$EXTRA_PARAMS --suite=$TEST_SUITE"
+fi

 set +e
-pytest -s -v $TEST_FILES \
+pytest -s -v $PYTEST_TARGET \
     --stack-config="$STACK_CONFIG" \
     -k "$PYTEST_PATTERN" \
-    --text-model="$TEXT_MODEL" \
-    --embedding-model=sentence-transformers/all-MiniLM-L6-v2 \
-    --color=yes $EXTRA_PARAMS \
+    $EXTRA_PARAMS \
+    --color=yes \
     --capture=tee-sys
 exit_code=$?
 set -e
@@ -298,5 +260,18 @@ echo "=== System Resources After Tests ==="
 free -h 2>/dev/null || echo "free command not available"
 df -h

+# stop server
+if [[ "$STACK_CONFIG" == *"server:"* ]]; then
+    echo "Stopping Llama Stack Server..."
+    pids=$(lsof -i :8321 | awk 'NR>1 {print $2}')
+    if [[ -n "$pids" ]]; then
+        echo "Killing Llama Stack Server processes: $pids"
+        kill -9 $pids
+    else
+        echo "No Llama Stack Server processes found ?!"
+    fi
+    echo "Llama Stack Server stopped"
+fi
+
 echo ""
 echo "=== Integration Tests Complete ==="
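Taken together, the script now has two entry paths: an explicit `--test-subdirs` list still builds a file list by hand, while an empty list hands pytest the whole `tests/integration/` tree plus `--suite=$TEST_SUITE`. A minimal sketch of the two invocations, assuming the script's flag names mirror the variables above (`--test-subdirs`, `--test-suite`, `--inference-mode`, `--stack-config`):

```bash
# Subdir path: the script collects test_*.py files itself
./scripts/integration-tests.sh --stack-config server:starter \
    --test-subdirs "agents,inference" --inference-mode replay

# Suite path: pytest receives tests/integration/ plus --suite=vision
./scripts/integration-tests.sh --stack-config server:starter \
    --test-suite vision --inference-mode replay
```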
@@ -38,26 +38,15 @@ For running integration tests, you must provide a few things:

 - a distribution name (e.g., `starter`) or a path to a `run.yaml` file
 - a comma-separated list of api=provider pairs, e.g. `inference=fireworks,safety=llama-guard,agents=meta-reference`. This is most useful for testing a single API surface.
-
-- Whether you are using replay or live mode for inference. This is specified with the LLAMA_STACK_TEST_INFERENCE_MODE environment variable. The default mode currently is "live" -- that is certainly surprising, but we will fix this soon.
-
 - Any API keys you need to use should be set in the environment, or can be passed in with the --env option.

 You can run the integration tests in replay mode with:
 ```bash
 # Run all tests with existing recordings
-LLAMA_STACK_TEST_INFERENCE_MODE=replay \
-  LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
 uv run --group test \
   pytest -sv tests/integration/ --stack-config=starter
 ```

-If you don't specify LLAMA_STACK_TEST_INFERENCE_MODE, by default it will be in "live" mode -- that is, it will make real API calls.
-
-```bash
-# Test against live APIs
-FIREWORKS_API_KEY=your_key pytest -sv tests/integration/inference --stack-config=starter
-```
-
 ### Re-recording tests

 #### Local Re-recording (Manual Setup Required)

@@ -66,7 +55,6 @@ If you want to re-record tests locally, you can do so with:

 ```bash
 LLAMA_STACK_TEST_INFERENCE_MODE=record \
-  LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
 uv run --group test \
   pytest -sv tests/integration/ --stack-config=starter -k "<appropriate test name>"
 ```

@@ -89,7 +77,7 @@ You must be careful when re-recording. CI workflows assume a specific setup for
 ./scripts/github/schedule-record-workflow.sh --test-subdirs "agents,inference"

 # Record with vision tests enabled
-./scripts/github/schedule-record-workflow.sh --test-subdirs "inference" --run-vision-tests
+./scripts/github/schedule-record-workflow.sh --test-suite vision

 # Record with specific provider
 ./scripts/github/schedule-record-workflow.sh --test-subdirs "agents" --test-provider vllm
@@ -42,6 +42,27 @@ Model parameters can be influenced by the following options:
 Each of these are comma-separated lists and can be used to generate multiple parameter combinations. Note that tests will be skipped
 if no model is specified.

+### Suites (fast selection + sane defaults)
+
+- `--suite`: comma-separated list of named suites that both narrow which tests are collected and prefill common model options (unless you pass them explicitly).
+- Available suites:
+  - `responses`: collects tests under `tests/integration/responses`; this is a separate suite because it needs a strong tool-calling model.
+  - `vision`: collects only `tests/integration/inference/test_vision_inference.py`; defaults `--vision-model=ollama/llama3.2-vision:11b`, `--embedding-model=sentence-transformers/all-MiniLM-L6-v2`.
+- Explicit flags always win. For example, `--suite=responses --text-model=<X>` overrides the suite's text model.
+
+Examples:
+
+```bash
+# Fast responses run with defaults
+pytest -s -v tests/integration --stack-config=server:starter --suite=responses
+
+# Fast single-file vision run with defaults
+pytest -s -v tests/integration --stack-config=server:starter --suite=vision
+
+# Combine suites and override a default
+pytest -s -v tests/integration --stack-config=server:starter --suite=responses,vision --embedding-model=text-embedding-3-small
+```
+
 ## Examples

 ### Testing against a Server
@@ -98,29 +119,25 @@ sentence-transformers/all-MiniLM-L6-v2

 The testing system supports three modes controlled by environment variables:

-### LIVE Mode (Default)
-Tests make real API calls:
+### REPLAY Mode (Default)
+Uses cached responses instead of making API calls:
 ```bash
-LLAMA_STACK_TEST_INFERENCE_MODE=live pytest tests/integration/
+pytest tests/integration/
 ```

 ### RECORD Mode
 Captures API interactions for later replay:
 ```bash
 LLAMA_STACK_TEST_INFERENCE_MODE=record \
-  LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
   pytest tests/integration/inference/test_new_feature.py
 ```

-### REPLAY Mode
-Uses cached responses instead of making API calls:
+### LIVE Mode
+Tests make real API calls (responses are not recorded):
 ```bash
-LLAMA_STACK_TEST_INFERENCE_MODE=replay \
-  LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
-  pytest tests/integration/
+LLAMA_STACK_TEST_INFERENCE_MODE=live pytest tests/integration/
 ```

-Note that right now you must specify the recording directory. This is because different tests use different recording directories and we don't (yet) have a fool-proof way to map a test to a recording directory. We are working on this.
+By default, the recording directory is `tests/integration/recordings`. You can override this by setting the `LLAMA_STACK_TEST_RECORDING_DIR` environment variable.
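For example, to keep scratch recordings out of the checked-in directory while iterating (the path below is illustrative):

```bash
# Record into a throwaway directory instead of tests/integration/recordings
LLAMA_STACK_TEST_INFERENCE_MODE=record \
  LLAMA_STACK_TEST_RECORDING_DIR=/tmp/scratch-recordings \
  pytest tests/integration/inference/test_new_feature.py
```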

 ## Managing Recordings

@@ -146,7 +163,6 @@ See the [main testing guide](../README.md#remote-re-recording-recommended) for f
 ```bash
 # Re-record specific tests
 LLAMA_STACK_TEST_INFERENCE_MODE=record \
-  LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
   pytest -s -v --stack-config=server:starter tests/integration/inference/test_modified.py
 ```
@@ -268,3 +268,58 @@ class TestBatchesIntegration:

         deleted_error_file = openai_client.files.delete(final_batch.error_file_id)
         assert deleted_error_file.deleted, f"Error file {final_batch.error_file_id} was not deleted successfully"
+
+    def test_batch_e2e_completions(self, openai_client, batch_helper, text_model_id):
+        """Run an end-to-end batch with a single successful text completion request."""
+        request_body = {"model": text_model_id, "prompt": "Say completions", "max_tokens": 20}
+
+        batch_requests = [
+            {
+                "custom_id": "success-1",
+                "method": "POST",
+                "url": "/v1/completions",
+                "body": request_body,
+            }
+        ]
+
+        with batch_helper.create_file(batch_requests) as uploaded_file:
+            batch = openai_client.batches.create(
+                input_file_id=uploaded_file.id,
+                endpoint="/v1/completions",
+                completion_window="24h",
+                metadata={"test": "e2e_completions_success"},
+            )
+
+            final_batch = batch_helper.wait_for(
+                batch.id,
+                max_wait_time=3 * 60,
+                expected_statuses={"completed"},
+                timeout_action="skip",
+            )
+
+        assert final_batch.status == "completed"
+        assert final_batch.request_counts is not None
+        assert final_batch.request_counts.total == 1
+        assert final_batch.request_counts.completed == 1
+        assert final_batch.output_file_id is not None
+
+        output_content = openai_client.files.content(final_batch.output_file_id)
+        if isinstance(output_content, str):
+            output_text = output_content
+        else:
+            output_text = output_content.content.decode("utf-8")
+
+        output_lines = output_text.strip().split("\n")
+        assert len(output_lines) == 1
+
+        result = json.loads(output_lines[0])
+        assert result["custom_id"] == "success-1"
+        assert "response" in result
+        assert result["response"]["status_code"] == 200
+
+        deleted_output_file = openai_client.files.delete(final_batch.output_file_id)
+        assert deleted_output_file.deleted
+
+        if final_batch.error_file_id is not None:
+            deleted_error_file = openai_client.files.delete(final_batch.error_file_id)
+            assert deleted_error_file.deleted
@@ -6,15 +6,17 @@
 import inspect
 import itertools
 import os
-import platform
 import textwrap
 import time
+from pathlib import Path

 import pytest
 from dotenv import load_dotenv

 from llama_stack.log import get_logger

+from .suites import SUITE_DEFINITIONS
+
 logger = get_logger(__name__, category="tests")


@@ -30,6 +32,8 @@ def pytest_runtest_makereport(item, call):
 def pytest_sessionstart(session):
     # stop macOS from complaining about duplicate OpenMP libraries
     os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+    if "LLAMA_STACK_TEST_INFERENCE_MODE" not in os.environ:
+        os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = "replay"


 def pytest_runtest_teardown(item):

@@ -59,9 +63,22 @@ def pytest_configure(config):
             key, value = env_var.split("=", 1)
             os.environ[key] = value

-    if platform.system() == "Darwin":  # Darwin is the system name for macOS
-        os.environ["DISABLE_CODE_SANDBOX"] = "1"
-        logger.info("Setting DISABLE_CODE_SANDBOX=1 for macOS")
+    suites_raw = config.getoption("--suite")
+    suites: list[str] = []
+    if suites_raw:
+        suites = [p.strip() for p in str(suites_raw).split(",") if p.strip()]
+        unknown = [p for p in suites if p not in SUITE_DEFINITIONS]
+        if unknown:
+            raise pytest.UsageError(
+                f"Unknown suite(s): {', '.join(unknown)}. Available: {', '.join(sorted(SUITE_DEFINITIONS.keys()))}"
+            )
+    for suite in suites:
+        suite_def = SUITE_DEFINITIONS.get(suite, {})
+        defaults: dict = suite_def.get("defaults", {})
+        for dest, value in defaults.items():
+            current = getattr(config.option, dest, None)
+            if not current:
+                setattr(config.option, dest, value)


 def pytest_addoption(parser):

@@ -103,16 +120,21 @@ def pytest_addoption(parser):
         default=384,
         help="Output dimensionality of the embedding model to use for testing. Default: 384",
     )
-    parser.addoption(
-        "--record-responses",
-        action="store_true",
-        help="Record new API responses instead of using cached ones.",
-    )
     parser.addoption(
         "--report",
         help="Path where the test report should be written, e.g. --report=/path/to/report.md",
     )

+    available_suites = ", ".join(sorted(SUITE_DEFINITIONS.keys()))
+    suite_help = (
+        "Comma-separated integration test suites to narrow collection and prefill defaults. "
+        "Available: "
+        f"{available_suites}. "
+        "Explicit CLI flags (e.g., --text-model) override suite defaults. "
+        "Examples: --suite=responses or --suite=responses,vision."
+    )
+    parser.addoption("--suite", help=suite_help)


 MODEL_SHORT_IDS = {
     "meta-llama/Llama-3.2-3B-Instruct": "3B",

@@ -195,3 +217,40 @@ def pytest_generate_tests(metafunc):


 pytest_plugins = ["tests.integration.fixtures.common"]
+
+
+def pytest_ignore_collect(path: str, config: pytest.Config) -> bool:
+    """Skip collecting paths outside the selected suite roots for speed."""
+    suites_raw = config.getoption("--suite")
+    if not suites_raw:
+        return False
+
+    names = [p.strip() for p in str(suites_raw).split(",") if p.strip()]
+    roots: list[str] = []
+    for name in names:
+        suite_def = SUITE_DEFINITIONS.get(name)
+        if suite_def:
+            roots.extend(suite_def.get("roots", []))
+    if not roots:
+        return False
+
+    p = Path(str(path)).resolve()
+
+    # Only constrain within tests/integration to avoid ignoring unrelated tests
+    integration_root = (Path(str(config.rootpath)) / "tests" / "integration").resolve()
+    if not p.is_relative_to(integration_root):
+        return False
+
+    for r in roots:
+        rp = (Path(str(config.rootpath)) / r).resolve()
+        if rp.is_file():
+            # Allow the exact file and any ancestor directories so pytest can walk into it.
+            if p == rp:
+                return False
+            if p.is_dir() and rp.is_relative_to(p):
+                return False
+        else:
+            # Allow anything inside an allowed directory
+            if p.is_relative_to(rp):
+                return False
+    return True
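conftest.py only reads two keys from each suite entry: `roots` (used by `pytest_ignore_collect` to filter collection) and `defaults` (option prefills applied in `pytest_configure`). The new `tests/integration/suites.py` module itself is not shown in this diff; a plausible sketch, with values inferred from the README's `responses` and `vision` descriptions, might look like:

```python
# Hypothetical sketch of tests/integration/suites.py -- the real module may
# differ. Only the "roots" and "defaults" keys are consumed by conftest.py.
SUITE_DEFINITIONS: dict[str, dict] = {
    "responses": {
        # needs a strong tool-calling model, hence a dedicated suite
        "roots": ["tests/integration/responses"],
        "defaults": {},
    },
    "vision": {
        # single-file suite with vision/embedding model prefills
        "roots": ["tests/integration/inference/test_vision_inference.py"],
        "defaults": {
            # keys are the argparse "dest" names behind --vision-model and
            # --embedding-model, so setattr(config.option, dest, value) works
            "vision_model": "ollama/llama3.2-vision:11b",
            "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        },
    },
}
```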
@@ -8,6 +8,7 @@ from io import BytesIO
 from unittest.mock import patch

 import pytest
+import requests

 from llama_stack.core.datatypes import User

@@ -79,6 +80,88 @@ def test_openai_client_basic_operations(openai_client):
         pass  # ignore 404


+@pytest.mark.xfail(message="expires_after not available on all providers")
+def test_expires_after(openai_client):
+    """Test uploading a file with expires_after parameter."""
+    client = openai_client
+
+    uploaded_file = None
+    try:
+        with BytesIO(b"expires_after test") as file_buffer:
+            file_buffer.name = "expires_after.txt"
+            uploaded_file = client.files.create(
+                file=file_buffer,
+                purpose="assistants",
+                expires_after={"anchor": "created_at", "seconds": 4545},
+            )
+
+        assert uploaded_file.expires_at is not None
+        assert uploaded_file.expires_at == uploaded_file.created_at + 4545
+
+        listed = client.files.list()
+        ids = [f.id for f in listed.data]
+        assert uploaded_file.id in ids
+
+        retrieved = client.files.retrieve(uploaded_file.id)
+        assert retrieved.id == uploaded_file.id
+
+    finally:
+        if uploaded_file is not None:
+            try:
+                client.files.delete(uploaded_file.id)
+            except Exception:
+                pass
+
+
+@pytest.mark.xfail(message="expires_after not available on all providers")
+def test_expires_after_requests(openai_client):
+    """Upload a file using requests multipart/form-data and bracketed expires_after fields.
+
+    This ensures clients that send form fields like `expires_after[anchor]` and
+    `expires_after[seconds]` are handled by the server.
+    """
+    base_url = f"{openai_client.base_url}files"
+
+    uploaded_id = None
+    try:
+        files = {"file": ("expires_after_with_requests.txt", BytesIO(b"expires_after via requests"))}
+        data = {
+            "purpose": "assistants",
+            "expires_after[anchor]": "created_at",
+            "expires_after[seconds]": "4545",
+        }
+
+        session = requests.Session()
+        request = requests.Request("POST", base_url, files=files, data=data)
+        prepared = session.prepare_request(request)
+        resp = session.send(prepared, timeout=30)
+        resp.raise_for_status()
+        result = resp.json()
+
+        assert result.get("id", "").startswith("file-")
+        uploaded_id = result["id"]
+        assert result.get("created_at") is not None
+        assert result.get("expires_at") == result["created_at"] + 4545
+
+        list_resp = requests.get(base_url, timeout=30)
+        list_resp.raise_for_status()
+        listed = list_resp.json()
+        ids = [f["id"] for f in listed.get("data", [])]
+        assert uploaded_id in ids
+
+        retrieve_resp = requests.get(f"{base_url}/{uploaded_id}", timeout=30)
+        retrieve_resp.raise_for_status()
+        retrieved = retrieve_resp.json()
+        assert retrieved["id"] == uploaded_id
+
+    finally:
+        if uploaded_id:
+            try:
+                requests.delete(f"{base_url}/{uploaded_id}", timeout=30)
+            except Exception:
+                pass
+
+
 @pytest.mark.xfail(message="User isolation broken for current providers, must be fixed.")
 @patch("llama_stack.providers.utils.sqlstore.authorized_sqlstore.get_authenticated_user")
 def test_files_authentication_isolation(mock_get_authenticated_user, llama_stack_client):
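The `test_expires_after_requests` variant above exists because multipart clients often encode nested fields with bracket syntax rather than a JSON object. A rough curl equivalent of what it sends (the test derives its URL from `openai_client.base_url`, so the base URL below is illustrative):

```bash
# Illustrative only: bracketed multipart fields for expires_after
curl -X POST "$BASE_URL/files" \
  -F "file=@expires_after_with_requests.txt" \
  -F "purpose=assistants" \
  -F "expires_after[anchor]=created_at" \
  -F "expires_after[seconds]=4545"
```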
@@ -5,6 +5,8 @@
 # the root directory of this source tree.

+import time
+
 import pytest

 from ..test_cases.test_case import TestCase

@@ -35,6 +37,10 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
         "remote::sambanova",
         "remote::tgi",
         "remote::vertexai",
+        # {"error":{"message":"Unknown request URL: GET /openai/v1/completions. Please check the URL for typos,
+        # or see the docs at https://console.groq.com/docs/","type":"invalid_request_error","code":"unknown_url"}}
+        "remote::groq",
+        "remote::gemini",  # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404
     ):
         pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")

@@ -56,6 +62,21 @@ def skip_if_model_doesnt_support_suffix(client_with_models, model_id):
         pytest.skip(f"Provider {provider.provider_type} doesn't support suffix.")


+def skip_if_doesnt_support_n(client_with_models, model_id):
+    provider = provider_from_model(client_with_models, model_id)
+    if provider.provider_type in (
+        "remote::sambanova",
+        "remote::ollama",
+        # https://console.groq.com/docs/openai#currently-unsupported-openai-features
+        # -> Error code: 400 - {'error': {'message': "'n' : number must be at most 1", 'type': 'invalid_request_error'}}
+        "remote::groq",
+        # Error code: 400 - [{'error': {'code': 400, 'message': 'Only one candidate can be specified in the
+        # current model', 'status': 'INVALID_ARGUMENT'}}]
+        "remote::gemini",
+    ):
+        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support n param.")
+
+
 def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
     provider = provider_from_model(client_with_models, model_id)
     if provider.provider_type in (

@@ -260,10 +281,7 @@ def test_openai_chat_completion_streaming(compat_client, client_with_models, tex
 )
 def test_openai_chat_completion_streaming_with_n(compat_client, client_with_models, text_model_id, test_case):
     skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
-
-    provider = provider_from_model(client_with_models, text_model_id)
-    if provider.provider_type == "remote::ollama":
-        pytest.skip(f"Model {text_model_id} hosted by {provider.provider_type} doesn't support n > 1.")
+    skip_if_doesnt_support_n(client_with_models, text_model_id)

     tc = TestCase(test_case)
     question = tc["question"]

@@ -323,8 +341,15 @@ def test_inference_store(compat_client, client_with_models, text_model_id, strea
     response_id = response.id
     content = response.choices[0].message.content

-    responses = client.chat.completions.list(limit=1000)
-    assert response_id in [r.id for r in responses.data]
+    tries = 0
+    while tries < 10:
+        responses = client.chat.completions.list(limit=1000)
+        if response_id in [r.id for r in responses.data]:
+            break
+        else:
+            tries += 1
+            time.sleep(0.1)
+    assert tries < 10, f"Response {response_id} not found after 1 second"

     retrieved_response = client.chat.completions.retrieve(response_id)
     assert retrieved_response.id == response_id

@@ -388,6 +413,18 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode
     response_id = response.id
     content = response.choices[0].message.content

+    # wait for the response to be stored
+    tries = 0
+    while tries < 10:
+        responses = client.chat.completions.list(limit=1000)
+        if response_id in [r.id for r in responses.data]:
+            break
+        else:
+            tries += 1
+            time.sleep(0.1)
+
+    assert tries < 10, f"Response {response_id} not found after 1 second"
+
     responses = client.chat.completions.list(limit=1000)
     assert response_id in [r.id for r in responses.data]
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
@@ -20,15 +20,15 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama-guard3:1b",
-      "created_at": "2025-08-01T23:12:53.860911Z",
+      "created_at": "2025-09-03T17:37:35.23084Z",
       "done": true,
       "done_reason": "stop",
-      "total_duration": 249137667,
+      "total_duration": 195981375,
-      "load_duration": 152509542,
+      "load_duration": 110522917,
       "prompt_eval_count": 216,
-      "prompt_eval_duration": 71000000,
+      "prompt_eval_duration": 72393958,
       "eval_count": 2,
-      "eval_duration": 24000000,
+      "eval_duration": 11843000,
       "response": "safe",
       "thinking": null,
       "context": null

@@ -21,7 +21,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:18.033900164Z",
+      "created_at": "2025-09-03T17:41:43.950283Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -39,7 +39,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:18.213371151Z",
+      "created_at": "2025-09-03T17:41:43.991122Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -57,7 +57,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:18.387513976Z",
+      "created_at": "2025-09-03T17:41:44.031378Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -75,7 +75,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:18.564344287Z",
+      "created_at": "2025-09-03T17:41:44.073098Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -93,7 +93,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:18.746579415Z",
+      "created_at": "2025-09-03T17:41:44.115961Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -111,7 +111,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:18.923276047Z",
+      "created_at": "2025-09-03T17:41:44.156517Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -129,7 +129,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:19.099961963Z",
+      "created_at": "2025-09-03T17:41:44.197079Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -147,7 +147,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:19.275621884Z",
+      "created_at": "2025-09-03T17:41:44.237565Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -165,7 +165,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:19.452204196Z",
+      "created_at": "2025-09-03T17:41:44.277755Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -183,7 +183,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:19.626937514Z",
+      "created_at": "2025-09-03T17:41:44.318476Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -201,7 +201,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:19.805566767Z",
+      "created_at": "2025-09-03T17:41:44.358628Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -219,7 +219,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:19.985987477Z",
+      "created_at": "2025-09-03T17:41:44.398984Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -237,7 +237,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:20.166458601Z",
+      "created_at": "2025-09-03T17:41:44.439232Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -255,7 +255,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:20.343346795Z",
+      "created_at": "2025-09-03T17:41:44.479478Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -273,7 +273,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:20.525008091Z",
+      "created_at": "2025-09-03T17:41:44.520202Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -291,7 +291,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:20.709087695Z",
+      "created_at": "2025-09-03T17:41:44.560517Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -309,7 +309,7 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:20.887074305Z",
+      "created_at": "2025-09-03T17:41:44.601592Z",
      "done": false,
      "done_reason": null,
      "total_duration": null,
@@ -327,15 +327,15 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama3.2:3b-instruct-fp16",
-      "created_at": "2025-07-31T17:59:21.065244925Z",
+      "created_at": "2025-09-03T17:41:44.642064Z",
       "done": true,
       "done_reason": "stop",
-      "total_duration": 4373531496,
+      "total_duration": 887142667,
-      "load_duration": 44438132,
+      "load_duration": 119331417,
       "prompt_eval_count": 56,
-      "prompt_eval_duration": 1296273199,
+      "prompt_eval_duration": 74294709,
       "eval_count": 18,
-      "eval_duration": 3032321735,
+      "eval_duration": 692842791,
       "response": "",
       "thinking": null,
       "context": null

@@ -20,15 +20,15 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama-guard3:1b",
-      "created_at": "2025-08-01T23:13:57.556416Z",
+      "created_at": "2025-09-03T17:37:47.461886Z",
       "done": true,
       "done_reason": "stop",
-      "total_duration": 432363250,
+      "total_duration": 338927833,
-      "load_duration": 159296417,
+      "load_duration": 100895125,
       "prompt_eval_count": 223,
-      "prompt_eval_duration": 257000000,
+      "prompt_eval_duration": 221583042,
       "eval_count": 2,
-      "eval_duration": 14000000,
+      "eval_duration": 12341416,
       "response": "safe",
       "thinking": null,
       "context": null
@@ -24,7 +24,7 @@
 {
   "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
   "__data__": {
-    "id": "chatcmpl-29",
+    "id": "chatcmpl-414",
     "choices": [
       {
         "delta": {
@@ -39,7 +39,7 @@
         "logprobs": null
       }
     ],
-    "created": 1754090031,
+    "created": 1756921333,
     "model": "llama3.2:3b-instruct-fp16",
     "object": "chat.completion.chunk",
     "service_tier": null,
@@ -50,7 +50,7 @@
 {
   "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
   "__data__": {
-    "id": "chatcmpl-29",
+    "id": "chatcmpl-414",
     "choices": [
       {
         "delta": {
@@ -65,7 +65,7 @@
         "logprobs": null
       }
     ],
-    "created": 1754090031,
+    "created": 1756921333,
     "model": "llama3.2:3b-instruct-fp16",
     "object": "chat.completion.chunk",
     "service_tier": null,
@@ -76,7 +76,7 @@
 {
   "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
   "__data__": {
-    "id": "chatcmpl-29",
+    "id": "chatcmpl-414",
     "choices": [
       {
         "delta": {
@@ -91,7 +91,7 @@
         "logprobs": null
       }
     ],
-    "created": 1754090031,
+    "created": 1756921333,
     "model": "llama3.2:3b-instruct-fp16",
     "object": "chat.completion.chunk",
     "service_tier": null,
@@ -102,7 +102,7 @@
 {
   "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
   "__data__": {
-    "id": "chatcmpl-29",
+    "id": "chatcmpl-414",
     "choices": [
       {
         "delta": {
@@ -117,7 +117,7 @@
         "logprobs": null
       }
     ],
-    "created": 1754090031,
+    "created": 1756921333,
     "model": "llama3.2:3b-instruct-fp16",
     "object": "chat.completion.chunk",
     "service_tier": null,
@@ -128,7 +128,7 @@
 {
   "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
  "__data__": {
-    "id": "chatcmpl-29",
+    "id": "chatcmpl-414",
     "choices": [
       {
         "delta": {
@@ -143,7 +143,7 @@
         "logprobs": null
       }
     ],
-    "created": 1754090031,
+    "created": 1756921334,
     "model": "llama3.2:3b-instruct-fp16",
     "object": "chat.completion.chunk",
     "service_tier": null,
@@ -154,7 +154,7 @@
 {
   "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
   "__data__": {
-    "id": "chatcmpl-29",
+    "id": "chatcmpl-414",
     "choices": [
       {
         "delta": {
@@ -169,7 +169,7 @@
         "logprobs": null
       }
     ],
-    "created": 1754090031,
+    "created": 1756921334,
     "model": "llama3.2:3b-instruct-fp16",
     "object": "chat.completion.chunk",
     "service_tier": null,
@@ -180,7 +180,7 @@
 {
   "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
   "__data__": {
-    "id": "chatcmpl-29",
+    "id": "chatcmpl-414",
     "choices": [
       {
         "delta": {
@@ -195,7 +195,7 @@
         "logprobs": null
       }
     ],
-    "created": 1754090031,
+    "created": 1756921334,
     "model": "llama3.2:3b-instruct-fp16",
     "object": "chat.completion.chunk",
     "service_tier": null,
@@ -206,7 +206,7 @@
 {
   "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
   "__data__": {
-    "id": "chatcmpl-29",
+    "id": "chatcmpl-414",
     "choices": [
       {
         "delta": {
@@ -221,7 +221,7 @@
         "logprobs": null
       }
     ],
-    "created": 1754090031,
+    "created": 1756921334,
     "model": "llama3.2:3b-instruct-fp16",
     "object": "chat.completion.chunk",
     "service_tier": null,
@@ -1,7 +1,7 @@
 {
   "request": {
     "method": "POST",
-    "url": "http://localhost:11434/v1/v1/chat/completions",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
     "headers": {},
     "body": {
       "model": "llama3.2:3b-instruct-fp16",
@@ -20,14 +20,14 @@
     "body": {
       "__type__": "openai.types.chat.chat_completion.ChatCompletion",
       "__data__": {
-        "id": "chatcmpl-368",
+        "id": "chatcmpl-161",
         "choices": [
           {
             "finish_reason": "stop",
             "index": 0,
             "logprobs": null,
             "message": {
-              "content": "Saturn is known for its extensive ring system.",
+              "content": "The answer is Saturn.",
               "refusal": null,
               "role": "assistant",
               "annotations": null,
@@ -37,15 +37,15 @@
           }
         }
       ],
-      "created": 1754081853,
+      "created": 1756921364,
       "model": "llama3.2:3b-instruct-fp16",
       "object": "chat.completion",
       "service_tier": null,
       "system_fingerprint": "fp_ollama",
       "usage": {
-        "completion_tokens": 11,
+        "completion_tokens": 6,
         "prompt_tokens": 39,
-        "total_tokens": 50,
+        "total_tokens": 45,
         "completion_tokens_details": null,
         "prompt_tokens_details": null
       }

@@ -20,15 +20,15 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama-guard3:1b",
-      "created_at": "2025-08-01T23:12:51.682357Z",
+      "created_at": "2025-09-03T17:37:33.473237Z",
       "done": true,
       "done_reason": "stop",
-      "total_duration": 238161000,
+      "total_duration": 279025042,
-      "load_duration": 72494750,
+      "load_duration": 162673250,
       "prompt_eval_count": 212,
-      "prompt_eval_duration": 87000000,
+      "prompt_eval_duration": 73595834,
       "eval_count": 5,
-      "eval_duration": 74000000,
+      "eval_duration": 41950291,
       "response": "unsafe\nS8",
       "thinking": null,
       "context": null
@@ -1,7 +1,7 @@
 {
   "request": {
     "method": "POST",
-    "url": "http://localhost:11434/v1/v1/chat/completions",
+    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
     "headers": {},
     "body": {
       "model": "llama3.2:3b-instruct-fp16",
@@ -20,14 +20,14 @@
     "body": {
       "__type__": "openai.types.chat.chat_completion.ChatCompletion",
       "__data__": {
-        "id": "chatcmpl-876",
+        "id": "chatcmpl-974",
         "choices": [
           {
             "finish_reason": "stop",
             "index": 0,
             "logprobs": null,
             "message": {
-              "content": "I'm afraid I don't have a built-in ability to directly interface with or \"test\" OpenAI models, including the original GPT-1 model. However, I can explain how you might approach this task:\n\nThe OpenAI GPT-1 is a large transformer-based language model that was trained on a massive dataset of text and achieved state-of-the-art results in various natural language processing tasks.\n\nTo test or evaluate the performance of a model like GPT-1, you would typically follow these steps:\n\n1. **Get access to the OpenAI API**: The OpenAI API provides a way for developers to interact with the GPT-1 model programmatically. You can sign up for an API key on the OpenAI website.\n2. **Choose a testing platform or environment**: You'll need a compute platform that supports the necessary algorithms and data structures to run inference on the GPT-1 model. Some popular options include AWS, Google Cloud, or Azure Compute Virtual Machines.\n3. **Prepare your test input data**: This will involve creating text inputs in the format expected by the OpenAI API (i.e., a JSON object containing the text to be processed).\n4. **Use the OpenAI Python library or SDK**: The OpenAI Python library provides an easy-to-use interface for interacting with the GPT-1 model through the API.\n\nHere's some example code that demonstrates how you might use the OpenAI Flask API to test a single input:\n\n```python\nfrom flask import Flask, request, jsonify\nimport json\n\napp = Flask(__name__)\n\n@ app . route ( '/ /gpt-en ', ' Text ', methods = ['POST'])\ndef gpt_en () -> Json :\n    data = request . get_json ()\n    if not data or \"message\" in ( data ):\n        return None , 400 , { ' error' : \"Input must be a text string.\" }\n    response = []\n    while True:\n        message = \"\"\n        for token in data [\"input\"]:\n            response_text = f\"{data['prompt']} {token}\"\n            data[\"input\"] = [response_text]\n            new_response = gpt_en()(data)\n            if all([not item or not isinstance(item, dict) for item in new_response]):\n                break\n\n        message = json . dumps ({}\"text\": response_text})\n        response.append(message)\n\n    return jsonify ({\"output\": response}), 200 , {}\n\nif __name__ == \"__main__\":\n    app.run(debug=True)\n```\n\n5. **Evaluate the output**: Once you have processed your test input data using the GPT-1 model, you can evaluate the accuracy of the generated responses.\n\nKeep in mind that this is just a basic example to illustrate how you might approach testing the OpenAI GPT-1 model.",
+              "content": "I'm happy to help you test the OpenAI API, however I can not access the API.\n\nInstead why don't we follow these steps:\n\n* Check documentation\n* Contact support\n* Reach out to their community forum. \n\nLet me know if I can be of any additional assistance",
               "refusal": null,
               "role": "assistant",
               "annotations": null,
@@ -37,15 +37,15 @@
           }
         }
       ],
-      "created": 1754510050,
+      "created": 1756921202,
       "model": "llama3.2:3b-instruct-fp16",
       "object": "chat.completion",
       "service_tier": null,
       "system_fingerprint": "fp_ollama",
       "usage": {
-        "completion_tokens": 567,
+        "completion_tokens": 61,
         "prompt_tokens": 31,
-        "total_tokens": 598,
+        "total_tokens": 92,
         "completion_tokens_details": null,
         "prompt_tokens_details": null
       }

@@ -20,15 +20,15 @@
     "__type__": "ollama._types.GenerateResponse",
     "__data__": {
       "model": "llama-guard3:1b",
-      "created_at": "2025-08-01T23:12:52.919624Z",
+      "created_at": "2025-09-03T17:37:34.308033Z",
       "done": true,
       "done_reason": "stop",
-      "total_duration": 201956834,
+      "total_duration": 200296000,
-      "load_duration": 105132584,
+      "load_duration": 115974708,
       "prompt_eval_count": 212,
-      "prompt_eval_duration": 75000000,
+      "prompt_eval_duration": 72173459,
       "eval_count": 2,
-      "eval_duration": 20000000,
+      "eval_duration": 11536750,
       "response": "safe",
       "thinking": null,
       "context": null
|

@@ -40,7 +40,7 @@
 {
 "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
 "__data__": {
-"id": "chatcmpl-457",
+"id": "chatcmpl-921",
 "choices": [
 {
 "delta": {
@@ -55,7 +55,7 @@
 "logprobs": null
 }
 ],
-"created": 1754090032,
+"created": 1756920971,
 "model": "llama3.2:3b-instruct-fp16",
 "object": "chat.completion.chunk",
 "service_tier": null,
@@ -66,7 +66,7 @@
 {
 "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
 "__data__": {
-"id": "chatcmpl-457",
+"id": "chatcmpl-921",
 "choices": [
 {
 "delta": {
@@ -81,7 +81,7 @@
 "logprobs": null
 }
 ],
-"created": 1754090032,
+"created": 1756920971,
 "model": "llama3.2:3b-instruct-fp16",
 "object": "chat.completion.chunk",
 "service_tier": null,
@@ -92,7 +92,7 @@
 {
 "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
 "__data__": {
-"id": "chatcmpl-457",
+"id": "chatcmpl-921",
 "choices": [
 {
 "delta": {
@@ -107,7 +107,7 @@
 "logprobs": null
 }
 ],
-"created": 1754090032,
+"created": 1756920971,
 "model": "llama3.2:3b-instruct-fp16",
 "object": "chat.completion.chunk",
 "service_tier": null,
@@ -118,7 +118,7 @@
 {
 "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
 "__data__": {
-"id": "chatcmpl-457",
+"id": "chatcmpl-921",
 "choices": [
 {
 "delta": {
@@ -133,7 +133,7 @@
 "logprobs": null
 }
 ],
-"created": 1754090032,
+"created": 1756920971,
 "model": "llama3.2:3b-instruct-fp16",
 "object": "chat.completion.chunk",
 "service_tier": null,
@@ -144,7 +144,7 @@
 {
 "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
 "__data__": {
-"id": "chatcmpl-457",
+"id": "chatcmpl-921",
 "choices": [
 {
 "delta": {
@@ -159,7 +159,7 @@
 "logprobs": null
 }
 ],
-"created": 1754090032,
+"created": 1756920971,
 "model": "llama3.2:3b-instruct-fp16",
 "object": "chat.completion.chunk",
 "service_tier": null,
@@ -170,7 +170,7 @@
 {
 "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
 "__data__": {
-"id": "chatcmpl-457",
+"id": "chatcmpl-921",
 "choices": [
 {
 "delta": {
@@ -185,7 +185,7 @@
 "logprobs": null
 }
 ],
-"created": 1754090032,
+"created": 1756920971,
 "model": "llama3.2:3b-instruct-fp16",
 "object": "chat.completion.chunk",
 "service_tier": null,
@@ -196,7 +196,7 @@
 {
 "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
 "__data__": {
-"id": "chatcmpl-457",
+"id": "chatcmpl-921",
 "choices": [
 {
 "delta": {
@@ -211,7 +211,7 @@
 "logprobs": null
 }
 ],
-"created": 1754090032,
+"created": 1756920971,
 "model": "llama3.2:3b-instruct-fp16",
 "object": "chat.completion.chunk",
 "service_tier": null,
@@ -222,7 +222,7 @@
 {
 "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
 "__data__": {
-"id": "chatcmpl-457",
+"id": "chatcmpl-921",
 "choices": [
 {
 "delta": {
@@ -237,7 +237,7 @@
 "logprobs": null
 }
 ],
-"created": 1754090032,
+"created": 1756920971,
 "model": "llama3.2:3b-instruct-fp16",
 "object": "chat.completion.chunk",
 "service_tier": null,

@@ -20,15 +20,15 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama-guard3:1b",
-"created_at": "2025-08-01T23:12:53.580806Z",
+"created_at": "2025-09-03T17:37:34.994704Z",
 "done": true,
 "done_reason": "stop",
-"total_duration": 205732750,
-"load_duration": 98967000,
+"total_duration": 339570875,
+"load_duration": 262794125,
 "prompt_eval_count": 213,
-"prompt_eval_duration": 86000000,
+"prompt_eval_duration": 64061000,
 "eval_count": 2,
-"eval_duration": 18000000,
+"eval_duration": 11839042,
 "response": "safe",
 "thinking": null,
 "context": null

@@ -20,15 +20,15 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama-guard3:1b",
-"created_at": "2025-08-01T23:12:52.354566Z",
+"created_at": "2025-09-03T17:37:33.769233Z",
 "done": true,
 "done_reason": "stop",
-"total_duration": 605192500,
-"load_duration": 457087166,
+"total_duration": 253836584,
+"load_duration": 138624959,
 "prompt_eval_count": 210,
-"prompt_eval_duration": 63000000,
+"prompt_eval_duration": 69496125,
 "eval_count": 5,
-"eval_duration": 84000000,
+"eval_duration": 45062833,
 "response": "unsafe\nS12",
 "thinking": null,
 "context": null

@@ -20,15 +20,15 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama-guard3:1b",
-"created_at": "2025-08-01T23:12:52.686478Z",
+"created_at": "2025-09-03T17:37:34.074233Z",
 "done": true,
 "done_reason": "stop",
-"total_duration": 304136208,
-"load_duration": 155977000,
+"total_duration": 270746375,
+"load_duration": 156423042,
 "prompt_eval_count": 213,
-"prompt_eval_duration": 71000000,
+"prompt_eval_duration": 70338083,
 "eval_count": 5,
-"eval_duration": 76000000,
+"eval_duration": 43379167,
 "response": "unsafe\nS2",
 "thinking": null,
 "context": null

@@ -20,15 +20,15 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama-guard3:1b",
-"created_at": "2025-08-01T23:12:51.186501Z",
+"created_at": "2025-09-03T17:37:32.84197Z",
 "done": true,
 "done_reason": "stop",
-"total_duration": 3146184459,
-"load_duration": 2533467917,
+"total_duration": 21572898667,
+"load_duration": 21155275042,
 "prompt_eval_count": 212,
-"prompt_eval_duration": 526000000,
+"prompt_eval_duration": 371898125,
 "eval_count": 5,
-"eval_duration": 83000000,
+"eval_duration": 43290458,
 "response": "unsafe\nS1",
 "thinking": null,
 "context": null

@@ -20,15 +20,15 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama-guard3:1b",
-"created_at": "2025-08-01T23:12:53.332041Z",
+"created_at": "2025-09-03T17:37:34.607413Z",
 "done": true,
 "done_reason": "stop",
-"total_duration": 365895333,
-"load_duration": 257825208,
+"total_duration": 267812042,
+"load_duration": 181570000,
 "prompt_eval_count": 213,
-"prompt_eval_duration": 78000000,
+"prompt_eval_duration": 73947375,
 "eval_count": 2,
-"eval_duration": 28000000,
+"eval_duration": 11708000,
 "response": "safe",
 "thinking": null,
 "context": null

@@ -22,15 +22,15 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-04T22:55:05.685988Z",
+"created_at": "2025-09-03T17:36:13.821929Z",
 "done": true,
 "done_reason": "stop",
-"total_duration": 14128980625,
-"load_duration": 7220159208,
+"total_duration": 1907912167,
+"load_duration": 90979292,
 "prompt_eval_count": 18,
-"prompt_eval_duration": 4658000000,
+"prompt_eval_duration": 77350291,
 "eval_count": 43,
-"eval_duration": 2224000000,
+"eval_duration": 1738568334,
 "response": " _______.\n\nThe best answer is blue. The traditional nursery rhyme goes like this:\n\nRoses are red,\nViolets are blue,\nSugar is sweet,\nAnd so are you! (Or something similar.)",
 "thinking": null,
 "context": null

@@ -20,15 +20,15 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-07-31T17:50:06.140190726Z",
+"created_at": "2025-09-03T17:39:38.236797Z",
 "done": true,
 "done_reason": "stop",
-"total_duration": 5213341378,
-"load_duration": 43943569,
+"total_duration": 1296281500,
+"load_duration": 283393917,
 "prompt_eval_count": 23,
-"prompt_eval_duration": 1049424427,
+"prompt_eval_duration": 75453042,
 "eval_count": 24,
-"eval_duration": 4119422888,
+"eval_duration": 936860125,
 "response": "Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004.",
 "thinking": null,
 "context": null

tests/integration/recordings/responses/1e11c2b20ff8.json (new file, +422 lines)
@@ -0,0 +1,422 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/embeddings",
+    "headers": {},
+    "body": {
+      "model": "all-minilm:l6-v2",
+      "input": [
+        "How do systems learn automatically?"
+      ],
+      "encoding_format": "float"
+    },
+    "endpoint": "/v1/embeddings",
+    "model": "all-minilm:l6-v2"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.create_embedding_response.CreateEmbeddingResponse",
+      "__data__": {
+        "data": [
+          {
+            "embedding": [
+              0.042460807, -0.06189971, -0.0784711, 0.0064329687, 0.03129365, 0.00807445,
+              0.05801836, 0.025447326, 0.016402787, 0.045995634, -0.028924342, 0.04451832,
+              0.05686613, -0.015340794, -0.07020505, -0.057178136, -0.07683263, 0.006748679,
+              0.0043323045, -0.123651944, 0.0031534543, -0.03258051, -0.02936216, 0.024140852,
+              -0.028559243, 0.10224467, 0.0021632623, -0.006975691, 0.025292527, -0.055500276,
+              0.031231727, -0.0070274337, 0.08430815, -0.028431177, -0.083029, 0.009555893,
+              -0.020029299, -0.00243229, -0.00768719, -0.023077851, -0.09293533, -0.042625993,
+              -0.020000124, 0.008240663, 0.060970567, 0.050315727, -0.0510085, -0.008543903,
+              -0.030227834, -0.03582846, -0.17836656, -0.047279052, 0.033892106, 0.031623542,
+              -0.008832113, 0.10480918, 0.033559043, 0.090348184, -0.015757555, -0.0125672715,
+              -0.084686965, -0.114781834, -0.13755985, 0.021652374, 0.047834594, 0.043243896,
+              0.008659893, 0.038724966, 0.046716973, -0.077413626, -0.04887495, 0.031287406,
+              0.022356613, 0.00043283988, 0.052321073, -0.012254071, -0.035172574, -0.00825216,
+              -0.008866574, -0.034267236, -0.04576201, 0.002467568, -0.040877618, 0.08047682,
+              0.09472728, 0.0413438, 0.0057974122, 0.044982508, 0.025369909, 0.006618073,
+              0.010467276, -0.07960384, -0.03108485, -0.03528749, 0.01831391, 0.053473305,
+              0.06568304, -0.07259002, 0.02523736, 0.10520362, 0.035732146, 0.028157586,
+              0.011687256, 0.044207197, 0.012604437, 0.0018819098, 0.03926183, 0.043135095,
+              0.09784739, -0.08801336, -0.06060836, 0.02681984, 0.0041358666, 0.033492945,
+              0.011799116, 0.009551661, -0.0095491735, -0.021212189, -0.008917248, 0.029352615,
+              -0.012693442, -0.019269384, 0.009901157, -0.00812101, 0.018603146, -0.0007501193,
+              -0.056115113, -3.8018077e-33, 0.020848714, 0.0047160466, 0.019726405, 0.06024251,
+              -0.0685974, -0.07497267, 0.007997452, -0.047339544, 0.057801835, 0.049544968,
+              0.01878086, 0.03274472, 0.017663997, 0.07483022, 0.02496901, -0.011843339,
+              -0.11212756, 0.0070379525, 0.028099466, -0.01746246, 0.08173482, -0.007920462,
+              0.032095373, -0.12300146, 0.033773854, 0.025873141, -0.0045020077, 0.079493225,
+              0.0040725255, 0.03305898, 0.008061117, 0.0134422695, -0.03292251, 0.031554114,
+              0.04013794, 0.0014983519, 0.030762345, 0.029481992, 0.041350223, -0.047438618,
+              0.03944708, -0.07526981, 0.037927423, -0.026016014, 0.016933467, 0.0136799775,
+              0.0071263947, -0.05386736, -0.07443268, -0.006070775, 0.024427462, -0.039844982,
+              -0.020661902, -0.033354662, 0.009005565, 0.12111172, -0.028260944, -0.036192853,
+              -0.021332363, 0.05333571, 0.05161245, -0.01204843, 0.035563566, 0.05408247,
+              0.060722187, 0.07159865, 0.04299143, 0.008544481, 0.07421879, 0.00841512,
+              -0.036342908, -0.008549791, -0.08816386, -0.049075164, 0.00029373015, -0.05127952,
+              0.03586739, -0.030380003, -0.012642127, 0.018771531, 0.01711824, -0.06644723,
+              0.023793438, 0.0010271219, -0.01939443, -0.053452212, -0.017060323, -0.062207118,
+              -0.05962535, -0.012172617, -0.013190802, -0.037036054, 0.00082622556, 0.098088354,
+              0.024690514, 2.1767905e-33, -0.010088812, -0.016811697, -0.042140447, 0.08837209,
+              -0.028899776, -0.0048947735, -0.082139015, 0.029238816, -0.043079354, -0.014153092,
+              -0.028387645, 0.025998218, -0.017625, 0.046511114, -0.005768211, 0.030010609,
+              0.011375536, 0.017426634, 0.055062976, 0.032230247, -0.07995765, 0.032486655,
+              -0.060016844, -0.011561194, 0.010211269, 0.046528235, 0.001191399, 0.0786961,
+              -0.0446158, 0.032789085, 0.0023115936, -0.03886269, -0.017663589, 0.07913024,
+              -0.004583343, 0.043521065, -0.031589273, 0.008867868, -0.05013296, 0.068929516,
+              0.043675046, 0.019968731, -0.08471742, -0.046864275, -0.0068198936, -0.026138468,
+              -0.05107216, 0.054374695, 0.03069186, -0.010925094, 0.04721093, -0.017387696,
+              -0.020754937, -0.081763394, -0.027709637, 0.035980806, 0.05396534, 0.044874854,
+              0.059699643, 0.041227758, -0.06664364, -0.09201654, 0.008915574, 0.025849758,
+              -0.038651932, -0.0044070315, -0.052066546, 0.027435115, 0.012089562, 0.048306923,
+              0.059854515, 0.097325735, -0.053612895, -0.07639326, 0.015773866, -0.0444848,
+              -0.13214406, -0.0702488, -0.10134438, -0.11905995, -0.027714504, 0.006891868,
+              -0.0053650527, 0.054135524, -0.111159205, 0.07835098, 0.03506018, 0.016036613,
+              0.021490784, -0.061526407, 0.007425222, 0.04833579, -0.01361202, 0.012450488,
+              -0.12729599, -1.4009424e-08, -0.040908325, -0.01596458, 0.060048707, 0.03804525,
+              0.0663794, 0.04727275, -0.016112225, 0.09687414, -0.04424251, -0.028799534,
+              -0.01294642, 0.013026413, 0.022404836, 0.04713173, 0.06402557, 0.12130648,
+              0.06062839, 0.10218965, -0.0757528, -0.023806982, 0.12489501, -0.045460615,
+              0.09545599, 0.021262301, 0.03731495, -0.075220875, -0.0026194793, 0.0472452,
+              0.048499025, 0.12358729, 0.017998053, 0.013811017, -0.035893846, -0.051789004,
+              0.06182457, 0.05160056, 0.008895317, -0.12500942, 0.016453298, -0.08590811,
+              -0.071096726, 0.06987216, -0.036072273, -0.0053715096, -0.048762616, 0.00081640907,
+              -0.021502526, -0.061078615, 0.002485032, -0.032720752, 0.045743283, 0.038934175,
+              -0.024666062, 0.025897244, 0.10301431, -0.013001504, 0.04783332, -0.07114252,
+              0.046031926, 0.080549754, -0.10302451, 0.08449227, 0.028010191, -0.03697792
+            ],
+            "index": 0,
+            "object": "embedding"
+          }
+        ],
+        "model": "all-minilm:l6-v2",
+        "object": "list",
+        "usage": {
+          "prompt_tokens": 6,
+          "total_tokens": 6
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}

@@ -20,15 +20,15 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-04T22:55:11.15982Z",
+"created_at": "2025-09-03T17:36:17.894986Z",
 "done": true,
 "done_reason": "stop",
-"total_duration": 498612042,
-"load_duration": 71411834,
+"total_duration": 363397458,
+"load_duration": 86692791,
 "prompt_eval_count": 23,
-"prompt_eval_duration": 102000000,
+"prompt_eval_duration": 68658541,
 "eval_count": 6,
-"eval_duration": 323000000,
+"eval_duration": 207389084,
 "response": "Humans live on Earth.",
 "thinking": null,
 "context": null

tests/integration/recordings/responses/23506e73bb9e.json (new file, +422 lines)
@@ -0,0 +1,422 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://0.0.0.0:11434/v1/v1/embeddings",
+    "headers": {},
+    "body": {
+      "model": "all-minilm:l6-v2",
+      "input": [
+        "This is a test file 1"
+      ],
+      "encoding_format": "float"
+    },
+    "endpoint": "/v1/embeddings",
+    "model": "all-minilm:l6-v2"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.create_embedding_response.CreateEmbeddingResponse",
+      "__data__": {
+        "data": [
+          {
+            "embedding": [
+              -0.055990793, 0.076004684, -0.09247725, 0.014340361, 0.058780864, -0.032434482,
+              0.020954052, 0.028818125, -0.06591213, 0.013541593, 0.12999941, 0.004603084,
+              -0.0069239275, -0.055457443, -0.047553156, -0.029139794, -0.12236376, -0.05360872,
+              -0.014706594, 0.05984688, 0.034442738, 0.02076038, -0.048697792, 0.0135388365,
+              0.058592733, -0.003076384, -0.031565297, 0.082541116, -0.031259205, -0.12057633,
+              0.038319625, 0.06574785, 0.06415721, 0.038382582, 0.12570712, 0.03108174,
+              0.10821103, -0.0019794356, -0.024704305, 0.028765837, 0.01268161, -0.039844505,
+              0.043253522, -0.015898596, -0.0135526005, -0.0050831717, -0.007911988, 0.039783813,
+              0.0036548872, -0.033632487, -0.058547974, 0.0048877494, -0.089586094, -0.010457663,
+              0.059202507, -0.020414542, 0.014278556, 0.013986488, -0.0046022516, 0.0383391,
+              0.0048145773, 0.029772853, -0.020863408, 0.018640704, 0.12422993, -0.023236223,
+              -0.040323637, -0.023598222, -0.007448043, -0.09083128, -0.16859712, 0.01012451,
+              -0.035808884, 0.010595173, -0.02050494, 0.0020821376, -0.10925222, 0.00793264,
+              0.048889533, -0.11391199, -0.06072707, -0.13435508, 0.0063265716, -0.008838073,
+              -0.03153269, 0.099169336, 0.055310693, 0.0068571265, -0.023463152, -0.0031599961,
+              0.036782328, 0.014336826, 0.022220163, 0.047114056, 0.007079763, 0.06806425,
+              0.01851431, 0.040882625, 0.055058856, 0.09488346, -0.015833577, -7.924328e-05,
+              0.010821554, 0.09177704, -0.07464829, -0.06471165, 0.07013805, -0.04499751,
+              0.057702336, -0.0260911, 0.006323043, -0.09500501, -0.010549514, -0.07887475,
+              0.039744847, -0.04154404, -0.055268157, 0.07540271, -0.04667509, 0.036143072,
+              0.080297194, -0.036381353, -0.03477274, 0.01701203, -0.047007203, -0.06519774,
+              0.062141683, -4.222482e-33, -0.0017580023, -0.09383388, -0.02982657, 0.1257841,
+              0.03802007, -0.03654342, 0.0060920226, 0.05906885, -0.11074452, 0.005664566,
+              -0.0259852, -0.074819505, 0.008342821, 0.027451068, -0.05248069, 0.02401768,
+              -0.004380289, 0.039321493, -0.04213744, -0.027290314, 0.054677974, 0.02707243,
+              -0.03329442, -0.060589895, -0.050737355, 0.017969057, -0.0035060972, -0.04666249,
+              0.073946096, 0.01333894, -0.0033873583, -0.046544433, -0.060105033, 0.03406923,
+              0.001542676, 0.039177947, 0.03989323, -0.012346489, -0.030511485, -0.0019157606,
+              -0.014608986, -0.012997742, 0.019522104, -0.022349002, 0.074362256, -0.053366993,
+              -0.023993475, 0.029225096, 0.027534606, 0.015111057, -0.020442221, 0.043327376,
+              0.019660354, 0.017330697, -0.0035011724, 0.019482937, -0.0003428041, 0.0004143988,
+              -0.005117252, 0.06624799, 0.027922852, 0.041020587, -0.067166425, 0.028737254,
+              -0.03478325, -0.055551115, -0.032713737, -0.08099247, 0.09216284, 0.06395264,
+              -0.049168136, -0.039908994, 0.036915958, -0.001602359, 0.00033041168, -0.026015632,
+              -0.005999889, 0.05474541, -0.09568287, -0.05186289, -0.048838183, -0.08639551,
+              -0.034023147, -0.033257127, -0.05651867, -0.051131375, 0.00809173, -0.08581851,
+              0.06507323, -0.085427366, 0.027997404, 0.029847065, -0.031673994, -0.08560956,
+              0.1017672, 2.1855676e-33, 0.01160785, 0.077607885, -0.017380483, 0.005239329,
+              0.0009684126, 0.06543702, 0.07256893, -0.044318836, -0.04749324, 0.14031002,
+              -0.025741624, 0.0057860985, 0.040946104, -0.054880083, 0.074413285, -0.023610368,
+              0.018364722, -0.060585637, -0.044149306, 0.0027854694, -0.04580664, 0.1172219,
+              0.10268574, 0.07907412, -0.0466143, 0.018618405, 0.029834948, 0.037265483,
+              0.02273822, -0.0026589038, 0.041726097, 0.06439532, -0.089163445, 0.018188318,
+              0.024064727, -0.096389584, 0.08642254, -0.05389359, 0.01923105, 0.045092683,
+              0.045125954, 0.09655961, 0.014908797, 0.059611585, 0.03066662, 0.05882299,
+              0.111484826, 0.016632542, 0.011590394, -0.023702666, -0.008617484, -0.055030316,
+              0.047606383, -0.014632687, -0.014156344, 0.069926, 0.032047603, 0.042642817,
+              -0.053942375, 0.031047028, 0.009216673, 0.033024028, -0.019033706, 0.005568194,
+              -0.014985451, -0.09193244, -0.03210824, 0.015367608, 0.029150328, 0.01250386,
+              -0.004827391, 0.023345906, -0.028271332, -0.08454125, 0.051068563, -0.0133641455,
+              -0.029022738, -0.02258452, 0.010884119, -0.009810021, 0.049751773, -0.0032637494,
+              -0.038813565, 0.027924104, 0.017925078, 0.005337612, 0.058691237, 0.09577674,
+              -0.014308608, 0.006972794, -0.02733344, 0.06912433, 0.05727631, 0.03206042,
+              0.0042422824, -1.6766318e-08, -0.036354303, -0.09146416, -0.026319364, -0.007941995,
+              -0.024127059, 0.09896698, -0.04723083, -0.03767135, -0.029419973, -0.022513283,
+              0.04125822, -0.0011487947, -0.05570366, 0.020679709, -0.038118906, -0.0524994,
+              -0.02624128, -0.05336954, -0.040593866, -0.0073642326, -0.0014442836, 0.02714257,
+              0.027141048, 0.00932513, -0.00026505854, 0.038233075, 0.037096914, 0.08405413,
+              -0.06340637, -0.014856458, 0.05038612, 0.06703033, 0.027668556, -0.04360097,
+              -0.012041474, 0.08500689, 0.111594744, 0.1046117, 0.019726463, -0.0003025109,
+              -0.04110389, 0.009575226, -0.05285304, -0.0026365265, -0.031144748, -0.08860188,
+              -0.06762232, -0.07451522, -0.053012833, -0.09560941, -0.05273455, 0.013032144,
+              0.0029190276, 0.041905046, -0.04522114, 0.016730292, 0.017214278, 0.021578068,
+              -0.03718778, 0.02353425, 0.052041385, 0.06444499, 0.02387539, -0.025236009
+            ],
+            "index": 0,
+            "object": "embedding"
+          }
+        ],
+        "model": "all-minilm:l6-v2",
+        "object": "list",
+        "usage": {
+          "prompt_tokens": 6,
+          "total_tokens": 6
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}

@@ -22,7 +22,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:01.887809Z",
+"created_at": "2025-09-03T17:37:50.436472Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -40,7 +40,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:01.942369Z",
+"created_at": "2025-09-03T17:37:50.478138Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -58,7 +58,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:01.99605Z",
+"created_at": "2025-09-03T17:37:50.519952Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -76,7 +76,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:02.049974Z",
+"created_at": "2025-09-03T17:37:50.561433Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -94,7 +94,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:02.102027Z",
+"created_at": "2025-09-03T17:37:50.603624Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -112,7 +112,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:02.158416Z",
+"created_at": "2025-09-03T17:37:50.645851Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -130,7 +130,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:02.211753Z",
+"created_at": "2025-09-03T17:37:50.688403Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -148,7 +148,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:02.265564Z",
+"created_at": "2025-09-03T17:37:50.72991Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -166,7 +166,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:02.31618Z",
+"created_at": "2025-09-03T17:37:50.771635Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -184,7 +184,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:02.370325Z",
+"created_at": "2025-09-03T17:37:50.813711Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -202,7 +202,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:02.424667Z",
+"created_at": "2025-09-03T17:37:50.856201Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -220,7 +220,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:02.47913Z",
+"created_at": "2025-09-03T17:37:50.899048Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -238,15 +238,15 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:02.536984Z",
+"created_at": "2025-09-03T17:37:50.94069Z",
 "done": true,
 "done_reason": "stop",
-"total_duration": 1042724125,
-"load_duration": 86161375,
+"total_duration": 688370708,
+"load_duration": 107469833,
 "prompt_eval_count": 399,
-"prompt_eval_duration": 305000000,
+"prompt_eval_duration": 74988334,
 "eval_count": 13,
-"eval_duration": 650000000,
+"eval_duration": 505216458,
 "response": "",
 "thinking": null,
 "context": null

@@ -22,7 +22,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:11.938867Z",
+"created_at": "2025-09-03T17:37:56.566151Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -40,7 +40,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:11.991247Z",
+"created_at": "2025-09-03T17:37:56.609308Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -58,7 +58,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.043953Z",
+"created_at": "2025-09-03T17:37:56.651314Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -76,7 +76,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.096001Z",
+"created_at": "2025-09-03T17:37:56.693185Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -94,7 +94,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.150454Z",
+"created_at": "2025-09-03T17:37:56.734643Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -112,7 +112,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.201249Z",
+"created_at": "2025-09-03T17:37:56.776343Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -130,7 +130,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.252534Z",
+"created_at": "2025-09-03T17:37:56.81705Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -148,7 +148,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.30063Z",
+"created_at": "2025-09-03T17:37:56.857959Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -166,7 +166,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.351034Z",
+"created_at": "2025-09-03T17:37:56.899424Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -184,7 +184,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.405032Z",
+"created_at": "2025-09-03T17:37:56.939218Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -202,7 +202,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.462645Z",
+"created_at": "2025-09-03T17:37:56.980065Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -220,7 +220,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.520337Z",
+"created_at": "2025-09-03T17:37:57.02214Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -238,7 +238,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.575809Z",
+"created_at": "2025-09-03T17:37:57.0628Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -256,7 +256,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.633724Z",
+"created_at": "2025-09-03T17:37:57.106061Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -274,7 +274,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.683133Z",
+"created_at": "2025-09-03T17:37:57.1492Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -292,7 +292,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.734309Z",
+"created_at": "2025-09-03T17:37:57.190075Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -310,7 +310,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.785917Z",
+"created_at": "2025-09-03T17:37:57.23178Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -328,7 +328,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.835705Z",
+"created_at": "2025-09-03T17:37:57.272738Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -346,7 +346,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.886509Z",
+"created_at": "2025-09-03T17:37:57.313855Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -364,7 +364,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.937134Z",
+"created_at": "2025-09-03T17:37:57.354964Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -382,7 +382,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:12.988532Z",
+"created_at": "2025-09-03T17:37:57.395971Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -400,7 +400,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:13.041798Z",
+"created_at": "2025-09-03T17:37:57.438471Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -418,7 +418,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:13.095443Z",
+"created_at": "2025-09-03T17:37:57.479796Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -436,7 +436,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:13.151402Z",
+"created_at": "2025-09-03T17:37:57.520641Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -454,7 +454,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:13.203462Z",
+"created_at": "2025-09-03T17:37:57.561511Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -472,7 +472,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:13.254567Z",
+"created_at": "2025-09-03T17:37:57.602875Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -490,7 +490,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:13.305865Z",
+"created_at": "2025-09-03T17:37:57.643406Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -508,7 +508,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:13.357658Z",
+"created_at": "2025-09-03T17:37:57.684279Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -526,7 +526,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:13.407773Z",
+"created_at": "2025-09-03T17:37:57.725699Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -544,7 +544,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:13.458919Z",
+"created_at": "2025-09-03T17:37:57.766658Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -562,7 +562,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:13.510456Z",
+"created_at": "2025-09-03T17:37:57.80738Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -580,7 +580,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:13.565948Z",
+"created_at": "2025-09-03T17:37:57.848466Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -598,7 +598,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:13.619155Z",
+"created_at": "2025-09-03T17:37:57.889056Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -616,7 +616,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:13.672754Z",
+"created_at": "2025-09-03T17:37:57.931554Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -634,7 +634,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:13.729473Z",
+"created_at": "2025-09-03T17:37:57.974754Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -652,7 +652,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:13.788666Z",
+"created_at": "2025-09-03T17:37:58.016978Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -670,7 +670,7 @@
 "__type__": "ollama._types.GenerateResponse",
 "__data__": {
 "model": "llama3.2:3b-instruct-fp16",
-"created_at": "2025-08-01T23:14:13.850575Z",
+"created_at": "2025-09-03T17:37:58.057942Z",
 "done": false,
 "done_reason": null,
 "total_duration": null,
@@ -688,7 +688,7 @@
 "__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:13.904807Z",
|
"created_at": "2025-09-03T17:37:58.099015Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -706,7 +706,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:13.958524Z",
|
"created_at": "2025-09-03T17:37:58.140531Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -724,7 +724,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.011742Z",
|
"created_at": "2025-09-03T17:37:58.181382Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -742,7 +742,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.064933Z",
|
"created_at": "2025-09-03T17:37:58.223318Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -760,7 +760,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.116454Z",
|
"created_at": "2025-09-03T17:37:58.26358Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -778,7 +778,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.172682Z",
|
"created_at": "2025-09-03T17:37:58.305496Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -796,7 +796,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.227654Z",
|
"created_at": "2025-09-03T17:37:58.347254Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -814,7 +814,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.282068Z",
|
"created_at": "2025-09-03T17:37:58.390044Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -832,7 +832,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.334565Z",
|
"created_at": "2025-09-03T17:37:58.430867Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -850,7 +850,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.383532Z",
|
"created_at": "2025-09-03T17:37:58.471376Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -868,7 +868,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.432138Z",
|
"created_at": "2025-09-03T17:37:58.51208Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -886,7 +886,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.480995Z",
|
"created_at": "2025-09-03T17:37:58.553226Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -904,7 +904,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.531968Z",
|
"created_at": "2025-09-03T17:37:58.594787Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -922,7 +922,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.584044Z",
|
"created_at": "2025-09-03T17:37:58.63466Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -940,7 +940,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.635691Z",
|
"created_at": "2025-09-03T17:37:58.674628Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -958,7 +958,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.68837Z",
|
"created_at": "2025-09-03T17:37:58.714616Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -976,7 +976,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.73985Z",
|
"created_at": "2025-09-03T17:37:58.754906Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -994,7 +994,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.792412Z",
|
"created_at": "2025-09-03T17:37:58.795048Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1012,7 +1012,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.845872Z",
|
"created_at": "2025-09-03T17:37:58.835297Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1030,7 +1030,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.900102Z",
|
"created_at": "2025-09-03T17:37:58.875738Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1048,7 +1048,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:14.954589Z",
|
"created_at": "2025-09-03T17:37:58.91604Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1066,7 +1066,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.006629Z",
|
"created_at": "2025-09-03T17:37:58.956596Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1084,7 +1084,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.058561Z",
|
"created_at": "2025-09-03T17:37:58.996664Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1102,7 +1102,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.111954Z",
|
"created_at": "2025-09-03T17:37:59.037796Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1120,7 +1120,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.169173Z",
|
"created_at": "2025-09-03T17:37:59.078586Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1138,7 +1138,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.222569Z",
|
"created_at": "2025-09-03T17:37:59.119448Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1156,7 +1156,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.275795Z",
|
"created_at": "2025-09-03T17:37:59.160318Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1174,7 +1174,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.3327Z",
|
"created_at": "2025-09-03T17:37:59.201852Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1192,7 +1192,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.389931Z",
|
"created_at": "2025-09-03T17:37:59.243763Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1210,7 +1210,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.442349Z",
|
"created_at": "2025-09-03T17:37:59.284948Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1228,7 +1228,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.494175Z",
|
"created_at": "2025-09-03T17:37:59.325598Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1246,7 +1246,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.545764Z",
|
"created_at": "2025-09-03T17:37:59.366289Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1264,7 +1264,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.599099Z",
|
"created_at": "2025-09-03T17:37:59.406764Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1282,7 +1282,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.649852Z",
|
"created_at": "2025-09-03T17:37:59.447922Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1300,7 +1300,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.698222Z",
|
"created_at": "2025-09-03T17:37:59.488486Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1318,7 +1318,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.747168Z",
|
"created_at": "2025-09-03T17:37:59.529Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1336,7 +1336,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.797196Z",
|
"created_at": "2025-09-03T17:37:59.569417Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1354,7 +1354,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.845587Z",
|
"created_at": "2025-09-03T17:37:59.610542Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1372,7 +1372,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.897171Z",
|
"created_at": "2025-09-03T17:37:59.651411Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1390,7 +1390,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.944524Z",
|
"created_at": "2025-09-03T17:37:59.69241Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1408,7 +1408,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:15.994467Z",
|
"created_at": "2025-09-03T17:37:59.732339Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1426,7 +1426,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.045224Z",
|
"created_at": "2025-09-03T17:37:59.772462Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1444,7 +1444,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.093853Z",
|
"created_at": "2025-09-03T17:37:59.812507Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1462,7 +1462,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.144847Z",
|
"created_at": "2025-09-03T17:37:59.852762Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1480,7 +1480,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.197888Z",
|
"created_at": "2025-09-03T17:37:59.892984Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1498,7 +1498,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.250854Z",
|
"created_at": "2025-09-03T17:37:59.933555Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1516,7 +1516,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.301995Z",
|
"created_at": "2025-09-03T17:37:59.973778Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1534,7 +1534,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.352508Z",
|
"created_at": "2025-09-03T17:38:00.014923Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1552,7 +1552,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.40259Z",
|
"created_at": "2025-09-03T17:38:00.057464Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1570,7 +1570,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.453514Z",
|
"created_at": "2025-09-03T17:38:00.09902Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1588,7 +1588,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.50378Z",
|
"created_at": "2025-09-03T17:38:00.140492Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1606,7 +1606,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.554395Z",
|
"created_at": "2025-09-03T17:38:00.180239Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1624,7 +1624,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.605795Z",
|
"created_at": "2025-09-03T17:38:00.220364Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1642,7 +1642,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.656313Z",
|
"created_at": "2025-09-03T17:38:00.26097Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1660,7 +1660,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.706438Z",
|
"created_at": "2025-09-03T17:38:00.301228Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1678,7 +1678,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.756444Z",
|
"created_at": "2025-09-03T17:38:00.341631Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1696,7 +1696,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.807687Z",
|
"created_at": "2025-09-03T17:38:00.383006Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1714,7 +1714,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.85835Z",
|
"created_at": "2025-09-03T17:38:00.423509Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1732,7 +1732,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.909311Z",
|
"created_at": "2025-09-03T17:38:00.464702Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1750,7 +1750,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:16.959327Z",
|
"created_at": "2025-09-03T17:38:00.505914Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1768,7 +1768,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:17.010211Z",
|
"created_at": "2025-09-03T17:38:00.546505Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1786,7 +1786,7 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:17.061365Z",
|
"created_at": "2025-09-03T17:38:00.587839Z",
|
||||||
"done": false,
|
"done": false,
|
||||||
"done_reason": null,
|
"done_reason": null,
|
||||||
"total_duration": null,
|
"total_duration": null,
|
||||||
|
@ -1804,15 +1804,15 @@
|
||||||
"__type__": "ollama._types.GenerateResponse",
|
"__type__": "ollama._types.GenerateResponse",
|
||||||
"__data__": {
|
"__data__": {
|
||||||
"model": "llama3.2:3b-instruct-fp16",
|
"model": "llama3.2:3b-instruct-fp16",
|
||||||
"created_at": "2025-08-01T23:14:17.111956Z",
|
"created_at": "2025-09-03T17:38:00.629018Z",
|
||||||
"done": true,
|
"done": true,
|
||||||
"done_reason": "stop",
|
"done_reason": "stop",
|
||||||
"total_duration": 5499672375,
|
"total_duration": 4303339291,
|
||||||
"load_duration": 58161750,
|
"load_duration": 156231250,
|
||||||
"prompt_eval_count": 36,
|
"prompt_eval_count": 36,
|
||||||
"prompt_eval_duration": 266000000,
|
"prompt_eval_duration": 81909875,
|
||||||
"eval_count": 100,
|
"eval_count": 100,
|
||||||
"eval_duration": 5174000000,
|
"eval_duration": 4064559292,
|
||||||
"response": "",
|
"response": "",
|
||||||
"thinking": null,
|
"thinking": null,
|
||||||
"context": null
|
"context": null
|
||||||
|
|
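Every hunk above touches only volatile fields: the `created_at` wall-clock timestamp of each streamed chunk, and, in the final chunk, the nanosecond duration counters. The substantive payload (`model`, `done`, `done_reason`, `response`) is identical on both sides. As a minimal sketch of why such fields churn on every re-record, the snippet below masks them before comparing two captures; the `normalize` helper and the `VOLATILE_FIELDS` set are illustrative assumptions, not part of llama-stack.

```python
# Hypothetical helper (not from llama-stack): strip fields that change on
# every re-recording before diffing two captured GenerateResponse payloads.
from typing import Any

VOLATILE_FIELDS = {
    "created_at",            # wall-clock timestamp of the chunk
    "total_duration",        # nanosecond timings; vary run to run
    "load_duration",
    "prompt_eval_duration",
    "eval_duration",
}


def normalize(payload: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of a recorded payload without volatile fields."""
    return {k: v for k, v in payload.items() if k not in VOLATILE_FIELDS}


# Two chunks from the diff above: identical except for the timestamp.
old = {"model": "llama3.2:3b-instruct-fp16",
       "created_at": "2025-08-01T23:14:13.151402Z",
       "done": False, "done_reason": None, "total_duration": None}
new = {"model": "llama3.2:3b-instruct-fp16",
       "created_at": "2025-09-03T17:37:57.520641Z",
       "done": False, "done_reason": None, "total_duration": None}
assert normalize(old) == normalize(new)
```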
File diff suppressed because it is too large
@@ -1,7 +1,7 @@
 {
 "request": {
 "method": "POST",
-"url": "http://localhost:11434/v1/v1/chat/completions",
+"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
 "headers": {},
 "body": {
 "model": "llama3.2:3b-instruct-fp16",
@@ -22,14 +22,14 @@
 "body": {
 "__type__": "openai.types.chat.chat_completion.ChatCompletion",
 "__data__": {
-"id": "chatcmpl-339",
+"id": "chatcmpl-442",
 "choices": [
 {
 "finish_reason": "length",
 "index": 0,
 "logprobs": null,
 "message": {
-"content": "I can guide you through the process, but please note that this is not an official OpenAI API call. OpenAI's API terms and conditions prohibit using their models for malicious purposes.\n\nTo test a model like \"text-temperature\" with a temperature of 0 (i.e., no noise or randomness), we'll need to use a third-party library that connects to the OpenAI API. One such library is `transformers`.\n\nFirst, you need to install the `transformers` and `",
+"content": "I can guide you on how to use the `test-temperature` parameter with OpenAI's API, but please note that using a temperature of 0 may not produce meaningful results. Temperature is a hyperparameter that controls the level of randomness in the model's output.\n\nOpenAI's API uses a variant of the GPT-3 model, which is trained on a large corpus of text data. The `test-temperature` parameter allows you to adjust the level of randomness in the model's output",
 "refusal": null,
 "role": "assistant",
 "annotations": null,
@@ -39,7 +39,7 @@
 }
 }
 ],
-"created": 1754510065,
+"created": 1756921254,
 "model": "llama3.2:3b-instruct-fp16",
 "object": "chat.completion",
 "service_tier": null,
Some files were not shown because too many files have changed in this diff