Merge branch 'main' into content-extension

Francisco Arceo 2025-09-07 12:38:35 -06:00 committed by GitHub
commit 354ed48598
227 changed files with 21224 additions and 10798 deletions


@ -2,13 +2,6 @@ name: 'Run and Record Tests'
description: 'Run integration tests and handle recording/artifact upload'
inputs:
test-subdirs:
description: 'Comma-separated list of test subdirectories to run'
required: true
test-pattern:
description: 'Regex pattern to pass to pytest -k'
required: false
default: ''
stack-config:
description: 'Stack configuration to use'
required: true
@ -18,10 +11,18 @@ inputs:
inference-mode:
description: 'Inference mode (record or replay)'
required: true
run-vision-tests:
description: 'Whether to run vision tests'
test-suite:
description: 'Test suite to use: base, responses, vision, etc.'
required: false
default: 'false'
default: ''
test-subdirs:
description: 'Comma-separated list of test subdirectories to run; overrides test-suite'
required: false
default: ''
test-pattern:
description: 'Regex pattern to pass to pytest -k'
required: false
default: ''
runs:
using: 'composite'
@ -42,7 +43,7 @@ runs:
--test-subdirs '${{ inputs.test-subdirs }}' \
--test-pattern '${{ inputs.test-pattern }}' \
--inference-mode '${{ inputs.inference-mode }}' \
${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \
--test-suite '${{ inputs.test-suite }}' \
| tee pytest-${{ inputs.inference-mode }}.log
@ -57,12 +58,7 @@ runs:
echo "New recordings detected, committing and pushing"
git add tests/integration/recordings/
if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
git commit -m "Recordings update from CI (vision)"
else
git commit -m "Recordings update from CI"
fi
git commit -m "Recordings update from CI (test-suite: ${{ inputs.test-suite }})"
git fetch origin ${{ github.ref_name }}
git rebase origin/${{ github.ref_name }}
echo "Rebased successfully"


@ -1,17 +1,17 @@
name: Setup Ollama
description: Start Ollama
inputs:
run-vision-tests:
description: 'Run vision tests: "true" or "false"'
test-suite:
description: 'Test suite to use: base, responses, vision, etc.'
required: false
default: 'false'
default: ''
runs:
using: "composite"
steps:
- name: Start Ollama
shell: bash
run: |
if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
if [ "${{ inputs.test-suite }}" == "vision" ]; then
image="ollama-with-vision-model"
else
image="ollama-with-models"


@ -12,10 +12,10 @@ inputs:
description: 'Provider to setup (ollama or vllm)'
required: true
default: 'ollama'
run-vision-tests:
description: 'Whether to setup provider for vision tests'
test-suite:
description: 'Test suite to use: base, responses, vision, etc.'
required: false
default: 'false'
default: ''
inference-mode:
description: 'Inference mode (record or replay)'
required: true
@ -33,7 +33,7 @@ runs:
if: ${{ inputs.provider == 'ollama' && inputs.inference-mode == 'record' }}
uses: ./.github/actions/setup-ollama
with:
run-vision-tests: ${{ inputs.run-vision-tests }}
test-suite: ${{ inputs.test-suite }}
- name: Setup vllm
if: ${{ inputs.provider == 'vllm' && inputs.inference-mode == 'record' }}


@ -5,10 +5,11 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
| Name | File | Purpose |
| ---- | ---- | ------- |
| Update Changelog | [changelog.yml](changelog.yml) | Creates PR for updating the CHANGELOG.md |
| API Conformance Tests | [conformance.yml](conformance.yml) | Run the API Conformance test suite on the changes. |
| Installer CI | [install-script-ci.yml](install-script-ci.yml) | Test the installation script |
| Integration Auth Tests | [integration-auth-tests.yml](integration-auth-tests.yml) | Run the integration test suite with Kubernetes authentication |
| SqlStore Integration Tests | [integration-sql-store-tests.yml](integration-sql-store-tests.yml) | Run the integration test suite with SqlStore |
| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suite from tests/integration in replay mode |
| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suites from tests/integration in replay mode |
| Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
| Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
| Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |

.github/workflows/conformance.yml (new file)

@ -0,0 +1,57 @@
# API Conformance Tests
# This workflow ensures that API changes maintain backward compatibility and don't break existing integrations
# It runs schema validation and OpenAPI diff checks to catch breaking changes early
name: API Conformance Tests
run-name: Run the API Conformance test suite on the changes.
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
types: [opened, synchronize, reopened]
paths:
- 'llama_stack/**'
- '!llama_stack/ui/**'
- 'tests/**'
- 'uv.lock'
- 'pyproject.toml'
- '.github/workflows/conformance.yml' # This workflow itself
concurrency:
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
# Cancel in-progress runs when new commits are pushed to avoid wasting CI resources
cancel-in-progress: true
jobs:
# Job to check if API schema changes maintain backward compatibility
check-schema-compatibility:
runs-on: ubuntu-latest
steps:
# Using specific version 4.1.7 because 5.0.0 fails when trying to run this locally using `act`
# This ensures consistent behavior between local testing and CI
- name: Checkout PR Code
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
# Checkout the base branch to compare against (usually main)
# This allows us to diff the current changes against the previous state
- name: Checkout Base Branch
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
ref: ${{ github.event.pull_request.base.ref }}
path: 'base'
# Install oasdiff: https://github.com/oasdiff/oasdiff, a tool for detecting breaking changes in OpenAPI specs.
- name: Install oasdiff
run: |
curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh
# Run oasdiff to detect breaking changes in the API specification
# This step will fail if incompatible changes are detected, preventing breaking changes from being merged
- name: Run OpenAPI Breaking Change Diff
run: |
oasdiff breaking --fail-on ERR base/docs/_static/llama-stack-spec.yaml docs/_static/llama-stack-spec.yaml --match-path '^/v1/openai/v1' \
--match-path '^/v1/vector-io' \
--match-path '^/v1/vector-dbs'


@ -1,6 +1,6 @@
name: Integration Tests (Replay)
run-name: Run the integration test suite from tests/integration in replay mode
run-name: Run the integration test suites from tests/integration in replay mode
on:
push:
@ -32,14 +32,6 @@ on:
description: 'Test against a specific provider'
type: string
default: 'ollama'
test-subdirs:
description: 'Comma-separated list of test subdirectories to run'
type: string
default: ''
test-pattern:
description: 'Regex pattern to pass to pytest -k'
type: string
default: ''
concurrency:
# Skip concurrency for pushes to main - each commit should be tested independently
@ -50,7 +42,7 @@ jobs:
run-replay-mode-tests:
runs-on: ubuntu-latest
name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}
name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.test-suite) }}
strategy:
fail-fast: false
@ -61,7 +53,7 @@ jobs:
# Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
run-vision-tests: [true, false]
test-suite: [base, vision]
steps:
- name: Checkout repository
@ -73,15 +65,13 @@ jobs:
python-version: ${{ matrix.python-version }}
client-version: ${{ matrix.client-version }}
provider: ${{ matrix.provider }}
run-vision-tests: ${{ matrix.run-vision-tests }}
test-suite: ${{ matrix.test-suite }}
inference-mode: 'replay'
- name: Run tests
uses: ./.github/actions/run-and-record-tests
with:
test-subdirs: ${{ inputs.test-subdirs }}
test-pattern: ${{ inputs.test-pattern }}
stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
provider: ${{ matrix.provider }}
inference-mode: 'replay'
run-vision-tests: ${{ matrix.run-vision-tests }}
test-suite: ${{ matrix.test-suite }}


@ -10,18 +10,18 @@ run-name: Run the integration test suite from tests/integration
on:
workflow_dispatch:
inputs:
test-subdirs:
description: 'Comma-separated list of test subdirectories to run'
type: string
default: ''
test-provider:
description: 'Test against a specific provider'
type: string
default: 'ollama'
run-vision-tests:
description: 'Whether to run vision tests'
type: boolean
default: false
test-suite:
description: 'Test suite to use: base, responses, vision, etc.'
type: string
default: ''
test-subdirs:
description: 'Comma-separated list of test subdirectories to run; overrides test-suite'
type: string
default: ''
test-pattern:
description: 'Regex pattern to pass to pytest -k'
type: string
@ -38,11 +38,11 @@ jobs:
- name: Echo workflow inputs
run: |
echo "::group::Workflow Inputs"
echo "test-subdirs: ${{ inputs.test-subdirs }}"
echo "test-provider: ${{ inputs.test-provider }}"
echo "run-vision-tests: ${{ inputs.run-vision-tests }}"
echo "test-pattern: ${{ inputs.test-pattern }}"
echo "branch: ${{ github.ref_name }}"
echo "test-provider: ${{ inputs.test-provider }}"
echo "test-suite: ${{ inputs.test-suite }}"
echo "test-subdirs: ${{ inputs.test-subdirs }}"
echo "test-pattern: ${{ inputs.test-pattern }}"
echo "::endgroup::"
- name: Checkout repository
@ -56,15 +56,15 @@ jobs:
python-version: "3.12" # Use single Python version for recording
client-version: "latest"
provider: ${{ inputs.test-provider || 'ollama' }}
run-vision-tests: ${{ inputs.run-vision-tests }}
test-suite: ${{ inputs.test-suite }}
inference-mode: 'record'
- name: Run and record tests
uses: ./.github/actions/run-and-record-tests
with:
test-pattern: ${{ inputs.test-pattern }}
test-subdirs: ${{ inputs.test-subdirs }}
stack-config: 'server:ci-tests' # recording must be done with server since more tests are run
provider: ${{ inputs.test-provider || 'ollama' }}
inference-mode: 'record'
run-vision-tests: ${{ inputs.run-vision-tests }}
test-suite: ${{ inputs.test-suite }}
test-subdirs: ${{ inputs.test-subdirs }}
test-pattern: ${{ inputs.test-pattern }}

.gitignore

@ -26,5 +26,7 @@ venv/
pytest-report.xml
.coverage
.python-version
AGENTS.md
server.log
CLAUDE.md
.claude/


@ -86,7 +86,7 @@ repos:
language: python
pass_filenames: false
require_serial: true
files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
files: ^llama_stack/distributions/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
- id: provider-codegen
name: Provider Codegen
additional_dependencies:


@ -4129,7 +4129,7 @@
"tags": [
"Files"
],
"description": "Upload a file that can be used across various endpoints.\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.",
"description": "Upload a file that can be used across various endpoints.\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file. Expected expires_after[anchor] = \"created_at\", expires_after[seconds] = <int>. Seconds must be between 3600 and 2592000 (1 hour to 30 days).",
"parameters": [],
"requestBody": {
"content": {
@ -4143,11 +4143,33 @@
},
"purpose": {
"$ref": "#/components/schemas/OpenAIFilePurpose"
},
"expires_after_anchor": {
"oneOf": [
{
"type": "string"
},
{
"type": "null"
}
]
},
"expires_after_seconds": {
"oneOf": [
{
"type": "integer"
},
{
"type": "null"
}
]
}
},
"required": [
"file",
"purpose"
"purpose",
"expires_after_anchor",
"expires_after_seconds"
]
}
}


@ -2933,6 +2933,10 @@ paths:
- file: The File object (not file name) to be uploaded.
- purpose: The intended purpose of the uploaded file.
- expires_after: Optional form values describing expiration for the file.
Expected expires_after[anchor] = "created_at", expires_after[seconds] = <int>.
Seconds must be between 3600 and 2592000 (1 hour to 30 days).
parameters: []
requestBody:
content:
@ -2945,9 +2949,19 @@ paths:
format: binary
purpose:
$ref: '#/components/schemas/OpenAIFilePurpose'
expires_after_anchor:
oneOf:
- type: string
- type: 'null'
expires_after_seconds:
oneOf:
- type: integer
- type: 'null'
required:
- file
- purpose
- expires_after_anchor
- expires_after_seconds
required: true
/v1/openai/v1/models:
get:


@ -40,18 +40,15 @@ The system patches OpenAI and Ollama client methods to intercept calls before th
### Storage Architecture
Recordings use a two-tier storage system optimized for both speed and debuggability:
Recordings are stored as JSON files in the recording directory. They are looked up by their request hash.
```
recordings/
├── index.sqlite # Fast lookup by request hash
└── responses/
├── abc123def456.json # Individual response files
└── def789ghi012.json
```
**SQLite index** enables O(log n) hash lookups and metadata queries without loading response bodies.
**JSON files** store complete request/response pairs in human-readable format for debugging.
## Recording Modes
@ -166,8 +163,8 @@ This preserves type safety - when replayed, you get the same Pydantic objects wi
Control recording behavior globally:
```bash
export LLAMA_STACK_TEST_INFERENCE_MODE=replay
export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings
export LLAMA_STACK_TEST_INFERENCE_MODE=replay # this is the default
export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings # default is tests/integration/recordings
pytest tests/integration/
```
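For illustration only (not part of this change), a small Python sketch of inspecting recordings on disk, assuming the defaults mentioned above (JSON files under tests/integration/recordings, one request/response pair per file); the per-file keys printed here are an assumption:

```python
# Hedged sketch: list recorded request/response JSON files on disk.
# The directory follows the default noted above; per-file structure is an assumption.
import json
from pathlib import Path

recording_dir = Path("tests/integration/recordings")
for path in sorted(recording_dir.rglob("*.json")):
    data = json.loads(path.read_text())
    # File names are derived from the request hash used for lookup.
    print(path.name, sorted(data))
```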


@ -3,6 +3,7 @@ image_name: kubernetes-benchmark-demo
apis:
- agents
- inference
- safety
- telemetry
- tool_runtime
- vector_io
@ -30,6 +31,11 @@ providers:
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
@ -95,6 +101,8 @@ models:
- model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []


@ -50,6 +50,7 @@ The following models are available by default:
- `meta/llama-3.2-11b-vision-instruct `
- `meta/llama-3.2-90b-vision-instruct `
- `meta/llama-3.3-70b-instruct `
- `nvidia/vila `
- `nvidia/llama-3.2-nv-embedqa-1b-v2 `
- `nvidia/nv-embedqa-e5-v5 `
- `nvidia/nv-embedqa-mistral-7b-v2 `


@ -18,12 +18,13 @@ embedding_model_id = (
).identifier
embedding_dimension = em.metadata["embedding_dimension"]
_ = client.vector_dbs.register(
vector_db = client.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
provider_id="faiss",
)
vector_db_id = vector_db.identifier
source = "https://www.paulgraham.com/greatwork.html"
print("rag_tool> Ingesting document:", source)
document = RAGDocument(
@ -35,7 +36,7 @@ document = RAGDocument(
client.tool_runtime.rag_tool.insert(
documents=[document],
vector_db_id=vector_db_id,
chunk_size_in_tokens=50,
chunk_size_in_tokens=100,
)
agent = Agent(
client,


@ -15,8 +15,8 @@ AWS Bedrock inference provider for accessing various AI models through AWS's man
| `profile_name` | `str \| None` | No | | The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE |
| `total_max_attempts` | `int \| None` | No | | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS |
| `retry_mode` | `str \| None` | No | | A string representing the type of retries Boto3 will perform.Default use environment variable: AWS_RETRY_MODE |
| `connect_timeout` | `float \| None` | No | 60 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
| `read_timeout` | `float \| None` | No | 60 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. |
| `connect_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
| `read_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. |
| `session_ttl` | `int \| None` | No | 3600 | The time in seconds till a session expires. The default is 3600 seconds (1 hour). |
## Sample Configuration


@ -15,8 +15,8 @@ AWS Bedrock safety provider for content moderation using AWS's safety services.
| `profile_name` | `str \| None` | No | | The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE |
| `total_max_attempts` | `int \| None` | No | | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS |
| `retry_mode` | `str \| None` | No | | A string representing the type of retries Boto3 will perform.Default use environment variable: AWS_RETRY_MODE |
| `connect_timeout` | `float \| None` | No | 60 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
| `read_timeout` | `float \| None` | No | 60 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. |
| `connect_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
| `read_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. |
| `session_ttl` | `int \| None` | No | 3600 | The time in seconds till a session expires. The default is 3600 seconds (1 hour). |
## Sample Configuration


@ -12,6 +12,60 @@ That means you'll get fast and efficient vector retrieval.
- Easy to use
- Fully integrated with Llama Stack
There are three implementations of search available for PGVectorIndex:
1. Vector Search:
- How it works:
- Uses PostgreSQL's vector extension (pgvector) to perform similarity search
- Compares query embeddings against stored embeddings using Cosine distance or other distance metrics
- Eg. SQL query: SELECT document, embedding <=> %s::vector AS distance FROM table ORDER BY distance
- Characteristics:
- Semantic understanding - finds documents similar in meaning even if they don't share keywords
- Works with high-dimensional vector embeddings (typically 768, 1024, or higher dimensions)
- Best for: Finding conceptually related content, handling synonyms, cross-language search
2. Keyword Search
- How it works:
- Uses PostgreSQL's full-text search capabilities with tsvector and ts_rank
- Converts text to searchable tokens using to_tsvector('english', text). Default language is English.
- Eg. SQL query: SELECT document, ts_rank(tokenized_content, plainto_tsquery('english', %s)) AS score
- Characteristics:
- Lexical matching - finds exact keyword matches and variations
- Uses GIN (Generalized Inverted Index) for fast text search performance
- Scoring: Uses PostgreSQL's ts_rank function for relevance scoring
- Best for: Exact term matching, proper names, technical terms, Boolean-style queries
3. Hybrid Search
- How it works:
- Combines both vector and keyword search results
- Runs both searches independently, then merges results using configurable reranking
- Two reranking strategies available:
- Reciprocal Rank Fusion (RRF) - (default: 60.0)
- Weighted Average - (default: 0.5)
- Characteristics:
- Best of both worlds: semantic understanding + exact matching
- Documents appearing in both searches get boosted scores
- Configurable balance between semantic and lexical matching
- Best for: General-purpose search where you want both precision and recall
4. Database Schema
The PGVector implementation stores data optimized for all three search types:
CREATE TABLE vector_store_xxx (
id TEXT PRIMARY KEY,
document JSONB, -- Original document
embedding vector(dimension), -- For vector search
content_text TEXT, -- Raw text content
tokenized_content TSVECTOR -- For keyword search
);
-- Indexes for performance
CREATE INDEX content_gin_idx ON table USING GIN(tokenized_content); -- Keyword search
-- Vector index created automatically by pgvector
## Usage
To use PGVector in your Llama Stack project, follow these steps:
@ -20,6 +74,25 @@ To use PGVector in your Llama Stack project, follow these steps:
2. Configure your Llama Stack project to use pgvector. (e.g. remote::pgvector).
3. Start storing and querying vectors.
## Example: Setting up your environment for PGVector
1. Export env vars:
```bash
export ENABLE_PGVECTOR=true
export PGVECTOR_HOST=localhost
export PGVECTOR_PORT=5432
export PGVECTOR_DB=llamastack
export PGVECTOR_USER=llamastack
export PGVECTOR_PASSWORD=llamastack
```
2. Create DB:
```bash
psql -h localhost -U postgres -c "CREATE ROLE llamastack LOGIN PASSWORD 'llamastack';"
psql -h localhost -U postgres -c "CREATE DATABASE llamastack OWNER llamastack;"
psql -h localhost -U llamastack -d llamastack -c "CREATE EXTENSION IF NOT EXISTS vector;"
```
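To tie the search modes described above to the usage steps, here is a hedged Python sketch (not part of this change) of registering a pgvector-backed vector DB and ingesting a document. The client calls mirror the demo script shown earlier in this diff; the base URL, embedding model id, and the RAGDocument import path are assumptions:

```python
# Hedged sketch: register a pgvector-backed vector DB and ingest a document.
from llama_stack_client import LlamaStackClient, RAGDocument  # import path is an assumption

client = LlamaStackClient(base_url="http://localhost:8321")  # server URL is an assumption

vector_db = client.vector_dbs.register(
    vector_db_id="my-docs",              # illustrative id
    embedding_model="all-MiniLM-L6-v2",  # illustrative embedding model
    embedding_dimension=384,
    provider_id="pgvector",
)

client.tool_runtime.rag_tool.insert(
    documents=[
        RAGDocument(
            document_id="doc-1",
            content="https://www.paulgraham.com/greatwork.html",
            mime_type="text/html",
            metadata={},
        )
    ],
    vector_db_id=vector_db.identifier,
    chunk_size_in_tokens=100,
)
```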
## Installation
You can install PGVector using docker:


@ -17,6 +17,7 @@ Weaviate supports:
- Metadata filtering
- Multi-modal retrieval
## Usage
To use Weaviate in your Llama Stack project, follow these steps:


@ -478,7 +478,6 @@ llama-stack-client scoring_functions list
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
┃ identifier ┃ provider_id ┃ description ┃ type ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
│ basic::bfcl │ basic │ BFCL complex scoring │ scoring_function │
│ basic::docvqa │ basic │ DocVQA Visual Question & Answer scoring function │ scoring_function │
│ basic::equality │ basic │ Returns 1.0 if the input is equal to the target, 0.0 │ scoring_function │
│ │ │ otherwise. │ │


@ -5,10 +5,10 @@
# the root directory of this source tree.
from enum import StrEnum
from typing import Annotated, Literal, Protocol, runtime_checkable
from typing import Annotated, ClassVar, Literal, Protocol, runtime_checkable
from fastapi import File, Form, Response, UploadFile
from pydantic import BaseModel
from pydantic import BaseModel, Field
from llama_stack.apis.common.responses import Order
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@ -49,6 +49,23 @@ class OpenAIFileObject(BaseModel):
purpose: OpenAIFilePurpose
@json_schema_type
class ExpiresAfter(BaseModel):
"""
Control expiration of uploaded files.
Params:
- anchor, must be "created_at"
- seconds, must be int between 3600 and 2592000 (1 hour to 30 days)
"""
MIN: ClassVar[int] = 3600 # 1 hour
MAX: ClassVar[int] = 2592000 # 30 days
anchor: Literal["created_at"]
seconds: int = Field(..., ge=3600, le=2592000)
@json_schema_type
class ListOpenAIFileResponse(BaseModel):
"""
@ -92,6 +109,9 @@ class Files(Protocol):
self,
file: Annotated[UploadFile, File()],
purpose: Annotated[OpenAIFilePurpose, Form()],
expires_after_anchor: Annotated[str | None, Form(alias="expires_after[anchor]")] = None,
expires_after_seconds: Annotated[int | None, Form(alias="expires_after[seconds]")] = None,
# TODO: expires_after is producing strange openapi spec, params are showing up as a required w/ oneOf being null
) -> OpenAIFileObject:
"""
Upload a file that can be used across various endpoints.
@ -99,6 +119,7 @@ class Files(Protocol):
The file upload should be a multipart form request with:
- file: The File object (not file name) to be uploaded.
- purpose: The intended purpose of the uploaded file.
- expires_after: Optional form values describing expiration for the file. Expected expires_after[anchor] = "created_at", expires_after[seconds] = <int>. Seconds must be between 3600 and 2592000 (1 hour to 30 days).
:param file: The uploaded file object containing content and metadata (filename, content_type, etc.).
:param purpose: The intended purpose of the uploaded file (e.g., "assistants", "fine-tune").
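As a rough illustration of the new form fields, a hedged sketch of uploading a file over raw HTTP with httpx. The endpoint path and server URL are assumptions; the field names and the 3600 to 2592000 second range come from the API description above:

```python
# Hedged sketch: upload a file with the new expires_after[*] form fields.
import httpx

with open("notes.txt", "rb") as f:
    resp = httpx.post(
        "http://localhost:8321/v1/openai/v1/files",  # path and host are assumptions
        files={"file": ("notes.txt", f)},
        data={
            "purpose": "assistants",
            "expires_after[anchor]": "created_at",  # only supported anchor
            "expires_after[seconds]": "86400",      # 24 hours, within 1 hour to 30 days
        },
    )
print(resp.json())
```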


@ -284,7 +284,15 @@ async def instantiate_providers(
if provider.provider_id is None:
continue
deps = {a: impls[a] for a in provider.spec.api_dependencies}
try:
deps = {a: impls[a] for a in provider.spec.api_dependencies}
except KeyError as e:
missing_api = e.args[0]
raise RuntimeError(
f"Failed to resolve '{provider.spec.api.value}' provider '{provider.provider_id}' of type '{provider.spec.provider_type}': "
f"required dependency '{missing_api.value}' is not available. "
f"Please add a '{missing_api.value}' provider to your configuration or check if the provider is properly configured."
) from e
for a in provider.spec.optional_api_dependencies:
if a in impls:
deps[a] = impls[a]


@ -527,7 +527,7 @@ class InferenceRouter(Inference):
# Store the response with the ID that will be returned to the client
if self.store:
await self.store.store_chat_completion(response, messages)
asyncio.create_task(self.store.store_chat_completion(response, messages))
if self.telemetry:
metrics = self._construct_metrics(
@ -755,7 +755,7 @@ class InferenceRouter(Inference):
choices_data[idx] = {
"content_parts": [],
"tool_calls_builder": {},
"finish_reason": None,
"finish_reason": "stop",
"logprobs_content_parts": [],
}
current_choice_data = choices_data[idx]
@ -855,4 +855,4 @@ class InferenceRouter(Inference):
object="chat.completion",
)
logger.debug(f"InferenceRouter.completion_response: {final_response}")
await self.store.store_chat_completion(final_response, messages)
asyncio.create_task(self.store.store_chat_completion(final_response, messages))
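The change above replaces an awaited store call with asyncio.create_task, so persisting the chat completion no longer blocks returning the response. A minimal, self-contained sketch of that fire-and-forget pattern (independent of the router code; names here are illustrative):

```python
# Hedged sketch of the fire-and-forget pattern: schedule the storage coroutine
# on the running event loop and return without waiting for it.
import asyncio


async def store_chat_completion(response: dict) -> None:
    await asyncio.sleep(0.1)  # stand-in for a database write
    print("stored", response["id"])


async def handle_request() -> dict:
    response = {"id": "chatcmpl-123", "choices": []}
    task = asyncio.create_task(store_chat_completion(response))  # not awaited here
    print("returning", response["id"], "to the caller")
    await task  # awaited only so this demo exits after the background write finishes
    return response


asyncio.run(handle_request())
```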


@ -52,7 +52,6 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
provider_vector_db_id: str | None = None,
vector_db_name: str | None = None,
) -> VectorDB:
provider_vector_db_id = provider_vector_db_id or vector_db_id
if provider_id is None:
if len(self.impls_by_provider_id) > 0:
provider_id = list(self.impls_by_provider_id.keys())[0]
@ -69,14 +68,33 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
raise ModelTypeError(embedding_model, model.model_type, ModelType.embedding)
if "embedding_dimension" not in model.metadata:
raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
provider = self.impls_by_provider_id[provider_id]
logger.warning(
"VectorDB is being deprecated in future releases in favor of VectorStore. Please migrate your usage accordingly."
)
vector_store = await provider.openai_create_vector_store(
name=vector_db_name or vector_db_id,
embedding_model=embedding_model,
embedding_dimension=model.metadata["embedding_dimension"],
provider_id=provider_id,
provider_vector_db_id=provider_vector_db_id,
)
vector_store_id = vector_store.id
actual_provider_vector_db_id = provider_vector_db_id or vector_store_id
logger.warning(
f"Ignoring vector_db_id {vector_db_id} and using vector_store_id {vector_store_id} instead. Setting VectorDB {vector_db_id} to VectorDB.vector_db_name"
)
vector_db_data = {
"identifier": vector_db_id,
"identifier": vector_store_id,
"type": ResourceType.vector_db.value,
"provider_id": provider_id,
"provider_resource_id": provider_vector_db_id,
"provider_resource_id": actual_provider_vector_db_id,
"embedding_model": embedding_model,
"embedding_dimension": model.metadata["embedding_dimension"],
"vector_db_name": vector_db_name,
"vector_db_name": vector_store.name,
}
vector_db = TypeAdapter(VectorDBWithOwner).validate_python(vector_db_data)
await self.register_object(vector_db)


@ -132,15 +132,17 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationErro
},
)
elif isinstance(exc, ConflictError):
return HTTPException(status_code=409, detail=str(exc))
return HTTPException(status_code=httpx.codes.CONFLICT, detail=str(exc))
elif isinstance(exc, ResourceNotFoundError):
return HTTPException(status_code=404, detail=str(exc))
return HTTPException(status_code=httpx.codes.NOT_FOUND, detail=str(exc))
elif isinstance(exc, ValueError):
return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=f"Invalid value: {str(exc)}")
elif isinstance(exc, BadRequestError):
return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=str(exc))
elif isinstance(exc, PermissionError | AccessDeniedError):
return HTTPException(status_code=httpx.codes.FORBIDDEN, detail=f"Permission denied: {str(exc)}")
elif isinstance(exc, ConnectionError | httpx.ConnectError):
return HTTPException(status_code=httpx.codes.BAD_GATEWAY, detail=str(exc))
elif isinstance(exc, asyncio.TimeoutError | TimeoutError):
return HTTPException(status_code=httpx.codes.GATEWAY_TIMEOUT, detail=f"Operation timed out: {str(exc)}")
elif isinstance(exc, NotImplementedError):


@ -105,12 +105,12 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
method = getattr(impls[api], register_method)
for obj in objects:
logger.debug(f"registering {rsrc.capitalize()} {obj} for provider {obj.provider_id}")
# Do not register models on disabled providers
if hasattr(obj, "provider_id") and (not obj.provider_id or obj.provider_id == "__disabled__"):
logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled provider.")
continue
if hasattr(obj, "provider_id"):
# Do not register models on disabled providers
if not obj.provider_id or obj.provider_id == "__disabled__":
logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled provider.")
continue
logger.debug(f"registering {rsrc.capitalize()} {obj} for provider {obj.provider_id}")
# we want to maintain the type information in arguments to method.
# instead of method(**obj.model_dump()), which may convert a typed attr to a dict,


@ -11,9 +11,7 @@ from ..starter.starter import get_distribution_template as get_starter_distribut
def get_distribution_template() -> DistributionTemplate:
template = get_starter_distribution_template()
name = "ci-tests"
template.name = name
template = get_starter_distribution_template(name="ci-tests")
template.description = "CI tests for Llama Stack"
return template


@ -89,28 +89,28 @@ providers:
config:
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/faiss_store.db
- provider_id: sqlite-vec
provider_type: inline::sqlite-vec
config:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/sqlite_vec.db
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/sqlite_vec_registry.db
- provider_id: ${env.MILVUS_URL:+milvus}
provider_type: inline::milvus
config:
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/ci-tests}/milvus.db
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/milvus_registry.db
- provider_id: ${env.CHROMADB_URL:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:=}
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests/}/chroma_remote_registry.db
- provider_id: ${env.PGVECTOR_DB:+pgvector}
provider_type: remote::pgvector
config:
@ -121,15 +121,15 @@ providers:
password: ${env.PGVECTOR_PASSWORD:=}
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/pgvector_registry.db
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/ci-tests/files}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/files_metadata.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard


@ -134,6 +134,11 @@ models:
provider_id: nvidia
provider_model_id: meta/llama-3.3-70b-instruct
model_type: llm
- metadata: {}
model_id: nvidia/vila
provider_id: nvidia
provider_model_id: nvidia/vila
model_type: llm
- metadata:
embedding_dimension: 2048
context_length: 8192


@ -43,7 +43,7 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo
"openai",
[
ProviderModelEntry(
provider_model_id="openai/gpt-4o",
provider_model_id="gpt-4o",
model_type=ModelType.llm,
)
],
@ -53,7 +53,7 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo
"anthropic",
[
ProviderModelEntry(
provider_model_id="anthropic/claude-3-5-sonnet-latest",
provider_model_id="claude-3-5-sonnet-latest",
model_type=ModelType.llm,
)
],
@ -206,13 +206,6 @@ def get_distribution_template() -> DistributionTemplate:
uri="huggingface://datasets/llamastack/math_500?split=test",
),
),
DatasetInput(
dataset_id="bfcl",
purpose=DatasetPurpose.eval_messages_answer,
source=URIDataSource(
uri="huggingface://datasets/llamastack/bfcl_v3?split=train",
),
),
DatasetInput(
dataset_id="ifeval",
purpose=DatasetPurpose.eval_messages_answer,
@ -250,11 +243,6 @@ def get_distribution_template() -> DistributionTemplate:
dataset_id="math_500",
scoring_functions=["basic::regex_parser_math_response"],
),
BenchmarkInput(
benchmark_id="meta-reference-bfcl",
dataset_id="bfcl",
scoring_functions=["basic::bfcl"],
),
BenchmarkInput(
benchmark_id="meta-reference-ifeval",
dataset_id="ifeval",


@ -136,14 +136,14 @@ inference_store:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/open-benchmark}/inference_store.db
models:
- metadata: {}
model_id: openai/gpt-4o
model_id: gpt-4o
provider_id: openai
provider_model_id: openai/gpt-4o
provider_model_id: gpt-4o
model_type: llm
- metadata: {}
model_id: anthropic/claude-3-5-sonnet-latest
model_id: claude-3-5-sonnet-latest
provider_id: anthropic
provider_model_id: anthropic/claude-3-5-sonnet-latest
provider_model_id: claude-3-5-sonnet-latest
model_type: llm
- metadata: {}
model_id: gemini/gemini-1.5-flash
@ -188,12 +188,6 @@ datasets:
uri: huggingface://datasets/llamastack/math_500?split=test
metadata: {}
dataset_id: math_500
- purpose: eval/messages-answer
source:
type: uri
uri: huggingface://datasets/llamastack/bfcl_v3?split=train
metadata: {}
dataset_id: bfcl
- purpose: eval/messages-answer
source:
type: uri
@ -228,11 +222,6 @@ benchmarks:
- basic::regex_parser_math_response
metadata: {}
benchmark_id: meta-reference-math-500
- dataset_id: bfcl
scoring_functions:
- basic::bfcl
metadata: {}
benchmark_id: meta-reference-bfcl
- dataset_id: ifeval
scoring_functions:
- basic::ifeval


@ -89,28 +89,28 @@ providers:
config:
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/faiss_store.db
- provider_id: sqlite-vec
provider_type: inline::sqlite-vec
config:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec.db
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec_registry.db
- provider_id: ${env.MILVUS_URL:+milvus}
provider_type: inline::milvus
config:
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter-gpu}/milvus.db
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/milvus_registry.db
- provider_id: ${env.CHROMADB_URL:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:=}
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu/}/chroma_remote_registry.db
- provider_id: ${env.PGVECTOR_DB:+pgvector}
provider_type: remote::pgvector
config:
@ -121,15 +121,15 @@ providers:
password: ${env.PGVECTOR_PASSWORD:=}
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/pgvector_registry.db
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter-gpu/files}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/files_metadata.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard


@ -11,9 +11,7 @@ from ..starter.starter import get_distribution_template as get_starter_distribut
def get_distribution_template() -> DistributionTemplate:
template = get_starter_distribution_template()
name = "starter-gpu"
template.name = name
template = get_starter_distribution_template(name="starter-gpu")
template.description = "Quick start template for running Llama Stack with several popular providers. This distribution is intended for GPU-enabled environments."
template.providers["post_training"] = [


@ -99,9 +99,8 @@ def get_remote_inference_providers() -> list[Provider]:
return inference_providers
def get_distribution_template() -> DistributionTemplate:
def get_distribution_template(name: str = "starter") -> DistributionTemplate:
remote_inference_providers = get_remote_inference_providers()
name = "starter"
providers = {
"inference": [BuildProvider(provider_type=p.provider_type, module=p.module) for p in remote_inference_providers]


@ -178,9 +178,9 @@ class ReferenceBatchesImpl(Batches):
# TODO: set expiration time for garbage collection
if endpoint not in ["/v1/chat/completions"]:
if endpoint not in ["/v1/chat/completions", "/v1/completions"]:
raise ValueError(
f"Invalid endpoint: {endpoint}. Supported values: /v1/chat/completions. Code: invalid_value. Param: endpoint",
f"Invalid endpoint: {endpoint}. Supported values: /v1/chat/completions, /v1/completions. Code: invalid_value. Param: endpoint",
)
if completion_window != "24h":
@ -424,13 +424,21 @@ class ReferenceBatchesImpl(Batches):
)
valid = False
for param, expected_type, type_string in [
("model", str, "a string"),
# messages is specific to /v1/chat/completions
# we could skip validating messages here and let inference fail. however,
# that would be a very expensive way to find out messages is wrong.
("messages", list, "an array"), # TODO: allow messages to be a string?
]:
if batch.endpoint == "/v1/chat/completions":
required_params = [
("model", str, "a string"),
# messages is specific to /v1/chat/completions
# we could skip validating messages here and let inference fail. however,
# that would be a very expensive way to find out messages is wrong.
("messages", list, "an array"), # TODO: allow messages to be a string?
]
else: # /v1/completions
required_params = [
("model", str, "a string"),
("prompt", str, "a string"), # TODO: allow prompt to be a list of strings??
]
for param, expected_type, type_string in required_params:
if param not in body:
errors.append(
BatchError(
@ -591,20 +599,37 @@ class ReferenceBatchesImpl(Batches):
try:
# TODO(SECURITY): review body for security issues
request.body["messages"] = [convert_to_openai_message_param(msg) for msg in request.body["messages"]]
chat_response = await self.inference_api.openai_chat_completion(**request.body)
if request.url == "/v1/chat/completions":
request.body["messages"] = [convert_to_openai_message_param(msg) for msg in request.body["messages"]]
chat_response = await self.inference_api.openai_chat_completion(**request.body)
# this is for mypy, we don't allow streaming so we'll get the right type
assert hasattr(chat_response, "model_dump_json"), "Chat response must have model_dump_json method"
return {
"id": request_id,
"custom_id": request.custom_id,
"response": {
"status_code": 200,
"request_id": request_id, # TODO: should this be different?
"body": chat_response.model_dump_json(),
},
}
# this is for mypy, we don't allow streaming so we'll get the right type
assert hasattr(chat_response, "model_dump_json"), "Chat response must have model_dump_json method"
return {
"id": request_id,
"custom_id": request.custom_id,
"response": {
"status_code": 200,
"request_id": request_id, # TODO: should this be different?
"body": chat_response.model_dump_json(),
},
}
else: # /v1/completions
completion_response = await self.inference_api.openai_completion(**request.body)
# this is for mypy, we don't allow streaming so we'll get the right type
assert hasattr(completion_response, "model_dump_json"), (
"Completion response must have model_dump_json method"
)
return {
"id": request_id,
"custom_id": request.custom_id,
"response": {
"status_code": 200,
"request_id": request_id,
"body": completion_response.model_dump_json(),
},
}
except Exception as e:
logger.info(f"Error processing request {request.custom_id} in batch {batch_id}: {e}")
return {
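For context, a hedged illustration (not taken from the repository) of per-line batch request entries for the two endpoints handled above. The required body fields mirror the validation shown earlier in this hunk (model/messages for /v1/chat/completions, model/prompt for /v1/completions); the custom_id values, model id, and method field are assumptions:

```python
# Hedged examples of batch input entries for the two supported endpoints.
chat_request = {
    "custom_id": "req-1",
    "method": "POST",               # assumed, per the OpenAI batch input format
    "url": "/v1/chat/completions",
    "body": {
        "model": "llama3.2:3b",     # illustrative model id
        "messages": [{"role": "user", "content": "Say hello."}],
    },
}

completion_request = {
    "custom_id": "req-2",
    "method": "POST",
    "url": "/v1/completions",
    "body": {
        "model": "llama3.2:3b",
        "prompt": "Say hello.",
    },
}
```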


@ -86,11 +86,16 @@ class LocalfsFilesImpl(Files):
self,
file: Annotated[UploadFile, File()],
purpose: Annotated[OpenAIFilePurpose, Form()],
expires_after_anchor: Annotated[str | None, Form(alias="expires_after[anchor]")] = None,
expires_after_seconds: Annotated[int | None, Form(alias="expires_after[seconds]")] = None,
) -> OpenAIFileObject:
"""Upload a file that can be used across various endpoints."""
if not self.sql_store:
raise RuntimeError("Files provider not initialized")
if expires_after_anchor is not None or expires_after_seconds is not None:
raise NotImplementedError("File expiration is not supported by this provider")
file_id = self._generate_file_id()
file_path = self._get_file_path(file_id)


@ -22,7 +22,6 @@ from llama_stack.providers.utils.common.data_schema_validator import (
)
from .config import BasicScoringConfig
from .scoring_fn.bfcl_scoring_fn import BFCLScoringFn
from .scoring_fn.docvqa_scoring_fn import DocVQAScoringFn
from .scoring_fn.equality_scoring_fn import EqualityScoringFn
from .scoring_fn.ifeval_scoring_fn import IfEvalScoringFn
@ -37,7 +36,6 @@ FIXED_FNS = [
SubsetOfScoringFn,
RegexParserScoringFn,
RegexParserMathResponseScoringFn,
BFCLScoringFn,
IfEvalScoringFn,
DocVQAScoringFn,
]


@ -1,93 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import re
from typing import Any
from llama_stack.apis.scoring import ScoringResultRow
from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
from ..utils.bfcl.ast_parser import decode_ast
from ..utils.bfcl.checker import ast_checker, is_empty_output
from .fn_defs.bfcl import bfcl
def postprocess(x: dict[str, Any], test_category: str) -> dict[str, Any]:
contain_func_call = False
error = None
error_type = None
checker_result = {}
try:
prediction = decode_ast(x["generated_answer"], x["language"]) or ""
contain_func_call = True
# if not is_function_calling_format_output(prediction):
if is_empty_output(prediction):
contain_func_call = False
error = "Did not output in the specified format. Note: the model_result is wrapped in a string to ensure json serializability."
error_type = "ast_decoder:decoder_wrong_output_format"
else:
checker_result = ast_checker(
json.loads(x["function"]),
prediction,
json.loads(x["ground_truth"]),
x["language"],
test_category=test_category,
model_name="",
)
except Exception as e:
prediction = ""
error = f"Invalid syntax. Failed to decode AST. {str(e)}"
error_type = "ast_decoder:decoder_failed"
return {
"prediction": prediction,
"contain_func_call": contain_func_call,
"valid": checker_result.get("valid", False),
"error": error or checker_result.get("error", ""),
"error_type": error_type or checker_result.get("error_type", ""),
}
def gen_valid(x: dict[str, Any]) -> dict[str, float]:
return {"valid": x["valid"]}
def gen_relevance_acc(x: dict[str, Any]) -> dict[str, float]:
# This function serves for both relevance and irrelevance tests, which share the exact opposite logic.
# If `test_category` is "irrelevance", the model is expected to output no function call.
# No function call means either the AST decoding fails (a error message is generated) or the decoded AST does not contain any function call (such as a empty list, `[]`).
# If `test_category` is "relevance", the model is expected to output to a function call, and empty list doesn't count as a function call.
acc = not x["contain_func_call"] if "irrelevance" in x["id"] else x["contain_func_call"]
return {"valid": float(acc)}
class BFCLScoringFn(RegisteredBaseScoringFn):
"""
A scoring_fn for BFCL
"""
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.supported_fn_defs_registry = {
bfcl.identifier: bfcl,
}
async def score_row(
self,
input_row: dict[str, Any],
scoring_fn_identifier: str | None = "bfcl",
scoring_params: ScoringFnParams | None = None,
) -> ScoringResultRow:
test_category = re.sub(r"_[0-9_-]+$", "", input_row["id"])
score_result = postprocess(input_row, test_category)
if test_category in {"irrelevance", "live_relevance", "live_irrelevance"}:
score = gen_relevance_acc(score_result)["valid"]
else:
score = gen_valid(score_result)["valid"]
return {
"score": float(score),
}


@ -1,21 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.common.type_system import NumberType
from llama_stack.apis.scoring_functions import (
AggregationFunctionType,
BasicScoringFnParams,
ScoringFn,
)
bfcl = ScoringFn(
identifier="basic::bfcl",
description="BFCL complex scoring",
return_type=NumberType(),
provider_id="basic",
provider_resource_id="bfcl",
params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.accuracy]),
)


@ -1,296 +0,0 @@
# ruff: noqa
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import ast
from .tree_sitter import get_parser
def parse_java_function_call(source_code):
if not source_code.endswith(";"):
source_code += ";" # Necessary for the parser not to register an error
parser = get_parser("java")
tree = parser.parse(bytes(source_code, "utf8"))
root_node = tree.root_node
if root_node.has_error:
raise Exception("Error parsing java the source code.")
def get_text(node):
"""Returns the text represented by the node."""
return source_code[node.start_byte : node.end_byte]
def traverse_node(node, nested=False):
if node.type == "string_literal":
if nested:
return get_text(node)
# Strip surrounding quotes from string literals
return get_text(node)[1:-1]
elif node.type == "character_literal":
if nested:
return get_text(node)
# Strip surrounding single quotes from character literals
return get_text(node)[1:-1]
"""Traverse the node to collect texts for complex structures."""
if node.type in [
"identifier",
"class_literal",
"type_identifier",
"method_invocation",
]:
return get_text(node)
elif node.type == "array_creation_expression":
# Handle array creation expression specifically
type_node = node.child_by_field_name("type")
value_node = node.child_by_field_name("value")
type_text = traverse_node(type_node, True)
value_text = traverse_node(value_node, True)
return f"new {type_text}[]{value_text}"
elif node.type == "object_creation_expression":
# Handle object creation expression specifically
type_node = node.child_by_field_name("type")
arguments_node = node.child_by_field_name("arguments")
type_text = traverse_node(type_node, True)
if arguments_node:
# Process each argument carefully, avoiding unnecessary punctuation
argument_texts = []
for child in arguments_node.children:
if child.type not in [
",",
"(",
")",
]: # Exclude commas and parentheses
argument_text = traverse_node(child, True)
argument_texts.append(argument_text)
arguments_text = ", ".join(argument_texts)
return f"new {type_text}({arguments_text})"
else:
return f"new {type_text}()"
elif node.type == "set":
# Handling sets specifically
items = [traverse_node(n, True) for n in node.children if n.type not in [",", "set"]]
return "{" + ", ".join(items) + "}"
elif node.child_count > 0:
return "".join(traverse_node(child, True) for child in node.children)
else:
return get_text(node)
def extract_arguments(args_node):
arguments = {}
for child in args_node.children:
if child.type == "assignment_expression":
# For named parameters
name_node, value_node = child.children[0], child.children[2]
name = get_text(name_node)
value = traverse_node(value_node)
if name in arguments:
if not isinstance(arguments[name], list):
arguments[name] = [arguments[name]]
arguments[name].append(value)
else:
arguments[name] = value
# arguments.append({'name': name, 'value': value})
elif child.type in ["identifier", "class_literal", "set"]:
# For unnamed parameters and handling sets
value = traverse_node(child)
if None in arguments:
if not isinstance(arguments[None], list):
arguments[None] = [arguments[None]]
arguments[None].append(value)
else:
arguments[None] = value
return arguments
def traverse(node):
if node.type == "method_invocation":
# Extract the function name and its arguments
method_name = get_text(node.child_by_field_name("name"))
class_name_node = node.child_by_field_name("object")
if class_name_node:
class_name = get_text(class_name_node)
function_name = f"{class_name}.{method_name}"
else:
function_name = method_name
arguments_node = node.child_by_field_name("arguments")
if arguments_node:
arguments = extract_arguments(arguments_node)
for key, value in arguments.items():
if isinstance(value, list):
raise Exception("Error: Multiple arguments with the same name are not supported.")
return [{function_name: arguments}]
else:
for child in node.children:
result = traverse(child)
if result:
return result
result = traverse(root_node)
return result if result else {}
def parse_javascript_function_call(source_code):
if not source_code.endswith(";"):
source_code += ";" # Necessary for the parser not to register an error
parser = get_parser("javascript")
# Parse the source code
tree = parser.parse(bytes(source_code, "utf8"))
root_node = tree.root_node
if root_node.has_error:
raise Exception("Error js parsing the source code.")
# Function to recursively extract argument details
def extract_arguments(node):
args = {}
for child in node.children:
if child.type == "assignment_expression":
# Extract left (name) and right (value) parts of the assignment
name = child.children[0].text.decode("utf-8")
value = child.children[2].text.decode("utf-8")
if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")):
value = value[1:-1] # Trim the quotation marks
if name in args:
if not isinstance(args[name], list):
args[name] = [args[name]]
args[name].append(value)
else:
args[name] = value
elif child.type == "identifier" or child.type == "true":
# Handle non-named arguments and boolean values
value = child.text.decode("utf-8")
if None in args:
if not isinstance(args[None], list):
args[None] = [args[None]]
args[None].append(value)
else:
args[None] = value
return args
# Find the function call and extract its name and arguments
if root_node.type == "program":
for child in root_node.children:
if child.type == "expression_statement":
for sub_child in child.children:
if sub_child.type == "call_expression":
function_name = sub_child.children[0].text.decode("utf8")
arguments_node = sub_child.children[1]
parameters = extract_arguments(arguments_node)
for key, value in parameters.items():
if isinstance(value, list):
raise Exception("Error: Multiple arguments with the same name are not supported.")
result = [{function_name: parameters}]
return result
def ast_parse(input_str, language="Python"):
if language == "Python":
cleaned_input = input_str.strip("[]'")
parsed = ast.parse(cleaned_input, mode="eval")
extracted = []
if isinstance(parsed.body, ast.Call):
extracted.append(resolve_ast_call(parsed.body))
else:
for elem in parsed.body.elts:
extracted.append(resolve_ast_call(elem))
return extracted
elif language == "Java":
return parse_java_function_call(input_str[1:-1]) # Remove the [ and ] from the string
elif language == "JavaScript":
return parse_javascript_function_call(input_str[1:-1])
else:
raise NotImplementedError(f"Unsupported language: {language}")
def resolve_ast_call(elem):
# Handle nested attributes for deeply nested module paths
func_parts = []
func_part = elem.func
while isinstance(func_part, ast.Attribute):
func_parts.append(func_part.attr)
func_part = func_part.value
if isinstance(func_part, ast.Name):
func_parts.append(func_part.id)
func_name = ".".join(reversed(func_parts))
args_dict = {}
# Parse when args are simply passed as an unnamed dictionary arg
for arg in elem.args:
if isinstance(arg, ast.Dict):
for key, value in zip(arg.keys, arg.values):
if isinstance(key, ast.Constant):
arg_name = key.value
output = resolve_ast_by_type(value)
args_dict[arg_name] = output
for arg in elem.keywords:
output = resolve_ast_by_type(arg.value)
args_dict[arg.arg] = output
return {func_name: args_dict}
def resolve_ast_by_type(value):
if isinstance(value, ast.Constant):
if value.value is Ellipsis:
output = "..."
else:
output = value.value
elif isinstance(value, ast.UnaryOp):
output = -value.operand.value
elif isinstance(value, ast.List):
output = [resolve_ast_by_type(v) for v in value.elts]
elif isinstance(value, ast.Dict):
output = {resolve_ast_by_type(k): resolve_ast_by_type(v) for k, v in zip(value.keys, value.values)}
elif isinstance(value, ast.NameConstant): # Added this condition to handle boolean values
output = value.value
elif isinstance(value, ast.BinOp): # Added this condition to handle function calls as arguments
output = eval(ast.unparse(value))
elif isinstance(value, ast.Name):
output = value.id
elif isinstance(value, ast.Call):
if len(value.keywords) == 0:
output = ast.unparse(value)
else:
output = resolve_ast_call(value)
elif isinstance(value, ast.Tuple):
output = tuple(resolve_ast_by_type(v) for v in value.elts)
elif isinstance(value, ast.Lambda):
output = eval(ast.unparse(value.body[0].value))
elif isinstance(value, ast.Ellipsis):
output = "..."
elif isinstance(value, ast.Subscript):
try:
output = ast.unparse(value.body[0].value)
except:
output = ast.unparse(value.value) + "[" + ast.unparse(value.slice) + "]"
else:
raise Exception(f"Unsupported AST type: {type(value)}")
return output
def decode_ast(result, language="Python"):
func = result
func = func.replace("\n", "") # remove new line characters
if not func.startswith("["):
func = "[" + func
if not func.endswith("]"):
func = func + "]"
decoded_output = ast_parse(func, language)
return decoded_output
def decode_execute(result):
func = result
func = func.replace("\n", "") # remove new line characters
if not func.startswith("["):
func = "[" + func
if not func.endswith("]"):
func = func + "]"
decode_output = ast_parse(func)
execution_list = []
for function_call in decode_output:
for key, value in function_call.items():
execution_list.append(f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})")
return execution_list
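# Illustrative usage sketch: the function call below is hypothetical and only
# demonstrates the decoded structure returned by the helpers above.
if __name__ == "__main__":
    print(decode_ast("[get_weather(city='Paris', unit='celsius')]", language="Python"))
    # -> [{'get_weather': {'city': 'Paris', 'unit': 'celsius'}}]
    print(decode_execute("[get_weather(city='Paris')]"))
    # -> ["get_weather(city='Paris')"]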

View file

@@ -1,989 +0,0 @@
# ruff: noqa
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import re
import time
from typing import Any
# Comment out for now until we actually use the rest checker in evals
# import requests # Do not remove this import even though it seems to be unused. It's used in the executable_checker_rest function.
class NoAPIKeyError(Exception):
def __init__(self):
self.message = "Please fill in the API keys in the function_credential_config.json file. If you do not provide the API keys, the executable test category results will be inaccurate."
super().__init__(self.message)
REAL_TIME_MATCH_ALLOWED_DIFFERENCE = 0.2
JAVA_TYPE_CONVERSION = {
"byte": int,
"short": int,
"integer": int,
"float": float,
"double": float,
"long": int,
"boolean": bool,
"char": str,
"Array": list,
"ArrayList": list,
"Set": set,
"HashMap": dict,
"Hashtable": dict,
"Queue": list, # this can be `queue.Queue` as well, for simplicity we check with list
"Stack": list,
"String": str,
"any": str,
}
JS_TYPE_CONVERSION = {
"String": str,
"integer": int,
"float": float,
"Bigint": int,
"Boolean": bool,
"dict": dict,
"array": list,
"any": str,
}
# We switch to conditional import for the following two imports to avoid unnecessary installations.
# User doesn't need to setup the tree-sitter packages if they are not running the test for that language.
# from js_type_converter import js_type_converter
# from java_type_converter import java_type_converter
PYTHON_TYPE_MAPPING = {
"string": str,
"integer": int,
"float": float,
"boolean": bool,
"array": list,
"tuple": list,
"dict": dict,
"any": str,
}
# This is the list of types that we need to recursively check its values
PYTHON_NESTED_TYPE_CHECK_LIST = ["array", "tuple"]
NESTED_CONVERSION_TYPE_LIST = ["Array", "ArrayList", "array"]
#### Helper functions for AST ####
def find_description(func_descriptions, name):
if type(func_descriptions) == list:
for func_description in func_descriptions:
if func_description["name"] == name:
return func_description
return None
else:
# it is a dict, there is only one function
return func_descriptions
def get_possible_answer_type(possible_answer: list):
for answer in possible_answer:
if answer != "": # Optional parameter
return type(answer)
return None
def type_checker(
param: str,
value,
possible_answer: list,
expected_type_description: str,
expected_type_converted,
nested_type_converted,
):
# NOTE: This type checker only supports nested type checking one level deep.
# We didn't implement recursive type checking for nested types, as it's not needed for the current use case and it's very complex.
result: Any = {
"valid": True,
"error": [],
"is_variable": False,
"error_type": "type_error:simple",
}
is_variable = False
# check for the case where a variable is used instead of an actual value.
# use the type in possible_answer as the expected type
possible_answer_type = get_possible_answer_type(possible_answer)
# if possible_answer only contains optional parameters, we can't determine the type
if possible_answer_type != None:
# we are being precise here.
# in fact, possible_answer_type should always be string, as that's how we treat variables in possible_answer
if possible_answer_type != expected_type_converted:
is_variable = True
# value is the same type as in function description
if type(value) == expected_type_converted:
# We don't need to do recursive check for simple types
if nested_type_converted == None:
result["is_variable"] = is_variable
return result
else:
for possible_answer_item in possible_answer:
flag = True # Each parameter should match at least one possible answer type.
# Here, we assume that each item should be the same type. We could also relax it.
if type(possible_answer_item) == list:
for value_item in value:
checker_result = type_checker(
param,
value_item,
possible_answer_item,
str(nested_type_converted),
nested_type_converted,
None,
)
if not checker_result["valid"]:
flag = False
break
if flag:
return {"valid": True, "error": [], "is_variable": is_variable}
result["valid"] = False
result["error"] = [
f"Nested type checking failed for parameter {repr(param)}. Expected outer type {expected_type_description} with inner type {str(nested_type_converted)}. Parameter value: {repr(value)}."
]
result["error_type"] = "type_error:nested"
# value is not as expected, check for the case where a variable is used instead of an actual value
# use the type in possible_answer as the expected type
possible_answer_type = get_possible_answer_type(possible_answer)
# if possible_answer only contains optional parameters, we can't determine the type
if possible_answer_type != None:
# we are being precise here.
# in fact, possible_answer_type should always be string, as that's how we treat variables in possible_answer
if type(value) == possible_answer_type:
result["is_variable"] = True
return result
result["valid"] = False
result["error"].append(
f"Incorrect type for parameter {repr(param)}. Expected type {expected_type_description}, got {type(value).__name__}. Parameter value: {repr(value)}."
)
result["error_type"] = "type_error:simple"
return result
def standardize_string(input_string: str):
# This function standardizes the string by removing all the spaces, ",./-_*^" punctuation, and converting it to lowercase
# It will also convert all the single quotes to double quotes
# This is used to compare the model output with the possible answers
# We don't want to punish the model for answers like April 1, 2024 vs April 1,2024 vs April 1 2024
regex_string = r"[ \,\.\/\-\_\*\^]"
return re.sub(regex_string, "", input_string).lower().replace("'", '"')
def string_checker(param: str, model_output: str, possible_answer: list):
standardize_possible_answer = []
standardize_model_output = standardize_string(model_output)
for i in range(len(possible_answer)):
if type(possible_answer[i]) == str:
standardize_possible_answer.append(standardize_string(possible_answer[i]))
if standardize_model_output not in standardize_possible_answer:
return {
"valid": False,
"error": [
f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}. Case insensitive."
],
"error_type": "value_error:string",
}
return {"valid": True, "error": []}
def list_checker(param: str, model_output: list, possible_answer: list):
# Convert the tuple to a list
standardize_model_output = list(model_output)
# If the element in the list is a string, we need to standardize it
for i in range(len(standardize_model_output)):
if type(standardize_model_output[i]) == str:
standardize_model_output[i] = standardize_string(model_output[i])
standardize_possible_answer: Any = []
# We also need to standardize the possible answers
for i in range(len(possible_answer)):
standardize_possible_answer.append([])
for j in range(len(possible_answer[i])):
if type(possible_answer[i][j]) == str:
standardize_possible_answer[i].append(standardize_string(possible_answer[i][j]))
else:
standardize_possible_answer[i].append(possible_answer[i][j])
if standardize_model_output not in standardize_possible_answer:
return {
"valid": False,
"error": [
f"Invalid value for parameter {repr(param)}: {repr(model_output)}. Expected one of {possible_answer}."
],
"error_type": "value_error:list/tuple",
}
return {"valid": True, "error": []}
def dict_checker(param: str, model_output: dict, possible_answers: list):
# This function works for simple dictionaries, but not dictionaries with nested dictionaries.
# The current dataset only contains simple dictionaries, so this is sufficient.
result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"}
for i in range(len(possible_answers)):
if possible_answers[i] == "":
continue
result = {"valid": False, "error": [], "error_type": "dict_checker:unclear"}
flag = True
possible_answer = possible_answers[i]
# possible_answer is a single dictionary
for key, value in model_output.items():
if key not in possible_answer:
result["valid"] = False
result["error"].append(f"Unexpected dict key parameter: '{key}'.") # type: ignore[attr-defined]
result["error_type"] = "value_error:dict_key"
flag = False
break
standardize_value = value
# If the value is a string, we need to standardize it
if type(value) == str:
standardize_value = standardize_string(value)
# We also need to standardize the possible answers if they are string
standardize_possible_answer = []
for i in range(len(possible_answer[key])):
if type(possible_answer[key][i]) == str:
standardize_possible_answer.append(standardize_string(possible_answer[key][i]))
else:
standardize_possible_answer.append(possible_answer[key][i])
if standardize_value not in standardize_possible_answer:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Invalid value for parameter {repr(key)}: {repr(value)}. Expected one of {standardize_possible_answer}."
)
result["error_type"] = "value_error:dict_value"
flag = False
break
for key, value in possible_answer.items():
if key not in model_output and "" not in value:
result["valid"] = False
result["error"].append(f"Missing dict key parameter: '{key}'.") # type: ignore[attr-defined]
result["error_type"] = "value_error:dict_key"
flag = False
break
if flag:
return {"valid": True, "error": []}
return result
def list_dict_checker(param: str, model_output: list, possible_answers: list):
# This function takes in a list of dictionaries and checks if each dictionary is valid
# The order of the dictionaries in the list must match the order of the possible answers
result = {"valid": False, "error": [], "error_type": "list_dict_checker:unclear"}
for answer_index in range(len(possible_answers)):
flag = True # True means so far, all dictionaries are valid
# Only proceed if the number of dictionaries in the list matches the number of dictionaries in the possible answers
if len(model_output) != len(possible_answers[answer_index]):
result["valid"] = False
result["error"] = ["Wrong number of dictionaries in the list."]
result["error_type"] = "value_error:list_dict_count"
flag = False
continue
for dict_index in range(len(model_output)):
result = dict_checker(
param,
model_output[dict_index],
[possible_answers[answer_index][dict_index]],
)
if not result["valid"]:
flag = False
break
if flag:
return {"valid": True, "error": []}
return result
def simple_function_checker(
func_description: dict,
model_output: dict,
possible_answer: dict,
language: str,
model_name: str,
):
possible_answer = list(possible_answer.values())[0]
# Extract function name and parameters details
func_name = func_description["name"]
param_details = func_description["parameters"]["properties"]
required_params = func_description["parameters"]["required"]
# Initialize a result dictionary
result = {
"valid": True,
"error": [],
"error_type": "simple_function_checker:unclear",
}
# Check if function name matches
if func_name not in model_output:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Function name {repr(func_name)} not found in model output."
)
result["error_type"] = "simple_function_checker:wrong_func_name"
return result
model_params = model_output[func_name]
# Check for required parameters in model output
for param in required_params:
if param not in model_params:
result["valid"] = False
result["error"].append(f"Missing required parameter: {repr(param)}.") # type: ignore[attr-defined]
result["error_type"] = "simple_function_checker:missing_required"
return result
# Validate types and values for each parameter in model output
for param, value in model_params.items():
if param not in param_details or param not in possible_answer:
result["valid"] = False
result["error"].append(f"Unexpected parameter: {repr(param)}.") # type: ignore[attr-defined]
result["error_type"] = "simple_function_checker:unexpected_param"
return result
full_param_details = param_details[param]
expected_type_description = full_param_details["type"] # This is a string
is_variable = False
nested_type_converted = None
if language == "Java":
from evals.utils.bfcl.java_type_converter import java_type_converter
expected_type_converted = JAVA_TYPE_CONVERSION[expected_type_description]
if expected_type_description in JAVA_TYPE_CONVERSION:
if type(value) != str:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}."
)
result["error_type"] = "type_error:java"
return result
if expected_type_description in NESTED_CONVERSION_TYPE_LIST:
nested_type = param_details[param]["items"]["type"]
nested_type_converted = JAVA_TYPE_CONVERSION[nested_type]
value = java_type_converter(value, expected_type_description, nested_type)
else:
value = java_type_converter(value, expected_type_description)
elif language == "JavaScript":
from evals.utils.bfcl.js_type_converter import js_type_converter
expected_type_converted = JS_TYPE_CONVERSION[expected_type_description]
if expected_type_description in JS_TYPE_CONVERSION:
if type(value) != str:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Incorrect type for parameter {repr(param)}. Expected type String, got {type(value).__name__}. Parameter value: {repr(value)}."
)
result["error_type"] = "type_error:js"
return result
if expected_type_description in NESTED_CONVERSION_TYPE_LIST:
nested_type = param_details[param]["items"]["type"]
nested_type_converted = JS_TYPE_CONVERSION[nested_type]
value = js_type_converter(value, expected_type_description, nested_type)
else:
value = js_type_converter(value, expected_type_description)
elif language == "Python":
expected_type_converted = PYTHON_TYPE_MAPPING[expected_type_description]
if expected_type_description in PYTHON_NESTED_TYPE_CHECK_LIST:
nested_type = param_details[param]["items"]["type"]
nested_type_converted = PYTHON_TYPE_MAPPING[nested_type]
# We convert all tuple value to list when the expected type is tuple.
# The conversion is necessary because any tuple in the possible answer would become a list after being processed through json.dump() and json.load().
# This does introduce some false positives (e.g., when the model provides a list value instead of a tuple). We hope to find a better solution in the future.
if expected_type_description == "tuple" and type(value) == tuple:
value = list(value)
# Allow python auto conversion from int to float
if language == "Python" and expected_type_description == "float" and type(value) == int:
value = float(value)
# Type checking
# In fact, we only check for Python here.
# Type check for other languages are handled by the type converter, and so their value (after conversion) is always correct.
type_check_result = type_checker(
param,
value,
possible_answer[param],
expected_type_description,
expected_type_converted,
nested_type_converted,
)
is_variable = type_check_result["is_variable"]
if not type_check_result["valid"]:
return type_check_result
# It doesn't make sense to special handle dictionaries and list of dictionaries if the value is a variable.
# We can just treat the variable as a string and use the normal flow.
if not is_variable:
# Special handle for dictionaries
if expected_type_converted == dict:
result = dict_checker(param, value, possible_answer[param])
if not result["valid"]:
return result
continue
# Special handle for list of dictionaries
elif expected_type_converted == list and nested_type_converted == dict:
result = list_dict_checker(param, value, possible_answer[param])
if not result["valid"]:
return result
continue
# Special handle for strings
elif expected_type_converted == str:
# We don't check for case sensitivity for string, as long as it's not a variable
result = string_checker(param, value, possible_answer[param])
if not result["valid"]:
return result
continue
elif expected_type_converted == list:
result = list_checker(param, value, possible_answer[param])
if not result["valid"]:
return result
continue
# Check if the value is within the possible answers
if value not in possible_answer[param]:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Invalid value for parameter {repr(param)}: {repr(value)}. Expected one of {possible_answer[param]}."
)
result["error_type"] = "value_error:others"
return result
# Check for optional parameters not provided but allowed
for param in possible_answer:
if param not in model_params and "" not in possible_answer[param]:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Optional parameter {repr(param)} not provided and not marked as optional."
)
result["error_type"] = "simple_function_checker:missing_optional"
return result
return result
def parallel_function_checker_enforce_order(
func_descriptions: list,
model_output: list,
possible_answers: dict,
language: str,
model_name: str,
):
if len(model_output) != len(possible_answers):
return {
"valid": False,
"error": ["Wrong number of functions."],
"error_type": "parallel_function_checker_enforce_order:wrong_count",
}
func_name_list = list(possible_answers.keys())
possible_answers_list = []
for key, value in possible_answers.items():
possible_answers_list.append({key: value})
for i in range(len(possible_answers_list)):
func_description = find_description(func_descriptions, func_name_list[i])
result = simple_function_checker(
func_description,
model_output[i],
possible_answers_list[i],
language,
model_name,
)
if not result["valid"]:
return result
return {"valid": True, "error": []}
def parallel_function_checker_no_order(
func_descriptions: list,
model_output: list,
possible_answers: list,
language: str,
model_name: str,
):
if len(model_output) != len(possible_answers):
return {
"valid": False,
"error": ["Wrong number of functions."],
"error_type": "parallel_function_checker_no_order:wrong_count",
}
matched_indices = []
# We go through the possible answers one by one, and eliminate the model output that matches the possible answer
# It must be this way because we need ground truth to fetch the correct function description
for i in range(len(possible_answers)):
# possible_answers[i] is a dictionary with only one key
func_name_expected = list(possible_answers[i].keys())[0]
func_description = find_description(func_descriptions, func_name_expected)
all_errors = []
for index in range(len(model_output)):
if index in matched_indices:
continue
result = simple_function_checker(
func_description,
model_output[index],
possible_answers[i],
language,
model_name,
)
if result["valid"]:
matched_indices.append(index)
break
else:
all_errors.append(
{
f"Model Result Index {index}": {
"sub_error": result["error"],
"sub_error_type": result["error_type"],
"model_output_item": model_output[index],
"possible_answer_item": possible_answers[i],
}
}
)
if not result["valid"]:
considered_indices = [i for i in range(len(model_output)) if i not in matched_indices]
all_errors.insert(
0,
f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.", # type: ignore[arg-type]
)
return {
"valid": False,
"error": all_errors,
"error_type": "parallel_function_checker_no_order:cannot_find_match",
}
return {"valid": True, "error": []}
def multiple_function_checker(
func_descriptions: list,
model_output: list,
possible_answers: list,
language: str,
model_name: str,
):
if len(model_output) != len(possible_answers):
return {
"valid": False,
"error": ["Wrong number of functions."],
"error_type": "multiple_function_checker:wrong_count",
}
# possible_answers is a list of only one dictionary with only one key
func_name_expected = list(possible_answers[0].keys())[0]
func_description = find_description(func_descriptions, func_name_expected)
return simple_function_checker(
func_description,
model_output[0],
possible_answers[0],
language,
model_name,
)
def patten_matcher(exec_output, expected_result, function_call, is_sanity_check):
result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"}
if type(exec_output) != type(expected_result):
return {
"valid": False,
"error": [
f"Wrong execution result type for {repr(function_call)}. Expected type: {type(expected_result)}, but got: {type(exec_output)}."
],
"error_type": "executable_checker:wrong_result_type",
"model_executed_output": exec_output,
}
if type(exec_output) == dict:
# We loosen the requirement for the sanity check as the expected result used in the sanity check might not be the most up-to-date one.
# This happens when the key is a timestamp or a random number.
if is_sanity_check:
if len(exec_output) != len(expected_result):
return {
"valid": False,
"error": [
f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}."
],
"error_type": "executable_checker:wrong_result_type:dict_length",
"model_executed_output": exec_output,
}
else:
return result
for key, value in expected_result.items():
if key not in exec_output:
return {
"valid": False,
"error": [
f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not found in the model output."
],
"error_type": "executable_checker:wrong_result_type:dict_key_not_found",
"model_executed_output": exec_output,
}
for key, value in exec_output.items():
if key not in expected_result:
return {
"valid": False,
"error": [
f"Wrong execution result pattern for {repr(function_call)}. Expect type Dict, but key {repr(key)} not expected in the model output."
],
"error_type": "executable_checker:wrong_result_type:dict_extra_key",
"model_executed_output": exec_output,
}
if type(exec_output) == list:
if len(exec_output) != len(expected_result):
return {
"valid": False,
"error": [
f"Wrong execution result pattern for {repr(function_call)}. Expect type list, but wrong number of elements in the output. Expected length: {len(expected_result)}, but got: {len(exec_output)}."
],
"error_type": "executable_checker:wrong_result_type:list_length",
"model_executed_output": exec_output,
}
return result
#### Helper functions for Exec ####
def executable_checker_simple(
function_call: str,
expected_result,
expected_result_type: str,
is_sanity_check=False,
):
result = {"valid": True, "error": [], "error_type": "executable_checker:unclear"}
exec_dict: Any = {}
try:
exec(
"from executable_python_function import *" + "\nresult=" + function_call,
exec_dict,
)
exec_output = exec_dict["result"]
except NoAPIKeyError as e:
raise e
except Exception as e:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Error in execution: {repr(function_call)}. Error: {str(e)}"
)
result["error_type"] = "executable_checker:execution_error"
return result
# We need to special handle the case where the execution result is a tuple and convert it to a list
# Because when json is stored, the tuple is converted to a list, and so the expected result is a list when loaded from json
if isinstance(exec_output, tuple):
exec_output = list(exec_output)
if expected_result_type == "exact_match":
if exec_output != expected_result:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}."
)
result["error_type"] = "executable_checker:wrong_result"
result["model_executed_output"] = exec_output
return result
elif expected_result_type == "real_time_match":
# Allow for a 20% (REAL_TIME_MATCH_ALLOWED_DIFFERENCE) relative difference
if (type(expected_result) == float or type(expected_result) == int) and (
type(exec_output) == float or type(exec_output) == int
):
if not (
expected_result * (1 - REAL_TIME_MATCH_ALLOWED_DIFFERENCE)
<= exec_output
<= expected_result * (1 + REAL_TIME_MATCH_ALLOWED_DIFFERENCE)
):
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. {REAL_TIME_MATCH_ALLOWED_DIFFERENCE * 100}% difference allowed."
)
result["error_type"] = "executable_checker:wrong_result_real_time"
result["model_executed_output"] = exec_output
return result
else:
result["valid"] = False
result["error"].append( # type: ignore[attr-defined]
f"Wrong execution result for {repr(function_call)}. Expected: {expected_result}, but got: {exec_output}. Type needs to be float or int for real time match criteria."
)
result["error_type"] = "executable_checker:wrong_result_real_time"
result["model_executed_output"] = exec_output
return result
else:
# structural match
pattern_match_result = patten_matcher(exec_output, expected_result, function_call, is_sanity_check)
if not pattern_match_result["valid"]:
return pattern_match_result
return result
def executable_checker_parallel_no_order(
decoded_result: list, expected_exec_result: list, expected_exec_result_type: list
):
if len(decoded_result) != len(expected_exec_result):
return {
"valid": False,
"error": [
f"Wrong number of functions provided. Expected {len(expected_exec_result)}, but got {len(decoded_result)}."
],
"error_type": "value_error:exec_result_count",
}
matched_indices = []
for i in range(len(expected_exec_result)):
all_errors = []
for index in range(len(decoded_result)):
if index in matched_indices:
continue
result = executable_checker_simple(
decoded_result[index],
expected_exec_result[i],
expected_exec_result_type[i],
False,
)
if result["valid"]:
matched_indices.append(index)
break
else:
all_errors.append(
{
f"Model Result Index {index}": {
"sub_error": result["error"],
"sub_error_type": result["error_type"],
"model_executed_output": (
result["model_executed_output"] if "model_executed_output" in result else None
),
}
}
)
if not result["valid"]:
considered_indices = [i for i in range(len(decoded_result)) if i not in matched_indices]
all_errors.insert(
0,
f"Could not find a matching function among index {considered_indices} of model output for index {i} of possible answers.", # type: ignore[arg-type]
)
return {
"valid": False,
"error": all_errors,
"error_type": "executable_checker:cannot_find_match",
}
return {"valid": True, "error": [], "error_type": "executable_checker:unclear"}
#### Main function ####
def executable_checker_rest(func_call, idx):
# Move this here for now to avoid needing to read this file / fix paths to be relative to dataset_dir. Fix when it's actually needed / used.
EVAL_GROUND_TRUTH_PATH = "/mnt/wsfuse/fair_llm_v2/datasets/eval/bfcl/rest-eval-response_v5.jsonl" # Ground truth file for v5 for rest execution
with open(EVAL_GROUND_TRUTH_PATH, "r") as f:
EVAL_GROUND_TRUTH = f.readlines()
if "https://geocode.maps.co" in func_call:
time.sleep(2)
if "requests_get" in func_call:
func_call = func_call.replace("requests_get", "requests.get")
try:
response = eval(func_call)
except Exception as e:
return {
"valid": False,
"error": [f"Execution failed. {str(e)}"],
"error_type": "executable_checker_rest:execution_error",
}
try:
if response.status_code == 200:
eval_GT_json = json.loads(EVAL_GROUND_TRUTH[idx])
try:
if isinstance(eval_GT_json, dict):
if isinstance(response.json(), dict):
if set(eval_GT_json.keys()) == set(response.json().keys()):
return {"valid": True, "error": [], "error_type": ""}
return {
"valid": False,
"error": ["Key inconsistency"],
"error_type": "executable_checker_rest:wrong_key",
}
return {
"valid": False,
"error": [f"Expected dictionary, but got {type(response.json())}"],
"error_type": "executable_checker_rest:wrong_type",
}
elif isinstance(eval_GT_json, list):
if isinstance(response.json(), list):
if len(eval_GT_json) != len(response.json()):
return {
"valid": False,
"error": [f"Response list length inconsistency."],
"error_type": "value_error:exec_result_rest_count",
}
else:
for i in range(len(eval_GT_json)):
if set(eval_GT_json[i].keys()) != set(response.json()[i].keys()):
return {
"valid": False,
"error": [f"Key inconsistency"],
"error_type": "executable_checker_rest:wrong_key",
}
return {"valid": True, "error": []}
else:
return {
"valid": False,
"error": [f"Expected list, but got {type(response.json())}"],
"error_type": "executable_checker_rest:wrong_type",
}
return {
"valid": False,
"error": [f"Expected dict or list, but got {type(response.json())}"],
"error_type": "executable_checker_rest:wrong_type",
}
except Exception as e:
return {
"valid": False,
"error": [
f"Error in execution and type checking. Status code: {response.status_code}. Error: {str(e)}"
],
"error_type": "executable_checker_rest:response_format_error",
}
else:
return {
"valid": False,
"error": [f"Execution result status code is not 200, got {response.status_code}"],
"error_type": "executable_checker_rest:wrong_status_code",
}
except Exception as e:
return {
"valid": False,
"error": [f"Cannot get status code of the response. Error: {str(e)}"],
"error_type": "executable_checker_rest:cannot_get_status_code",
}
def ast_checker(func_description, model_output, possible_answer, language, test_category, model_name):
if "parallel" in test_category:
return parallel_function_checker_no_order(func_description, model_output, possible_answer, language, model_name)
elif "multiple" in test_category:
return multiple_function_checker(func_description, model_output, possible_answer, language, model_name)
else:
if len(model_output) != 1:
return {
"valid": False,
"error": ["Wrong number of functions."],
"error_type": "simple_function_checker:wrong_count",
}
return simple_function_checker(
func_description[0],
model_output[0],
possible_answer[0],
language,
model_name,
)
def exec_checker(decoded_result: list, func_description: dict, test_category: str):
if "multiple" in test_category or "parallel" in test_category:
return executable_checker_parallel_no_order(
decoded_result,
func_description["execution_result"],
func_description["execution_result_type"],
)
else:
if len(decoded_result) != 1:
return {
"valid": False,
"error": ["Wrong number of functions."],
"error_type": "simple_exec_checker:wrong_count",
}
return executable_checker_simple(
decoded_result[0],
func_description["execution_result"][0],
func_description["execution_result_type"][0],
False,
)
def is_empty_output(decoded_output):
# This function is a patch to the ast decoder for relevance detection
# Sometimes the ast decoder will parse successfully, but the input doesn't really have a function call
# [], [{}], and anything that is not in function calling format is considered empty (and thus should be marked as correct)
if not is_function_calling_format_output(decoded_output):
return True
if len(decoded_output) == 0:
return True
if len(decoded_output) == 1 and len(decoded_output[0]) == 0:
return True
def is_function_calling_format_output(decoded_output):
# Ensure the output is a list of dictionaries
if type(decoded_output) == list:
for item in decoded_output:
if type(item) != dict:
return False
return True
return False

View file

@@ -1,40 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Tree-sitter changes its API with unfortunate frequency. Modules that need it should
import it from here so that we can centrally manage things as necessary.
"""
# These currently work with tree-sitter 0.23.0
# NOTE: Don't import tree-sitter or any of the language modules in the main module
# because not all environments have them. Import lazily inside functions where needed.
import importlib
import typing
if typing.TYPE_CHECKING:
import tree_sitter
def get_language(language: str) -> "tree_sitter.Language":
import tree_sitter
language_module_name = f"tree_sitter_{language}"
try:
language_module = importlib.import_module(language_module_name)
except ModuleNotFoundError as exc:
raise ValueError(
f"Language {language} is not found. Please install the tree-sitter-{language} package."
) from exc
return tree_sitter.Language(language_module.language())
def get_parser(language: str, **kwargs) -> "tree_sitter.Parser":
import tree_sitter
lang = get_language(language)
return tree_sitter.Parser(lang, **kwargs)
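# Illustrative usage sketch: assumes the tree-sitter and tree-sitter-java
# packages are installed; the Java snippet is made up for demonstration.
if __name__ == "__main__":
    parser = get_parser("java")
    tree = parser.parse(b"System.out.println(1);")
    print(tree.root_node.type)  # a valid Java snippet parses to a "program" root node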

View file

@@ -14,6 +14,6 @@ from .config import RagToolRuntimeConfig
async def get_provider_impl(config: RagToolRuntimeConfig, deps: dict[Api, Any]):
from .memory import MemoryToolRuntimeImpl
impl = MemoryToolRuntimeImpl(config, deps[Api.vector_io], deps[Api.inference])
impl = MemoryToolRuntimeImpl(config, deps[Api.vector_io], deps[Api.inference], deps[Api.files])
await impl.initialize()
return impl

View file

@@ -5,10 +5,15 @@
# the root directory of this source tree.
import asyncio
import base64
import io
import mimetypes
import secrets
import string
from typing import Any
import httpx
from fastapi import UploadFile
from pydantic import TypeAdapter
from llama_stack.apis.common.content_types import (
@@ -17,6 +22,7 @@ from llama_stack.apis.common.content_types import (
InterleavedContentItem,
TextContentItem,
)
from llama_stack.apis.files import Files, OpenAIFilePurpose
from llama_stack.apis.inference import Inference
from llama_stack.apis.tools import (
ListToolDefsResponse,
@@ -30,13 +36,18 @@ from llama_stack.apis.tools import (
ToolParameter,
ToolRuntime,
)
from llama_stack.apis.vector_io import QueryChunksResponse, VectorIO
from llama_stack.apis.vector_io import (
QueryChunksResponse,
VectorIO,
VectorStoreChunkingStrategyStatic,
VectorStoreChunkingStrategyStaticConfig,
)
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
from llama_stack.providers.utils.memory.vector_store import (
content_from_doc,
make_overlapped_chunks,
parse_data_url,
)
from .config import RagToolRuntimeConfig
@@ -55,10 +66,12 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
config: RagToolRuntimeConfig,
vector_io_api: VectorIO,
inference_api: Inference,
files_api: Files,
):
self.config = config
self.vector_io_api = vector_io_api
self.inference_api = inference_api
self.files_api = files_api
async def initialize(self):
pass
@@ -78,27 +91,50 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
vector_db_id: str,
chunk_size_in_tokens: int = 512,
) -> None:
chunks = []
if not documents:
return
for doc in documents:
content = await content_from_doc(doc)
# TODO: we should add enrichment here as URLs won't be added to the metadata by default
chunks.extend(
make_overlapped_chunks(
doc.document_id,
content,
chunk_size_in_tokens,
chunk_size_in_tokens // 4,
doc.metadata,
if isinstance(doc.content, URL):
if doc.content.uri.startswith("data:"):
parts = parse_data_url(doc.content.uri)
file_data = base64.b64decode(parts["data"]) if parts["is_base64"] else parts["data"].encode()
mime_type = parts["mimetype"]
else:
async with httpx.AsyncClient() as client:
response = await client.get(doc.content.uri)
file_data = response.content
mime_type = doc.mime_type or response.headers.get("content-type", "application/octet-stream")
else:
content_str = await content_from_doc(doc)
file_data = content_str.encode("utf-8")
mime_type = doc.mime_type or "text/plain"
file_extension = mimetypes.guess_extension(mime_type) or ".txt"
filename = doc.metadata.get("filename", f"{doc.document_id}{file_extension}")
file_obj = io.BytesIO(file_data)
file_obj.name = filename
upload_file = UploadFile(file=file_obj, filename=filename)
created_file = await self.files_api.openai_upload_file(
file=upload_file, purpose=OpenAIFilePurpose.ASSISTANTS
)
chunking_strategy = VectorStoreChunkingStrategyStatic(
static=VectorStoreChunkingStrategyStaticConfig(
max_chunk_size_tokens=chunk_size_in_tokens,
chunk_overlap_tokens=chunk_size_in_tokens // 4,
)
)
if not chunks:
return
await self.vector_io_api.insert_chunks(
chunks=chunks,
vector_db_id=vector_db_id,
)
await self.vector_io_api.openai_attach_file_to_vector_store(
vector_store_id=vector_db_id,
file_id=created_file.id,
attributes=doc.metadata,
chunking_strategy=chunking_strategy,
)
async def query(
self,

View file

@@ -30,11 +30,11 @@ from llama_stack.providers.utils.kvstore.api import KVStore
from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
from llama_stack.providers.utils.memory.vector_store import (
RERANKER_TYPE_RRF,
RERANKER_TYPE_WEIGHTED,
ChunkForDeletion,
EmbeddingIndex,
VectorDBWithIndex,
)
from llama_stack.providers.utils.vector_io.vector_utils import WeightedInMemoryAggregator
logger = get_logger(name=__name__, category="vector_io")
@@ -66,59 +66,6 @@ def _create_sqlite_connection(db_path):
return connection
def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
"""Normalize scores to [0,1] range using min-max normalization."""
if not scores:
return {}
min_score = min(scores.values())
max_score = max(scores.values())
score_range = max_score - min_score
if score_range > 0:
return {doc_id: (score - min_score) / score_range for doc_id, score in scores.items()}
return dict.fromkeys(scores, 1.0)
def _weighted_rerank(
vector_scores: dict[str, float],
keyword_scores: dict[str, float],
alpha: float = 0.5,
) -> dict[str, float]:
"""ReRanker that uses weighted average of scores."""
all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
normalized_vector_scores = _normalize_scores(vector_scores)
normalized_keyword_scores = _normalize_scores(keyword_scores)
return {
doc_id: (alpha * normalized_keyword_scores.get(doc_id, 0.0))
+ ((1 - alpha) * normalized_vector_scores.get(doc_id, 0.0))
for doc_id in all_ids
}
def _rrf_rerank(
vector_scores: dict[str, float],
keyword_scores: dict[str, float],
impact_factor: float = 60.0,
) -> dict[str, float]:
"""ReRanker that uses Reciprocal Rank Fusion."""
# Convert scores to ranks
vector_ranks = {
doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(vector_scores.items(), key=lambda x: x[1], reverse=True))
}
keyword_ranks = {
doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True))
}
all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
rrf_scores = {}
for doc_id in all_ids:
vector_rank = vector_ranks.get(doc_id, float("inf"))
keyword_rank = keyword_ranks.get(doc_id, float("inf"))
# RRF formula: score = 1/(k + r) where k is impact_factor and r is the rank
rrf_scores[doc_id] = (1.0 / (impact_factor + vector_rank)) + (1.0 / (impact_factor + keyword_rank))
return rrf_scores
def _make_sql_identifier(name: str) -> str:
return re.sub(r"[^a-zA-Z0-9_]", "_", name)
@@ -398,14 +345,10 @@ class SQLiteVecIndex(EmbeddingIndex):
for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False)
}
# Combine scores using the specified reranker
if reranker_type == RERANKER_TYPE_WEIGHTED:
alpha = reranker_params.get("alpha", 0.5)
combined_scores = _weighted_rerank(vector_scores, keyword_scores, alpha)
else:
# Default to RRF for None, RRF, or any unknown types
impact_factor = reranker_params.get("impact_factor", 60.0)
combined_scores = _rrf_rerank(vector_scores, keyword_scores, impact_factor)
# Combine scores using the reranking utility
combined_scores = WeightedInMemoryAggregator.combine_search_results(
vector_scores, keyword_scores, reranker_type, reranker_params
)
# Sort by combined score and get top k results
sorted_items = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)

View file

@@ -116,7 +116,7 @@ def available_providers() -> list[ProviderSpec]:
adapter=AdapterSpec(
adapter_type="fireworks",
pip_packages=[
"fireworks-ai",
"fireworks-ai<=0.17.16",
],
module="llama_stack.providers.remote.inference.fireworks",
config_class="llama_stack.providers.remote.inference.fireworks.FireworksImplConfig",
@@ -207,7 +207,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference,
adapter=AdapterSpec(
adapter_type="gemini",
pip_packages=["litellm"],
pip_packages=["litellm", "openai"],
module="llama_stack.providers.remote.inference.gemini",
config_class="llama_stack.providers.remote.inference.gemini.GeminiConfig",
provider_data_validator="llama_stack.providers.remote.inference.gemini.config.GeminiProviderDataValidator",
@@ -248,7 +248,7 @@ Available Models:
api=Api.inference,
adapter=AdapterSpec(
adapter_type="groq",
pip_packages=["litellm"],
pip_packages=["litellm", "openai"],
module="llama_stack.providers.remote.inference.groq",
config_class="llama_stack.providers.remote.inference.groq.GroqConfig",
provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator",
@@ -270,7 +270,7 @@ Available Models:
api=Api.inference,
adapter=AdapterSpec(
adapter_type="sambanova",
pip_packages=["litellm"],
pip_packages=["litellm", "openai"],
module="llama_stack.providers.remote.inference.sambanova",
config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig",
provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator",
@@ -292,7 +292,7 @@ Available Models:
api=Api.inference,
adapter=AdapterSpec(
adapter_type="watsonx",
pip_packages=["ibm_watson_machine_learning"],
pip_packages=["ibm_watsonx_ai"],
module="llama_stack.providers.remote.inference.watsonx",
config_class="llama_stack.providers.remote.inference.watsonx.WatsonXConfig",
provider_data_validator="llama_stack.providers.remote.inference.watsonx.WatsonXProviderDataValidator",

View file

@@ -32,7 +32,7 @@ def available_providers() -> list[ProviderSpec]:
],
module="llama_stack.providers.inline.tool_runtime.rag",
config_class="llama_stack.providers.inline.tool_runtime.rag.config.RagToolRuntimeConfig",
api_dependencies=[Api.vector_io, Api.inference],
api_dependencies=[Api.vector_io, Api.inference, Api.files],
description="RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search.",
),
remote_provider_spec(

View file

@@ -404,6 +404,60 @@ That means you'll get fast and efficient vector retrieval.
- Easy to use
- Fully integrated with Llama Stack
There are three implementations of search available for PGVectorIndex:
1. Vector Search:
- How it works:
- Uses PostgreSQL's vector extension (pgvector) to perform similarity search
- Compares query embeddings against stored embeddings using Cosine distance or other distance metrics
- E.g. SQL query: SELECT document, embedding <=> %s::vector AS distance FROM table ORDER BY distance
- Characteristics:
- Semantic understanding - finds documents similar in meaning even if they don't share keywords
- Works with high-dimensional vector embeddings (typically 768, 1024, or higher dimensions)
- Best for: Finding conceptually related content, handling synonyms, cross-language search
2. Keyword Search
- How it works:
- Uses PostgreSQL's full-text search capabilities with tsvector and ts_rank
- Converts text to searchable tokens using to_tsvector('english', text). Default language is English.
- E.g. SQL query: SELECT document, ts_rank(tokenized_content, plainto_tsquery('english', %s)) AS score
- Characteristics:
- Lexical matching - finds exact keyword matches and variations
- Uses GIN (Generalized Inverted Index) for fast text search performance
- Scoring: Uses PostgreSQL's ts_rank function for relevance scoring
- Best for: Exact term matching, proper names, technical terms, Boolean-style queries
3. Hybrid Search
- How it works:
- Combines both vector and keyword search results
- Runs both searches independently, then merges results using configurable reranking
- Two reranking strategies available (a short Python sketch of both follows the schema section below):
- Reciprocal Rank Fusion (RRF) - rank-based fusion (default impact factor: 60.0)
- Weighted Average - normalized score blend (default alpha: 0.5)
- Characteristics:
- Best of both worlds: semantic understanding + exact matching
- Documents appearing in both searches get boosted scores
- Configurable balance between semantic and lexical matching
- Best for: General-purpose search where you want both precision and recall
4. Database Schema
The PGVector implementation stores data optimized for all three search types:
CREATE TABLE vector_store_xxx (
id TEXT PRIMARY KEY,
document JSONB, -- Original document
embedding vector(dimension), -- For vector search
content_text TEXT, -- Raw text content
tokenized_content TSVECTOR -- For keyword search
);
-- Indexes for performance
CREATE INDEX content_gin_idx ON table USING GIN(tokenized_content); -- Keyword search
-- Vector index created automatically by pgvector
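For reference, below is a minimal Python sketch of how the two hybrid-search reranking strategies described above can combine per-document scores. The function names are illustrative only (they are not the provider's API), and score normalization is omitted for brevity.
```python
def rrf_combine(vector_scores: dict[str, float], keyword_scores: dict[str, float], impact_factor: float = 60.0) -> dict[str, float]:
    """Reciprocal Rank Fusion: each result list contributes 1 / (impact_factor + rank)."""
    vector_ranks = {doc: i + 1 for i, (doc, _) in enumerate(sorted(vector_scores.items(), key=lambda x: x[1], reverse=True))}
    keyword_ranks = {doc: i + 1 for i, (doc, _) in enumerate(sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True))}
    return {
        doc: 1.0 / (impact_factor + vector_ranks.get(doc, float("inf")))
        + 1.0 / (impact_factor + keyword_ranks.get(doc, float("inf")))
        for doc in set(vector_scores) | set(keyword_scores)
    }


def weighted_combine(vector_scores: dict[str, float], keyword_scores: dict[str, float], alpha: float = 0.5) -> dict[str, float]:
    """Weighted average: alpha weights the keyword score, (1 - alpha) the vector score."""
    return {
        doc: alpha * keyword_scores.get(doc, 0.0) + (1 - alpha) * vector_scores.get(doc, 0.0)
        for doc in set(vector_scores) | set(keyword_scores)
    }
```
Documents that score well in both searches receive the largest combined scores under either strategy.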
## Usage
To use PGVector in your Llama Stack project, follow these steps:
@@ -412,6 +466,25 @@ To use PGVector in your Llama Stack project, follow these steps:
2. Configure your Llama Stack project to use pgvector. (e.g. remote::pgvector).
3. Start storing and querying vectors.
## This is an example of how you can set up your environment for using PGVector
1. Export env vars:
```bash
export ENABLE_PGVECTOR=true
export PGVECTOR_HOST=localhost
export PGVECTOR_PORT=5432
export PGVECTOR_DB=llamastack
export PGVECTOR_USER=llamastack
export PGVECTOR_PASSWORD=llamastack
```
2. Create DB:
```bash
psql -h localhost -U postgres -c "CREATE ROLE llamastack LOGIN PASSWORD 'llamastack';"
psql -h localhost -U postgres -c "CREATE DATABASE llamastack OWNER llamastack;"
psql -h localhost -U llamastack -d llamastack -c "CREATE EXTENSION IF NOT EXISTS vector;"
```
## Installation
You can install PGVector using docker:
@@ -449,6 +522,7 @@ Weaviate supports:
- Metadata filtering
- Multi-modal retrieval
## Usage
To use Weaviate in your Llama Stack project, follow these steps:

View file

@@ -6,15 +6,14 @@
from typing import Any
from llama_stack.core.datatypes import Api
from llama_stack.core.datatypes import AccessRule, Api
from .config import S3FilesImplConfig
async def get_adapter_impl(config: S3FilesImplConfig, deps: dict[Api, Any]):
async def get_adapter_impl(config: S3FilesImplConfig, deps: dict[Api, Any], policy: list[AccessRule] | None = None):
from .files import S3FilesImpl
# TODO: authorization policies and user separation
impl = S3FilesImpl(config)
impl = S3FilesImpl(config, policy or [])
await impl.initialize()
return impl

View file

@@ -4,9 +4,9 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import time
import uuid
from typing import Annotated
from datetime import UTC, datetime
from typing import Annotated, Any
import boto3
from botocore.exceptions import BotoCoreError, ClientError, NoCredentialsError
@@ -15,14 +15,17 @@ from fastapi import File, Form, Response, UploadFile
from llama_stack.apis.common.errors import ResourceNotFoundError
from llama_stack.apis.common.responses import Order
from llama_stack.apis.files import (
ExpiresAfter,
Files,
ListOpenAIFileResponse,
OpenAIFileDeleteResponse,
OpenAIFileObject,
OpenAIFilePurpose,
)
from llama_stack.core.datatypes import AccessRule
from llama_stack.providers.utils.sqlstore.api import ColumnDefinition, ColumnType
from llama_stack.providers.utils.sqlstore.sqlstore import SqlStore, sqlstore_impl
from llama_stack.providers.utils.sqlstore.authorized_sqlstore import AuthorizedSqlStore
from llama_stack.providers.utils.sqlstore.sqlstore import sqlstore_impl
from .config import S3FilesImplConfig
@@ -83,22 +86,85 @@ async def _create_bucket_if_not_exists(client: boto3.client, config: S3FilesImpl
raise RuntimeError(f"Failed to access S3 bucket '{config.bucket_name}': {e}") from e
def _make_file_object(
*,
id: str,
filename: str,
purpose: str,
bytes: int,
created_at: int,
expires_at: int,
**kwargs: Any, # here to ignore any additional fields, e.g. extra fields from AuthorizedSqlStore
) -> OpenAIFileObject:
"""
Construct an OpenAIFileObject and normalize expires_at.
If expires_at is greater than the max, we treat it as no-expiration and
return None for expires_at.
The OpenAI spec says expires_at type is Integer, but the implementation
will return None for no expiration.
"""
obj = OpenAIFileObject(
id=id,
filename=filename,
purpose=OpenAIFilePurpose(purpose),
bytes=bytes,
created_at=created_at,
expires_at=expires_at,
)
if obj.expires_at is not None and obj.expires_at > (obj.created_at + ExpiresAfter.MAX):
obj.expires_at = None # type: ignore
return obj
class S3FilesImpl(Files):
"""S3-based implementation of the Files API."""
# TODO: implement expiration, for now a silly offset
_SILLY_EXPIRATION_OFFSET = 100 * 365 * 24 * 60 * 60
def __init__(self, config: S3FilesImplConfig) -> None:
def __init__(self, config: S3FilesImplConfig, policy: list[AccessRule]) -> None:
self._config = config
self.policy = policy
self._client: boto3.client | None = None
self._sql_store: SqlStore | None = None
self._sql_store: AuthorizedSqlStore | None = None
def _now(self) -> int:
"""Return current UTC timestamp as int seconds."""
return int(datetime.now(UTC).timestamp())
async def _get_file(self, file_id: str, return_expired: bool = False) -> dict[str, Any]:
where: dict[str, str | dict] = {"id": file_id}
if not return_expired:
where["expires_at"] = {">": self._now()}
if not (row := await self.sql_store.fetch_one("openai_files", policy=self.policy, where=where)):
raise ResourceNotFoundError(file_id, "File", "files.list()")
return row
async def _delete_file(self, file_id: str) -> None:
"""Delete a file from S3 and the database."""
try:
self.client.delete_object(
Bucket=self._config.bucket_name,
Key=file_id,
)
except ClientError as e:
if e.response["Error"]["Code"] != "NoSuchKey":
raise RuntimeError(f"Failed to delete file from S3: {e}") from e
await self.sql_store.delete("openai_files", where={"id": file_id})
async def _delete_if_expired(self, file_id: str) -> None:
"""If the file exists and is expired, delete it."""
if row := await self._get_file(file_id, return_expired=True):
if (expires_at := row.get("expires_at")) and expires_at <= self._now():
await self._delete_file(file_id)
async def initialize(self) -> None:
self._client = _create_s3_client(self._config)
await _create_bucket_if_not_exists(self._client, self._config)
self._sql_store = sqlstore_impl(self._config.metadata_store)
self._sql_store = AuthorizedSqlStore(sqlstore_impl(self._config.metadata_store))
await self._sql_store.create_table(
"openai_files",
{
@@ -121,7 +187,7 @@ class S3FilesImpl(Files):
return self._client
@property
def sql_store(self) -> SqlStore:
def sql_store(self) -> AuthorizedSqlStore:
assert self._sql_store is not None, "Provider not initialized"
return self._sql_store
@@ -129,27 +195,47 @@ class S3FilesImpl(Files):
self,
file: Annotated[UploadFile, File()],
purpose: Annotated[OpenAIFilePurpose, Form()],
expires_after_anchor: Annotated[str | None, Form(alias="expires_after[anchor]")] = None,
expires_after_seconds: Annotated[int | None, Form(alias="expires_after[seconds]")] = None,
) -> OpenAIFileObject:
file_id = f"file-{uuid.uuid4().hex}"
filename = getattr(file, "filename", None) or "uploaded_file"
created_at = int(time.time())
expires_at = created_at + self._SILLY_EXPIRATION_OFFSET
created_at = self._now()
expires_after = None
if expires_after_anchor is not None or expires_after_seconds is not None:
# we use ExpiresAfter to validate input
expires_after = ExpiresAfter(
anchor=expires_after_anchor, # type: ignore[arg-type]
seconds=expires_after_seconds, # type: ignore[arg-type]
)
# the default is no expiration.
# to implement no expiration we set an expiration beyond the max.
# we'll hide this fact from users when returning the file object.
expires_at = created_at + ExpiresAfter.MAX * 42
# the default for BATCH files is 30 days, which happens to be the expiration max.
if purpose == OpenAIFilePurpose.BATCH:
expires_at = created_at + ExpiresAfter.MAX
if expires_after is not None:
expires_at = created_at + expires_after.seconds
content = await file.read()
file_size = len(content)
await self.sql_store.insert(
"openai_files",
{
"id": file_id,
"filename": filename,
"purpose": purpose.value,
"bytes": file_size,
"created_at": created_at,
"expires_at": expires_at,
},
)
entry: dict[str, Any] = {
"id": file_id,
"filename": filename,
"purpose": purpose.value,
"bytes": file_size,
"created_at": created_at,
"expires_at": expires_at,
}
await self.sql_store.insert("openai_files", entry)
try:
self.client.put_object(
@@ -163,14 +249,7 @@ class S3FilesImpl(Files):
raise RuntimeError(f"Failed to upload file to S3: {e}") from e
return OpenAIFileObject(
id=file_id,
filename=filename,
purpose=purpose,
bytes=file_size,
created_at=created_at,
expires_at=expires_at,
)
return _make_file_object(**entry)
async def openai_list_files(
self,
@@ -183,29 +262,20 @@ class S3FilesImpl(Files):
if not order:
order = Order.desc
where_conditions = {}
where_conditions: dict[str, Any] = {"expires_at": {">": self._now()}}
if purpose:
where_conditions["purpose"] = purpose.value
paginated_result = await self.sql_store.fetch_all(
table="openai_files",
where=where_conditions if where_conditions else None,
policy=self.policy,
where=where_conditions,
order_by=[("created_at", order.value)],
cursor=("id", after) if after else None,
limit=limit,
)
files = [
OpenAIFileObject(
id=row["id"],
filename=row["filename"],
purpose=OpenAIFilePurpose(row["purpose"]),
bytes=row["bytes"],
created_at=row["created_at"],
expires_at=row["expires_at"],
)
for row in paginated_result.data
]
files = [_make_file_object(**row) for row in paginated_result.data]
return ListOpenAIFileResponse(
data=files,
@@ -216,41 +286,20 @@ class S3FilesImpl(Files):
)
async def openai_retrieve_file(self, file_id: str) -> OpenAIFileObject:
row = await self.sql_store.fetch_one("openai_files", where={"id": file_id})
if not row:
raise ResourceNotFoundError(file_id, "File", "files.list()")
return OpenAIFileObject(
id=row["id"],
filename=row["filename"],
purpose=OpenAIFilePurpose(row["purpose"]),
bytes=row["bytes"],
created_at=row["created_at"],
expires_at=row["expires_at"],
)
await self._delete_if_expired(file_id)
row = await self._get_file(file_id)
return _make_file_object(**row)
async def openai_delete_file(self, file_id: str) -> OpenAIFileDeleteResponse:
row = await self.sql_store.fetch_one("openai_files", where={"id": file_id})
if not row:
raise ResourceNotFoundError(file_id, "File", "files.list()")
try:
self.client.delete_object(
Bucket=self._config.bucket_name,
Key=row["id"],
)
except ClientError as e:
if e.response["Error"]["Code"] != "NoSuchKey":
raise RuntimeError(f"Failed to delete file from S3: {e}") from e
await self.sql_store.delete("openai_files", where={"id": file_id})
await self._delete_if_expired(file_id)
_ = await self._get_file(file_id) # raises if not found
await self._delete_file(file_id)
return OpenAIFileDeleteResponse(id=file_id, deleted=True)
async def openai_retrieve_file_content(self, file_id: str) -> Response:
row = await self.sql_store.fetch_one("openai_files", where={"id": file_id})
if not row:
raise ResourceNotFoundError(file_id, "File", "files.list()")
await self._delete_if_expired(file_id)
row = await self._get_file(file_id)
try:
response = self.client.get_object(
@@ -261,7 +310,7 @@ class S3FilesImpl(Files):
content = response["Body"].read()
except ClientError as e:
if e.response["Error"]["Code"] == "NoSuchKey":
await self.sql_store.delete("openai_files", where={"id": file_id})
await self._delete_file(file_id)
raise ResourceNotFoundError(file_id, "File", "files.list()") from e
raise RuntimeError(f"Failed to download file from S3: {e}") from e

View file

@@ -5,12 +5,13 @@
# the root directory of this source tree.
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import GeminiConfig
from .models import MODEL_ENTRIES
class GeminiInferenceAdapter(LiteLLMOpenAIMixin):
class GeminiInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
def __init__(self, config: GeminiConfig) -> None:
LiteLLMOpenAIMixin.__init__(
self,
@@ -21,6 +22,11 @@ class GeminiInferenceAdapter(LiteLLMOpenAIMixin):
)
self.config = config
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self):
return "https://generativelanguage.googleapis.com/v1beta/openai/"
async def initialize(self) -> None:
await super().initialize()

View file

@@ -4,30 +4,15 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from collections.abc import AsyncIterator
from typing import Any
from openai import AsyncOpenAI
from llama_stack.apis.inference import (
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAIChoiceDelta,
OpenAIChunkChoice,
OpenAIMessageParam,
OpenAIResponseFormatParam,
OpenAISystemMessageParam,
)
from llama_stack.providers.remote.inference.groq.config import GroqConfig
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_compat import (
prepare_openai_completion_params,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .models import MODEL_ENTRIES
class GroqInferenceAdapter(LiteLLMOpenAIMixin):
class GroqInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
_config: GroqConfig
def __init__(self, config: GroqConfig):
@ -40,122 +25,14 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
)
self.config = config
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self) -> str:
return f"{self.config.url}/openai/v1"
async def initialize(self):
await super().initialize()
async def shutdown(self):
await super().shutdown()
def _get_openai_client(self) -> AsyncOpenAI:
return AsyncOpenAI(
base_url=f"{self.config.url}/openai/v1",
api_key=self.get_api_key(),
)
async def openai_chat_completion(
self,
model: str,
messages: list[OpenAIMessageParam],
frequency_penalty: float | None = None,
function_call: str | dict[str, Any] | None = None,
functions: list[dict[str, Any]] | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_completion_tokens: int | None = None,
max_tokens: int | None = None,
n: int | None = None,
parallel_tool_calls: bool | None = None,
presence_penalty: float | None = None,
response_format: OpenAIResponseFormatParam | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
tool_choice: str | dict[str, Any] | None = None,
tools: list[dict[str, Any]] | None = None,
top_logprobs: int | None = None,
top_p: float | None = None,
user: str | None = None,
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
model_obj = await self.model_store.get_model(model)
# Groq does not support json_schema response format, so we need to convert it to json_object
if response_format and response_format.type == "json_schema":
response_format.type = "json_object"
schema = response_format.json_schema.get("schema", {})
response_format.json_schema = None
json_instructions = f"\nYour response should be a JSON object that matches the following schema: {schema}"
if messages and messages[0].role == "system":
messages[0].content = messages[0].content + json_instructions
else:
messages.insert(0, OpenAISystemMessageParam(content=json_instructions))
# Groq returns a 400 error if tools are provided but none are called
# So, set tool_choice to "required" to attempt to force a call
if tools and (not tool_choice or tool_choice == "auto"):
tool_choice = "required"
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
)
# Groq does not support streaming requests that set response_format
fake_stream = False
if stream and response_format:
params["stream"] = False
fake_stream = True
response = await self._get_openai_client().chat.completions.create(**params)
if fake_stream:
chunk_choices = []
for choice in response.choices:
delta = OpenAIChoiceDelta(
content=choice.message.content,
role=choice.message.role,
tool_calls=choice.message.tool_calls,
)
chunk_choice = OpenAIChunkChoice(
delta=delta,
finish_reason=choice.finish_reason,
index=choice.index,
logprobs=None,
)
chunk_choices.append(chunk_choice)
chunk = OpenAIChatCompletionChunk(
id=response.id,
choices=chunk_choices,
object="chat.completion.chunk",
created=response.created,
model=response.model,
)
async def _fake_stream_generator():
yield chunk
return _fake_stream_generator()
else:
return response

View file

@ -41,10 +41,10 @@ client.initialize()
### Create Completion
> Note on Completion API
>
> The hosted NVIDIA Llama NIMs (e.g., `meta-llama/Llama-3.1-8B-Instruct`) with ```NVIDIA_BASE_URL="https://integrate.api.nvidia.com"``` does not support the ```completion``` method, while the locally deployed NIM does.
The following example shows how to create a completion for an NVIDIA NIM.
> [!NOTE]
> The hosted NVIDIA Llama NIMs (for example ```meta-llama/Llama-3.1-8B-Instruct```) that have ```NVIDIA_BASE_URL="https://integrate.api.nvidia.com"``` do not support the ```completion``` method, while locally deployed NIMs do.
```python
response = client.inference.completion(
@ -60,6 +60,8 @@ print(f"Response: {response.content}")
### Create Chat Completion
The following example shows how to create a chat completion for an NVIDIA NIM.
```python
response = client.inference.chat_completion(
model_id="meta-llama/Llama-3.1-8B-Instruct",
@ -82,6 +84,9 @@ print(f"Response: {response.completion_message.content}")
```
### Tool Calling Example ###
The following example shows how to do tool calling for an NVIDIA NIM.
```python
from llama_stack.models.llama.datatypes import ToolDefinition, ToolParamDefinition
@ -117,6 +122,9 @@ if tool_response.completion_message.tool_calls:
```
### Structured Output Example
The following example shows how to do structured output for an NVIDIA NIM.
```python
from llama_stack.apis.inference import JsonSchemaResponseFormat, ResponseFormatType
@ -149,8 +157,10 @@ print(f"Structured Response: {structured_response.completion_message.content}")
```
### Create Embeddings
> Note on OpenAI embeddings compatibility
>
The following example shows how to create embeddings for an NVIDIA NIM.
> [!NOTE]
> NVIDIA asymmetric embedding models (e.g., `nvidia/llama-3.2-nv-embedqa-1b-v2`) require an `input_type` parameter not present in the standard OpenAI embeddings API. The NVIDIA Inference Adapter automatically sets `input_type="query"` when using the OpenAI-compatible embeddings endpoint for NVIDIA. For passage embeddings, use the `embeddings` API with `task_type="document"`.
```python
@ -160,4 +170,42 @@ response = client.inference.embeddings(
task_type="query",
)
print(f"Embeddings: {response.embeddings}")
```
```
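For passage (document) embeddings, the note above applies: call the `embeddings` API with `task_type="document"` instead of `"query"`. The snippet below is a minimal sketch; the model ID and input text are illustrative.
```python
response = client.inference.embeddings(
    model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
    contents=["NVIDIA NIM microservices provide optimized inference for foundation models."],
    task_type="document",
)
print(f"Passage embeddings: {response.embeddings}")
```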
### Vision Language Models Example
The following example shows how to run vision inference by using an NVIDIA NIM.
```python
import base64

def load_image_as_base64(image_path):
with open(image_path, "rb") as image_file:
img_bytes = image_file.read()
return base64.b64encode(img_bytes).decode("utf-8")
image_path = {path_to_the_image}
demo_image_b64 = load_image_as_base64(image_path)
vlm_response = client.inference.chat_completion(
model_id="nvidia/vila",
messages=[
{
"role": "user",
"content": [
{
"type": "image",
"image": {
"data": demo_image_b64,
},
},
{
"type": "text",
"text": "Please describe what you see in this image in detail.",
},
],
}
],
)
print(f"VLM Response: {vlm_response.completion_message.content}")
```

View file

@ -55,6 +55,10 @@ MODEL_ENTRIES = [
"meta/llama-3.3-70b-instruct",
CoreModelId.llama3_3_70b_instruct.value,
),
ProviderModelEntry(
provider_model_id="nvidia/vila",
model_type=ModelType.llm,
),
# NeMo Retriever Text Embedding models -
#
# https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html

View file

@ -118,10 +118,10 @@ class OllamaInferenceAdapter(
async def initialize(self) -> None:
logger.info(f"checking connectivity to Ollama at `{self.config.url}`...")
health_response = await self.health()
if health_response["status"] == HealthStatus.ERROR:
r = await self.health()
if r["status"] == HealthStatus.ERROR:
logger.warning(
"Ollama Server is not running, make sure to start it using `ollama serve` in a separate terminal"
f"Ollama Server is not running (message: {r['message']}). Make sure to start it using `ollama serve` in a separate terminal"
)
async def should_refresh_models(self) -> bool:
@ -156,7 +156,7 @@ class OllamaInferenceAdapter(
),
Model(
identifier="nomic-embed-text",
provider_resource_id="nomic-embed-text",
provider_resource_id="nomic-embed-text:latest",
provider_id=provider_id,
metadata={
"embedding_dimension": 768,

View file

@ -4,13 +4,26 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import SambaNovaImplConfig
from .models import MODEL_ENTRIES
class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
class SambaNovaInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
"""
SambaNova Inference Adapter for Llama Stack.
Note: The inheritance order is important here. OpenAIMixin must come before
LiteLLMOpenAIMixin to ensure that OpenAIMixin.check_model_availability()
is used instead of LiteLLMOpenAIMixin.check_model_availability().
- OpenAIMixin.check_model_availability() queries the /v1/models endpoint to check if a model exists
- LiteLLMOpenAIMixin.check_model_availability() checks the static registry within LiteLLM
"""
def __init__(self, config: SambaNovaImplConfig):
self.config = config
self.environment_available_models = []
@ -24,3 +37,14 @@ class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
download_images=True, # SambaNova requires base64 image encoding
json_schema_strict=False, # SambaNova doesn't support strict=True yet
)
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self) -> str:
"""
Get the base URL for OpenAI mixin.
:return: The SambaNova base URL
"""
return self.config.url

View file

@ -7,8 +7,8 @@
from collections.abc import AsyncGenerator, AsyncIterator
from typing import Any
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watsonx_ai.foundation_models import Model
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from openai import AsyncOpenAI
from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem

View file

@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import heapq
from typing import Any
import psycopg2
@ -23,6 +24,9 @@ from llama_stack.apis.vector_io import (
)
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import Api, VectorDBsProtocolPrivate
from llama_stack.providers.utils.inference.prompt_adapter import (
interleaved_content_as_str,
)
from llama_stack.providers.utils.kvstore import kvstore_impl
from llama_stack.providers.utils.kvstore.api import KVStore
from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
@ -31,6 +35,7 @@ from llama_stack.providers.utils.memory.vector_store import (
EmbeddingIndex,
VectorDBWithIndex,
)
from llama_stack.providers.utils.vector_io.vector_utils import WeightedInMemoryAggregator, sanitize_collection_name
from .config import PGVectorVectorIOConfig
@ -72,25 +77,63 @@ def load_models(cur, cls):
class PGVectorIndex(EmbeddingIndex):
def __init__(self, vector_db: VectorDB, dimension: int, conn, kvstore: KVStore | None = None):
self.conn = conn
with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
# Sanitize the table name by replacing hyphens with underscores
# SQL doesn't allow hyphens in table names, and vector_db.identifier may contain hyphens
# when created with patterns like "test-vector-db-{uuid4()}"
sanitized_identifier = vector_db.identifier.replace("-", "_")
self.table_name = f"vector_store_{sanitized_identifier}"
self.kvstore = kvstore
# reference: https://github.com/pgvector/pgvector?tab=readme-ov-file#querying
PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION: dict[str, str] = {
"L2": "<->",
"L1": "<+>",
"COSINE": "<=>",
"INNER_PRODUCT": "<#>",
"HAMMING": "<~>",
"JACCARD": "<%>",
}
cur.execute(
f"""
CREATE TABLE IF NOT EXISTS {self.table_name} (
id TEXT PRIMARY KEY,
document JSONB,
embedding vector({dimension})
def __init__(
self,
vector_db: VectorDB,
dimension: int,
conn: psycopg2.extensions.connection,
kvstore: KVStore | None = None,
distance_metric: str = "COSINE",
):
self.vector_db = vector_db
self.dimension = dimension
self.conn = conn
self.kvstore = kvstore
self.check_distance_metric_availability(distance_metric)
self.distance_metric = distance_metric
self.table_name = None
async def initialize(self) -> None:
try:
with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
# Sanitize the table name by replacing hyphens with underscores
# SQL doesn't allow hyphens in table names, and vector_db.identifier may contain hyphens
# when created with patterns like "test-vector-db-{uuid4()}"
sanitized_identifier = sanitize_collection_name(self.vector_db.identifier)
self.table_name = f"vs_{sanitized_identifier}"
cur.execute(
f"""
CREATE TABLE IF NOT EXISTS {self.table_name} (
id TEXT PRIMARY KEY,
document JSONB,
embedding vector({self.dimension}),
content_text TEXT,
tokenized_content TSVECTOR
)
"""
)
"""
)
# Create GIN index for full-text search performance
cur.execute(
f"""
CREATE INDEX IF NOT EXISTS {self.table_name}_content_gin_idx
ON {self.table_name} USING GIN(tokenized_content)
"""
)
except Exception as e:
log.exception(f"Error creating PGVectorIndex for vector_db: {self.vector_db.identifier}")
raise RuntimeError(f"Error creating PGVectorIndex for vector_db: {self.vector_db.identifier}") from e
async def add_chunks(self, chunks: list[Chunk], embeddings: NDArray):
assert len(chunks) == len(embeddings), (
@ -99,29 +142,49 @@ class PGVectorIndex(EmbeddingIndex):
values = []
for i, chunk in enumerate(chunks):
content_text = interleaved_content_as_str(chunk.content)
values.append(
(
f"{chunk.chunk_id}",
Json(chunk.model_dump()),
embeddings[i].tolist(),
content_text,
content_text, # Pass content_text twice - once for content_text column, once for to_tsvector function. Eg. to_tsvector(content_text) = tokenized_content
)
)
query = sql.SQL(
f"""
INSERT INTO {self.table_name} (id, document, embedding)
INSERT INTO {self.table_name} (id, document, embedding, content_text, tokenized_content)
VALUES %s
ON CONFLICT (id) DO UPDATE SET embedding = EXCLUDED.embedding, document = EXCLUDED.document
ON CONFLICT (id) DO UPDATE SET
embedding = EXCLUDED.embedding,
document = EXCLUDED.document,
content_text = EXCLUDED.content_text,
tokenized_content = EXCLUDED.tokenized_content
"""
)
with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
execute_values(cur, query, values, template="(%s, %s, %s::vector)")
execute_values(cur, query, values, template="(%s, %s, %s::vector, %s, to_tsvector('english', %s))")
async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
"""
Performs vector similarity search using PostgreSQL's search function. Default distance metric is COSINE.
Args:
embedding: The query embedding vector
k: Number of results to return
score_threshold: Minimum similarity score threshold
Returns:
QueryChunksResponse with combined results
"""
pgvector_search_function = self.get_pgvector_search_function()
with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(
f"""
SELECT document, embedding <-> %s::vector AS distance
SELECT document, embedding {pgvector_search_function} %s::vector AS distance
FROM {self.table_name}
ORDER BY distance
LIMIT %s
@ -147,7 +210,40 @@ class PGVectorIndex(EmbeddingIndex):
k: int,
score_threshold: float,
) -> QueryChunksResponse:
raise NotImplementedError("Keyword search is not supported in PGVector")
"""
Performs keyword-based search using PostgreSQL's full-text search with ts_rank scoring.
Args:
query_string: The text query for keyword search
k: Number of results to return
score_threshold: Minimum similarity score threshold
Returns:
QueryChunksResponse with combined results
"""
with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
# Use plainto_tsquery to handle user input safely and ts_rank for relevance scoring
cur.execute(
f"""
SELECT document, ts_rank(tokenized_content, plainto_tsquery('english', %s)) AS score
FROM {self.table_name}
WHERE tokenized_content @@ plainto_tsquery('english', %s)
ORDER BY score DESC
LIMIT %s
""",
(query_string, query_string, k),
)
results = cur.fetchall()
chunks = []
scores = []
for doc, score in results:
if score < score_threshold:
continue
chunks.append(Chunk(**doc))
scores.append(float(score))
return QueryChunksResponse(chunks=chunks, scores=scores)
async def query_hybrid(
self,
@ -158,7 +254,59 @@ class PGVectorIndex(EmbeddingIndex):
reranker_type: str,
reranker_params: dict[str, Any] | None = None,
) -> QueryChunksResponse:
raise NotImplementedError("Hybrid search is not supported in PGVector")
"""
Hybrid search combining vector similarity and keyword search using configurable reranking.
Args:
embedding: The query embedding vector
query_string: The text query for keyword search
k: Number of results to return
score_threshold: Minimum similarity score threshold
reranker_type: Type of reranker to use ("rrf" or "weighted")
reranker_params: Parameters for the reranker
Returns:
QueryChunksResponse with combined results
"""
if reranker_params is None:
reranker_params = {}
# Get results from both search methods
vector_response = await self.query_vector(embedding, k, score_threshold)
keyword_response = await self.query_keyword(query_string, k, score_threshold)
# Convert responses to score dictionaries using chunk_id
vector_scores = {
chunk.chunk_id: score for chunk, score in zip(vector_response.chunks, vector_response.scores, strict=False)
}
keyword_scores = {
chunk.chunk_id: score
for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False)
}
# Combine scores using the reranking utility
combined_scores = WeightedInMemoryAggregator.combine_search_results(
vector_scores, keyword_scores, reranker_type, reranker_params
)
# Efficient top-k selection because it only tracks the k best candidates it's seen so far
top_k_items = heapq.nlargest(k, combined_scores.items(), key=lambda x: x[1])
# Filter by score threshold
filtered_items = [(doc_id, score) for doc_id, score in top_k_items if score >= score_threshold]
# Create a map of chunk_id to chunk for both responses
chunk_map = {c.chunk_id: c for c in vector_response.chunks + keyword_response.chunks}
# Use the map to look up chunks by their IDs
chunks = []
scores = []
for doc_id, score in filtered_items:
if doc_id in chunk_map:
chunks.append(chunk_map[doc_id])
scores.append(score)
return QueryChunksResponse(chunks=chunks, scores=scores)
async def delete(self):
with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
@ -170,6 +318,25 @@ class PGVectorIndex(EmbeddingIndex):
with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
cur.execute(f"DELETE FROM {self.table_name} WHERE id = ANY(%s)", (chunk_ids,))
def get_pgvector_search_function(self) -> str:
return self.PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION[self.distance_metric]
def check_distance_metric_availability(self, distance_metric: str) -> None:
"""Check if the distance metric is supported by PGVector.
Args:
distance_metric: The distance metric to check
Raises:
ValueError: If the distance metric is not supported
"""
if distance_metric not in self.PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION:
supported_metrics = list(self.PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION.keys())
raise ValueError(
f"Distance metric '{distance_metric}' is not supported by PGVector. "
f"Supported metrics are: {', '.join(supported_metrics)}"
)
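# Illustrative construction with a non-default metric (a sketch; `vector_db` and
# `conn` stand in for a real VectorDB and psycopg2 connection, and 768 is an
# arbitrary embedding dimension):
#
#     index = PGVectorIndex(vector_db, dimension=768, conn=conn, distance_metric="L2")
#     await index.initialize()  # creates the table and GIN index if they do not exist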
class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
def __init__(
@ -185,8 +352,8 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoco
self.files_api = files_api
self.kvstore: KVStore | None = None
self.vector_db_store = None
self.openai_vector_store: dict[str, dict[str, Any]] = {}
self.metadatadata_collection_name = "openai_vector_stores_metadata"
self.openai_vector_stores: dict[str, dict[str, Any]] = {}
self.metadata_collection_name = "openai_vector_stores_metadata"
async def initialize(self) -> None:
log.info(f"Initializing PGVector memory adapter with config: {self.config}")
@ -233,9 +400,13 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoco
upsert_models(self.conn, [(vector_db.identifier, vector_db)])
# Create and cache the PGVector index table for the vector DB
pgvector_index = PGVectorIndex(
vector_db=vector_db, dimension=vector_db.embedding_dimension, conn=self.conn, kvstore=self.kvstore
)
await pgvector_index.initialize()
index = VectorDBWithIndex(
vector_db,
index=PGVectorIndex(vector_db, vector_db.embedding_dimension, self.conn, kvstore=self.kvstore),
index=pgvector_index,
inference_api=self.inference_api,
)
self.cache[vector_db.identifier] = index
@ -272,8 +443,15 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtoco
if vector_db_id in self.cache:
return self.cache[vector_db_id]
if self.vector_db_store is None:
raise VectorStoreNotFoundError(vector_db_id)
vector_db = await self.vector_db_store.get_vector_db(vector_db_id)
if not vector_db:
raise VectorStoreNotFoundError(vector_db_id)
index = PGVectorIndex(vector_db, vector_db.embedding_dimension, self.conn)
await index.initialize()
self.cache[vector_db_id] = VectorDBWithIndex(vector_db, index, self.inference_api)
return self.cache[vector_db_id]

View file

@ -4,53 +4,55 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
from pydantic import BaseModel, Field
class BedrockBaseConfig(BaseModel):
aws_access_key_id: str | None = Field(
default=None,
default_factory=lambda: os.getenv("AWS_ACCESS_KEY_ID"),
description="The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID",
)
aws_secret_access_key: str | None = Field(
default=None,
default_factory=lambda: os.getenv("AWS_SECRET_ACCESS_KEY"),
description="The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY",
)
aws_session_token: str | None = Field(
default=None,
default_factory=lambda: os.getenv("AWS_SESSION_TOKEN"),
description="The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN",
)
region_name: str | None = Field(
default=None,
default_factory=lambda: os.getenv("AWS_DEFAULT_REGION"),
description="The default AWS Region to use, for example, us-west-1 or us-west-2."
"Default use environment variable: AWS_DEFAULT_REGION",
)
profile_name: str | None = Field(
default=None,
default_factory=lambda: os.getenv("AWS_PROFILE"),
description="The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE",
)
total_max_attempts: int | None = Field(
default=None,
default_factory=lambda: int(val) if (val := os.getenv("AWS_MAX_ATTEMPTS")) else None,
description="An integer representing the maximum number of attempts that will be made for a single request, "
"including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS",
)
retry_mode: str | None = Field(
default=None,
default_factory=lambda: os.getenv("AWS_RETRY_MODE"),
description="A string representing the type of retries Boto3 will perform."
"Default use environment variable: AWS_RETRY_MODE",
)
connect_timeout: float | None = Field(
default=60,
default_factory=lambda: float(os.getenv("AWS_CONNECT_TIMEOUT", "60")),
description="The time in seconds till a timeout exception is thrown when attempting to make a connection. "
"The default is 60 seconds.",
)
read_timeout: float | None = Field(
default=60,
default_factory=lambda: float(os.getenv("AWS_READ_TIMEOUT", "60")),
description="The time in seconds till a timeout exception is thrown when attempting to read from a connection."
"The default is 60 seconds.",
)
session_ttl: int | None = Field(
default=3600,
default_factory=lambda: int(os.getenv("AWS_SESSION_TTL", "3600")),
description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).",
)
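# Illustrative usage (a sketch): with the default_factory defaults above, constructing
# the config with no arguments picks up the AWS_* environment variables at
# instantiation time instead of falling back to None:
#
#     os.environ["AWS_DEFAULT_REGION"] = "us-west-2"
#     cfg = BedrockBaseConfig()
#     assert cfg.region_name == "us-west-2"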

View file

@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import asyncio
import base64
import struct
from typing import TYPE_CHECKING
@ -43,9 +44,11 @@ class SentenceTransformerEmbeddingMixin:
task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse:
model = await self.model_store.get_model(model_id)
embedding_model = self._load_sentence_transformer_model(model.provider_resource_id)
embeddings = embedding_model.encode(
[interleaved_content_as_str(content) for content in contents], show_progress_bar=False
embedding_model = await self._load_sentence_transformer_model(model.provider_resource_id)
embeddings = await asyncio.to_thread(
embedding_model.encode,
[interleaved_content_as_str(content) for content in contents],
show_progress_bar=False,
)
return EmbeddingsResponse(embeddings=embeddings)
@ -64,8 +67,8 @@ class SentenceTransformerEmbeddingMixin:
# Get the model and generate embeddings
model_obj = await self.model_store.get_model(model)
embedding_model = self._load_sentence_transformer_model(model_obj.provider_resource_id)
embeddings = embedding_model.encode(input_list, show_progress_bar=False)
embedding_model = await self._load_sentence_transformer_model(model_obj.provider_resource_id)
embeddings = await asyncio.to_thread(embedding_model.encode, input_list, show_progress_bar=False)
# Convert embeddings to the requested format
data = []
@ -93,7 +96,7 @@ class SentenceTransformerEmbeddingMixin:
usage=usage,
)
def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer":
async def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer":
global EMBEDDING_MODELS
loaded_model = EMBEDDING_MODELS.get(model)
@ -101,8 +104,12 @@ class SentenceTransformerEmbeddingMixin:
return loaded_model
log.info(f"Loading sentence transformer for {model}...")
from sentence_transformers import SentenceTransformer
loaded_model = SentenceTransformer(model)
def _load_model():
from sentence_transformers import SentenceTransformer
return SentenceTransformer(model)
loaded_model = await asyncio.to_thread(_load_model)
EMBEDDING_MODELS[model] = loaded_model
return loaded_model

View file

@ -294,12 +294,12 @@ class VectorDBWithIndex:
_validate_embedding(c.embedding, i, self.vector_db.embedding_dimension)
if chunks_to_embed:
resp = await self.inference_api.embeddings(
resp = await self.inference_api.openai_embeddings(
self.vector_db.embedding_model,
[c.content for c in chunks_to_embed],
)
for c, embedding in zip(chunks_to_embed, resp.embeddings, strict=False):
c.embedding = embedding
for c, data in zip(chunks_to_embed, resp.data, strict=False):
c.embedding = data.embedding
embeddings = np.array([c.embedding for c in chunks], dtype=np.float32)
await self.index.add_chunks(chunks, embeddings)
@ -334,8 +334,8 @@ class VectorDBWithIndex:
if mode == "keyword":
return await self.index.query_keyword(query_string, k, score_threshold)
embeddings_response = await self.inference_api.embeddings(self.vector_db.embedding_model, [query_string])
query_vector = np.array(embeddings_response.embeddings[0], dtype=np.float32)
embeddings_response = await self.inference_api.openai_embeddings(self.vector_db.embedding_model, [query_string])
query_vector = np.array(embeddings_response.data[0].embedding, dtype=np.float32)
if mode == "hybrid":
return await self.index.query_hybrid(
query_vector, query_string, k, score_threshold, reranker_type, reranker_params

View file

@ -23,6 +23,7 @@ from sqlalchemy import (
)
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.ext.asyncio.engine import AsyncEngine
from sqlalchemy.sql.elements import ColumnElement
from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.log import get_logger
@ -43,6 +44,30 @@ TYPE_MAPPING: dict[ColumnType, Any] = {
}
def _build_where_expr(column: ColumnElement, value: Any) -> ColumnElement:
"""Return a SQLAlchemy expression for a where condition.
`value` may be a simple scalar (equality) or a mapping like {">": 123}.
The returned expression is a SQLAlchemy ColumnElement usable in query.where(...).
"""
if isinstance(value, Mapping):
if len(value) != 1:
raise ValueError(f"Operator mapping must have a single operator, got: {value}")
op, operand = next(iter(value.items()))
if op == "==" or op == "=":
return column == operand
if op == ">":
return column > operand
if op == "<":
return column < operand
if op == ">=":
return column >= operand
if op == "<=":
return column <= operand
raise ValueError(f"Unsupported operator '{op}' in where mapping")
return column == value
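# Illustrative use of the operator-mapping `where` form handled above (a sketch,
# mirroring the files provider's expiration filter; `store` stands in for a
# SqlAlchemySqlStoreImpl instance and `now` for a Unix timestamp):
#
#     rows = await store.fetch_all(
#         table="openai_files",
#         where={"purpose": "assistants", "expires_at": {">": now}},
#     )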
class SqlAlchemySqlStoreImpl(SqlStore):
def __init__(self, config: SqlAlchemySqlStoreConfig):
self.config = config
@ -111,7 +136,7 @@ class SqlAlchemySqlStoreImpl(SqlStore):
if where:
for key, value in where.items():
query = query.where(table_obj.c[key] == value)
query = query.where(_build_where_expr(table_obj.c[key], value))
if where_sql:
query = query.where(text(where_sql))
@ -222,7 +247,7 @@ class SqlAlchemySqlStoreImpl(SqlStore):
async with self.async_session() as session:
stmt = self.metadata.tables[table].update()
for key, value in where.items():
stmt = stmt.where(self.metadata.tables[table].c[key] == value)
stmt = stmt.where(_build_where_expr(self.metadata.tables[table].c[key], value))
await session.execute(stmt, data)
await session.commit()
@ -233,7 +258,7 @@ class SqlAlchemySqlStoreImpl(SqlStore):
async with self.async_session() as session:
stmt = self.metadata.tables[table].delete()
for key, value in where.items():
stmt = stmt.where(self.metadata.tables[table].c[key] == value)
stmt = stmt.where(_build_where_expr(self.metadata.tables[table].c[key], value))
await session.execute(stmt)
await session.commit()

View file

@ -67,6 +67,38 @@ async def client_wrapper(endpoint: str, headers: dict[str, str]) -> AsyncGenerat
raise AuthenticationRequiredError(exc) from exc
if i == len(connection_strategies) - 1:
raise
except* httpx.ConnectError as eg:
# Connection refused, server down, network unreachable
if i == len(connection_strategies) - 1:
error_msg = f"Failed to connect to MCP server at {endpoint}: Connection refused"
logger.error(f"MCP connection error: {error_msg}")
raise ConnectionError(error_msg) from eg
else:
logger.warning(
f"failed to connect to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
)
except* httpx.TimeoutException as eg:
# Request timeout, server too slow
if i == len(connection_strategies) - 1:
error_msg = f"MCP server at {endpoint} timed out"
logger.error(f"MCP timeout error: {error_msg}")
raise TimeoutError(error_msg) from eg
else:
logger.warning(
f"MCP server at {endpoint} timed out via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
)
except* httpx.RequestError as eg:
# DNS resolution failures, network errors, invalid URLs
if i == len(connection_strategies) - 1:
# Get the first exception's message for the error string
exc_msg = str(eg.exceptions[0]) if eg.exceptions else "Unknown error"
error_msg = f"Network error connecting to MCP server at {endpoint}: {exc_msg}"
logger.error(f"MCP network error: {error_msg}")
raise ConnectionError(error_msg) from eg
else:
logger.warning(
f"network error connecting to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
)
except* McpError:
if i < len(connection_strategies) - 1:
logger.warning(

View file

@ -37,3 +37,122 @@ def sanitize_collection_name(name: str, weaviate_format=False) -> str:
else:
s = proper_case(re.sub(r"[^a-zA-Z0-9]", "", name))
return s
class WeightedInMemoryAggregator:
@staticmethod
def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
"""
Normalize scores to 0-1 range using min-max normalization.
Args:
scores: dictionary of scores with document IDs as keys and scores as values
Returns:
Normalized scores with document IDs as keys and normalized scores as values
"""
if not scores:
return {}
min_score, max_score = min(scores.values()), max(scores.values())
score_range = max_score - min_score
if score_range > 0:
return {doc_id: (score - min_score) / score_range for doc_id, score in scores.items()}
return dict.fromkeys(scores, 1.0)
@staticmethod
def weighted_rerank(
vector_scores: dict[str, float],
keyword_scores: dict[str, float],
alpha: float = 0.5,
) -> dict[str, float]:
"""
Rerank via weighted average of scores.
Args:
vector_scores: scores from vector search
keyword_scores: scores from keyword search
alpha: weight factor between 0 and 1 (default: 0.5)
0 = keyword only, 1 = vector only, 0.5 = equal weight
Returns:
All unique document IDs with weighted combined scores
"""
all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
normalized_vector_scores = WeightedInMemoryAggregator._normalize_scores(vector_scores)
normalized_keyword_scores = WeightedInMemoryAggregator._normalize_scores(keyword_scores)
# Weighted formula: score = (1-alpha) * keyword_score + alpha * vector_score
# alpha=0 means keyword only, alpha=1 means vector only
return {
doc_id: ((1 - alpha) * normalized_keyword_scores.get(doc_id, 0.0))
+ (alpha * normalized_vector_scores.get(doc_id, 0.0))
for doc_id in all_ids
}
@staticmethod
def rrf_rerank(
vector_scores: dict[str, float],
keyword_scores: dict[str, float],
impact_factor: float = 60.0,
) -> dict[str, float]:
"""
Rerank via Reciprocal Rank Fusion.
Args:
vector_scores: scores from vector search
keyword_scores: scores from keyword search
impact_factor: impact factor for RRF (default: 60.0)
Returns:
All unique document IDs with RRF combined scores
"""
# Convert scores to ranks
vector_ranks = {
doc_id: i + 1
for i, (doc_id, _) in enumerate(sorted(vector_scores.items(), key=lambda x: x[1], reverse=True))
}
keyword_ranks = {
doc_id: i + 1
for i, (doc_id, _) in enumerate(sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True))
}
all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
rrf_scores = {}
for doc_id in all_ids:
vector_rank = vector_ranks.get(doc_id, float("inf"))
keyword_rank = keyword_ranks.get(doc_id, float("inf"))
# RRF formula: score = 1/(k + r) where k is impact_factor (default: 60.0) and r is the rank
rrf_scores[doc_id] = (1.0 / (impact_factor + vector_rank)) + (1.0 / (impact_factor + keyword_rank))
return rrf_scores
@staticmethod
def combine_search_results(
vector_scores: dict[str, float],
keyword_scores: dict[str, float],
reranker_type: str = "rrf",
reranker_params: dict[str, float] | None = None,
) -> dict[str, float]:
"""
Combine vector and keyword search results using specified reranking strategy.
Args:
vector_scores: scores from vector search
keyword_scores: scores from keyword search
reranker_type: type of reranker to use (default: RERANKER_TYPE_RRF)
reranker_params: parameters for the reranker
Returns:
All unique document IDs with combined scores
"""
if reranker_params is None:
reranker_params = {}
if reranker_type == "weighted":
alpha = reranker_params.get("alpha", 0.5)
return WeightedInMemoryAggregator.weighted_rerank(vector_scores, keyword_scores, alpha)
else:
# Default to RRF for None, RRF, or any unknown types
impact_factor = reranker_params.get("impact_factor", 60.0)
return WeightedInMemoryAggregator.rrf_rerank(vector_scores, keyword_scores, impact_factor)
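# Illustrative usage (a sketch; the document-id -> score maps are made up):
#
#     vector_scores = {"doc-1": 0.9, "doc-2": 0.4}
#     keyword_scores = {"doc-2": 3.1, "doc-3": 1.2}
#     combined = WeightedInMemoryAggregator.combine_search_results(
#         vector_scores, keyword_scores, reranker_type="weighted", reranker_params={"alpha": 0.5}
#     )
#     # every unique doc id receives a fused score; reranker_type="rrf" (the default)
#     # fuses by rank using impact_factor instead of normalized weighted averages.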

View file

@ -30,6 +30,9 @@ from openai.types.completion_choice import CompletionChoice
CompletionChoice.model_fields["finish_reason"].annotation = Literal["stop", "length", "content_filter"] | None
CompletionChoice.model_rebuild()
REPO_ROOT = Path(__file__).parent.parent.parent
DEFAULT_STORAGE_DIR = REPO_ROOT / "tests/integration/recordings"
class InferenceMode(StrEnum):
LIVE = "live"
@ -51,7 +54,7 @@ def normalize_request(method: str, url: str, headers: dict[str, Any], body: dict
def get_inference_mode() -> InferenceMode:
return InferenceMode(os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "live").lower())
return InferenceMode(os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "replay").lower())
def setup_inference_recording():
@ -60,28 +63,18 @@ def setup_inference_recording():
to increase their reliability and reduce reliance on expensive, external services.
Currently, this is only supported for OpenAI and Ollama clients. These should cover the vast majority of use cases.
Calls to the /models endpoint are not currently trapped. We probably need to add support for this.
Two environment variables are required:
- LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', or 'replay'.
- LLAMA_STACK_TEST_RECORDING_DIR: The directory to store the recordings in.
Two environment variables are supported:
- LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', or 'replay'. Default is 'replay'.
- LLAMA_STACK_TEST_RECORDING_DIR: The directory to store the recordings in. Default is 'tests/integration/recordings'.
The recordings are stored in a SQLite database and a JSON file for each request. The SQLite database is used to
quickly find the correct recording for a given request. The JSON files are used to store the request and response
bodies.
The recordings are stored as JSON files.
"""
mode = get_inference_mode()
if mode not in InferenceMode:
raise ValueError(f"Invalid LLAMA_STACK_TEST_INFERENCE_MODE: {mode}. Must be 'live', 'record', or 'replay'")
if mode == InferenceMode.LIVE:
return None
if "LLAMA_STACK_TEST_RECORDING_DIR" not in os.environ:
raise ValueError("LLAMA_STACK_TEST_RECORDING_DIR must be set for recording or replaying")
storage_dir = os.environ["LLAMA_STACK_TEST_RECORDING_DIR"]
storage_dir = os.environ.get("LLAMA_STACK_TEST_RECORDING_DIR", DEFAULT_STORAGE_DIR)
return inference_recording(mode=mode, storage_dir=storage_dir)
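# Illustrative usage (a sketch; assumes recordings exist under the default
# tests/integration/recordings directory):
#
#     os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = "replay"
#     recording_ctx = setup_inference_recording()
#     with recording_ctx:
#         ...  # patched OpenAI/Ollama client calls are served from stored recordings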
@ -134,8 +127,8 @@ class ResponseStorage:
def store_recording(self, request_hash: str, request: dict[str, Any], response: dict[str, Any]):
"""Store a request/response pair."""
# Generate unique response filename
response_file = f"{request_hash[:12]}.json"
response_path = self.responses_dir / response_file
short_hash = request_hash[:12]
response_file = f"{short_hash}.json"
# Serialize response body if needed
serialized_response = dict(response)
@ -147,6 +140,14 @@ class ResponseStorage:
# Handle single response
serialized_response["body"] = _serialize_response(serialized_response["body"])
# For model-list endpoints (Ollama /api/tags, OpenAI /v1/models), include a digest of the model identifiers in the filename to distinguish recording variants
endpoint = request.get("endpoint")
if endpoint in ("/api/tags", "/v1/models"):
digest = _model_identifiers_digest(endpoint, response)
response_file = f"models-{short_hash}-{digest}.json"
response_path = self.responses_dir / response_file
# Save response to JSON file
with open(response_path, "w") as f:
json.dump({"request": request, "response": serialized_response}, f, indent=2)
@ -161,19 +162,85 @@ class ResponseStorage:
if not response_path.exists():
return None
with open(response_path) as f:
data = json.load(f)
return _recording_from_file(response_path)
# Deserialize response body if needed
if "response" in data and "body" in data["response"]:
if isinstance(data["response"]["body"], list):
# Handle streaming responses
data["response"]["body"] = [_deserialize_response(chunk) for chunk in data["response"]["body"]]
def _model_list_responses(self, short_hash: str) -> list[dict[str, Any]]:
results: list[dict[str, Any]] = []
for path in self.responses_dir.glob(f"models-{short_hash}-*.json"):
data = _recording_from_file(path)
results.append(data)
return results
def _recording_from_file(response_path) -> dict[str, Any]:
with open(response_path) as f:
data = json.load(f)
# Deserialize response body if needed
if "response" in data and "body" in data["response"]:
if isinstance(data["response"]["body"], list):
# Handle streaming responses
data["response"]["body"] = [_deserialize_response(chunk) for chunk in data["response"]["body"]]
else:
# Handle single response
data["response"]["body"] = _deserialize_response(data["response"]["body"])
return cast(dict[str, Any], data)
def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str:
def _extract_model_identifiers():
"""Extract a stable set of identifiers for model-list endpoints.
Supported endpoints:
- '/api/tags' (Ollama): response body has 'models': [ { name/model/digest/id/... }, ... ]
- '/v1/models' (OpenAI): response body has 'data': [ { id: ... }, ... ]
Returns a list of unique identifiers or None if structure doesn't match.
"""
body = response["body"]
if endpoint == "/api/tags":
items = body.get("models")
idents = [m.model for m in items]
else:
items = body.get("data")
idents = [m.id for m in items]
return sorted(set(idents))
identifiers = _extract_model_identifiers()
return hashlib.sha1(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8]
def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None:
"""Return a single, unioned recording for supported model-list endpoints."""
seen: dict[str, dict[str, Any]] = {}
for rec in records:
body = rec["response"]["body"]
if endpoint == "/api/tags":
items = body.models
elif endpoint == "/v1/models":
items = body.data
else:
items = []
for m in items:
if endpoint == "/v1/models":
key = m.id
else:
# Handle single response
data["response"]["body"] = _deserialize_response(data["response"]["body"])
key = m.model
seen[key] = m
return cast(dict[str, Any], data)
ordered = [seen[k] for k in sorted(seen.keys())]
canonical = records[0]
canonical_req = canonical.get("request", {})
if isinstance(canonical_req, dict):
canonical_req["endpoint"] = endpoint
if endpoint == "/v1/models":
body = {"data": ordered, "object": "list"}
else:
from ollama import ListResponse
body = ListResponse(models=ordered)
return {"request": canonical_req, "response": {"body": body, "is_streaming": False}}
async def _patched_inference_method(original_method, self, client_type, endpoint, *args, **kwargs):
@ -195,8 +262,6 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
raise ValueError(f"Unknown client type: {client_type}")
url = base_url.rstrip("/") + endpoint
# Normalize request for matching
method = "POST"
headers = {}
body = kwargs
@ -204,7 +269,12 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
request_hash = normalize_request(method, url, headers, body)
if _current_mode == InferenceMode.REPLAY:
recording = _current_storage.find_recording(request_hash)
# Special handling for model-list endpoints: return union of all responses
if endpoint in ("/api/tags", "/v1/models"):
records = _current_storage._model_list_responses(request_hash[:12])
recording = _combine_model_list_responses(endpoint, records)
else:
recording = _current_storage.find_recording(request_hash)
if recording:
response_body = recording["response"]["body"]
@ -274,12 +344,14 @@ def patch_inference_clients():
from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
from openai.resources.completions import AsyncCompletions
from openai.resources.embeddings import AsyncEmbeddings
from openai.resources.models import AsyncModels
# Store original methods for both OpenAI and Ollama clients
_original_methods = {
"chat_completions_create": AsyncChatCompletions.create,
"completions_create": AsyncCompletions.create,
"embeddings_create": AsyncEmbeddings.create,
"models_list": AsyncModels.list,
"ollama_generate": OllamaAsyncClient.generate,
"ollama_chat": OllamaAsyncClient.chat,
"ollama_embed": OllamaAsyncClient.embed,
@ -304,10 +376,16 @@ def patch_inference_clients():
_original_methods["embeddings_create"], self, "openai", "/v1/embeddings", *args, **kwargs
)
async def patched_models_list(self, *args, **kwargs):
return await _patched_inference_method(
_original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs
)
# Apply OpenAI patches
AsyncChatCompletions.create = patched_chat_completions_create
AsyncCompletions.create = patched_completions_create
AsyncEmbeddings.create = patched_embeddings_create
AsyncModels.list = patched_models_list
# Create patched methods for Ollama client
async def patched_ollama_generate(self, *args, **kwargs):
@ -361,11 +439,13 @@ def unpatch_inference_clients():
from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
from openai.resources.completions import AsyncCompletions
from openai.resources.embeddings import AsyncEmbeddings
from openai.resources.models import AsyncModels
# Restore OpenAI client methods
AsyncChatCompletions.create = _original_methods["chat_completions_create"]
AsyncCompletions.create = _original_methods["completions_create"]
AsyncEmbeddings.create = _original_methods["embeddings_create"]
AsyncModels.list = _original_methods["models_list"]
# Restore Ollama client methods if they were patched
OllamaAsyncClient.generate = _original_methods["ollama_generate"]
@ -379,16 +459,10 @@ def unpatch_inference_clients():
@contextmanager
def inference_recording(mode: str = "live", storage_dir: str | Path | None = None) -> Generator[None, None, None]:
def inference_recording(mode: str, storage_dir: str | Path | None = None) -> Generator[None, None, None]:
"""Context manager for inference recording/replaying."""
global _current_mode, _current_storage
# Set defaults
if storage_dir is None:
storage_dir_path = Path.home() / ".llama" / "recordings"
else:
storage_dir_path = Path(storage_dir)
# Store previous state
prev_mode = _current_mode
prev_storage = _current_storage
@ -397,7 +471,9 @@ def inference_recording(mode: str = "live", storage_dir: str | Path | None = Non
_current_mode = mode
if mode in ["record", "replay"]:
_current_storage = ResponseStorage(storage_dir_path)
if storage_dir is None:
raise ValueError("storage_dir is required for record and replay modes")
_current_storage = ResponseStorage(Path(storage_dir))
patch_inference_clients()
yield

View file

@ -14,11 +14,11 @@
"@radix-ui/react-select": "^2.2.5",
"@radix-ui/react-separator": "^1.1.7",
"@radix-ui/react-slot": "^1.2.3",
"@radix-ui/react-tooltip": "^1.2.6",
"@radix-ui/react-tooltip": "^1.2.8",
"class-variance-authority": "^0.7.1",
"clsx": "^2.1.1",
"framer-motion": "^11.18.2",
"llama-stack-client": "^0.2.19",
"framer-motion": "^12.23.12",
"llama-stack-client": "^0.2.20",
"lucide-react": "^0.510.0",
"next": "15.3.3",
"next-auth": "^4.24.11",
@ -39,16 +39,16 @@
"@testing-library/jest-dom": "^6.8.0",
"@testing-library/react": "^16.3.0",
"@types/jest": "^29.5.14",
"@types/node": "^20",
"@types/node": "^24",
"@types/react": "^19",
"@types/react-dom": "^19",
"eslint": "^9",
"eslint-config-next": "15.3.2",
"eslint-config-next": "15.5.2",
"eslint-config-prettier": "^10.1.8",
"eslint-plugin-prettier": "^5.5.4",
"jest": "^29.7.0",
"jest-environment-jsdom": "^29.7.0",
"prettier": "3.5.3",
"prettier": "3.6.2",
"tailwindcss": "^4",
"ts-node": "^10.9.2",
"tw-animate-css": "^1.2.9",
@ -1854,9 +1854,9 @@
"integrity": "sha512-OdiMrzCl2Xi0VTjiQQUK0Xh7bJHnOuET2s+3V+Y40WJBAXrJeGA3f+I8MZJ/YQ3mVGi5XGR1L66oFlgqXhQ4Vw=="
},
"node_modules/@next/eslint-plugin-next": {
"version": "15.3.2",
"resolved": "https://registry.npmjs.org/@next/eslint-plugin-next/-/eslint-plugin-next-15.3.2.tgz",
"integrity": "sha512-ijVRTXBgnHT33aWnDtmlG+LJD+5vhc9AKTJPquGG5NKXjpKNjc62woIhFtrAcWdBobt8kqjCoaJ0q6sDQoX7aQ==",
"version": "15.5.2",
"resolved": "https://registry.npmjs.org/@next/eslint-plugin-next/-/eslint-plugin-next-15.5.2.tgz",
"integrity": "sha512-lkLrRVxcftuOsJNhWatf1P2hNVfh98k/omQHrCEPPriUypR6RcS13IvLdIrEvkm9AH2Nu2YpR5vLqBuy6twH3Q==",
"dev": true,
"license": "MIT",
"dependencies": {
@ -2861,29 +2861,6 @@
}
}
},
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-visually-hidden": {
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-visually-hidden/-/react-visually-hidden-1.2.3.tgz",
"integrity": "sha512-pzJq12tEaaIhqjbzpCuv/OypJY/BPavOofm+dbab+MHLajy277+1lLm6JFcGgF5eskJ6mquGirhXY2GD/8u8Ug==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-primitive": "2.1.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-separator": {
"version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-separator/-/react-separator-1.1.7.tgz",
@ -2949,23 +2926,23 @@
}
},
"node_modules/@radix-ui/react-tooltip": {
"version": "1.2.6",
"resolved": "https://registry.npmjs.org/@radix-ui/react-tooltip/-/react-tooltip-1.2.6.tgz",
"integrity": "sha512-zYb+9dc9tkoN2JjBDIIPLQtk3gGyz8FMKoqYTb8EMVQ5a5hBcdHPECrsZVI4NpPAUOixhkoqg7Hj5ry5USowfA==",
"version": "1.2.8",
"resolved": "https://registry.npmjs.org/@radix-ui/react-tooltip/-/react-tooltip-1.2.8.tgz",
"integrity": "sha512-tY7sVt1yL9ozIxvmbtN5qtmH2krXcBCfjEiCgKGLqunJHvgvZG2Pcl2oQ3kbcZARb1BGEHdkLzcYGO8ynVlieg==",
"license": "MIT",
"dependencies": {
"@radix-ui/primitive": "1.1.2",
"@radix-ui/primitive": "1.1.3",
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-dismissable-layer": "1.1.9",
"@radix-ui/react-dismissable-layer": "1.1.11",
"@radix-ui/react-id": "1.1.1",
"@radix-ui/react-popper": "1.2.6",
"@radix-ui/react-portal": "1.1.8",
"@radix-ui/react-presence": "1.1.4",
"@radix-ui/react-primitive": "2.1.2",
"@radix-ui/react-slot": "1.2.2",
"@radix-ui/react-popper": "1.2.8",
"@radix-ui/react-portal": "1.1.9",
"@radix-ui/react-presence": "1.1.5",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-slot": "1.2.3",
"@radix-ui/react-use-controllable-state": "1.2.2",
"@radix-ui/react-visually-hidden": "1.2.2"
"@radix-ui/react-visually-hidden": "1.2.3"
},
"peerDependencies": {
"@types/react": "*",
@ -2982,21 +2959,162 @@
}
}
},
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-slot": {
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.2.tgz",
"integrity": "sha512-y7TBO4xN4Y94FvcWIOIh18fM4R1A8S4q1jhoz4PNzOoHsFcN8pogcFmZrTYAm4F9VRUrWP/Mw7xSKybIeRI+CQ==",
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/primitive": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz",
"integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
"license": "MIT"
},
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-arrow": {
"version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.7.tgz",
"integrity": "sha512-F+M1tLhO+mlQaOWspE8Wstg+z6PwxwRd8oQ8IXceWz92kfAmalTRf0EjrouQeo7QssEPfCn05B4Ihs1K9WQ/7w==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-compose-refs": "1.1.2"
"@radix-ui/react-primitive": "2.1.3"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-dismissable-layer": {
"version": "1.1.11",
"resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz",
"integrity": "sha512-Nqcp+t5cTB8BinFkZgXiMJniQH0PsUt2k51FUhbdfeKvc4ACcG2uQniY/8+h1Yv6Kza4Q7lD7PQV0z0oicE0Mg==",
"license": "MIT",
"dependencies": {
"@radix-ui/primitive": "1.1.3",
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1",
"@radix-ui/react-use-escape-keydown": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-popper": {
"version": "1.2.8",
"resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.8.tgz",
"integrity": "sha512-0NJQ4LFFUuWkE7Oxf0htBKS6zLkkjBH+hM1uk7Ng705ReR8m/uelduy1DBo0PyBXPKVnBA6YBlU94MBGXrSBCw==",
"license": "MIT",
"dependencies": {
"@floating-ui/react-dom": "^2.0.0",
"@radix-ui/react-arrow": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1",
"@radix-ui/react-use-layout-effect": "1.1.1",
"@radix-ui/react-use-rect": "1.1.1",
"@radix-ui/react-use-size": "1.1.1",
"@radix-ui/rect": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-portal": {
"version": "1.1.9",
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz",
"integrity": "sha512-bpIxvq03if6UNwXZ+HTK71JLh4APvnXntDc6XOX8UVq4XQOVl7lwok0AvIl+b8zgCw3fSaVTZMpAPPagXbKmHQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-layout-effect": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-presence": {
"version": "1.1.5",
"resolved": "https://registry.npmjs.org/@radix-ui/react-presence/-/react-presence-1.1.5.tgz",
"integrity": "sha512-/jfEwNDdQVBCNvjkGit4h6pMOzq8bHkopq458dPt2lMjx+eBQUohZNG9A7DtO/O5ukSbxuaNGXMjHicgwy6rQQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-use-layout-effect": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-primitive": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
"integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-slot": "1.2.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
@ -3137,12 +3255,35 @@
}
},
"node_modules/@radix-ui/react-visually-hidden": {
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/@radix-ui/react-visually-hidden/-/react-visually-hidden-1.2.2.tgz",
"integrity": "sha512-ORCmRUbNiZIv6uV5mhFrhsIKw4UX/N3syZtyqvry61tbGm4JlgQuSn0hk5TwCARsCjkcnuRkSdCE3xfb+ADHew==",
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-visually-hidden/-/react-visually-hidden-1.2.3.tgz",
"integrity": "sha512-pzJq12tEaaIhqjbzpCuv/OypJY/BPavOofm+dbab+MHLajy277+1lLm6JFcGgF5eskJ6mquGirhXY2GD/8u8Ug==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-primitive": "2.1.2"
"@radix-ui/react-primitive": "2.1.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-visually-hidden/node_modules/@radix-ui/react-primitive": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
"integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-slot": "1.2.3"
},
"peerDependencies": {
"@types/react": "*",
@ -3910,12 +4051,12 @@
"license": "MIT"
},
"node_modules/@types/node": {
"version": "20.17.47",
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.17.47.tgz",
"integrity": "sha512-3dLX0Upo1v7RvUimvxLeXqwrfyKxUINk0EAM83swP2mlSUcwV73sZy8XhNz8bcZ3VbsfQyC/y6jRdL5tgCNpDQ==",
"version": "24.3.0",
"resolved": "https://registry.npmjs.org/@types/node/-/node-24.3.0.tgz",
"integrity": "sha512-aPTXCrfwnDLj4VvXrm+UUCQjNEvJgNA8s5F1cvwQU+3KNltTOkBm1j30uNLyqqPNe7gE3KFzImYoZEfLhp4Yow==",
"license": "MIT",
"dependencies": {
"undici-types": "~6.19.2"
"undici-types": "~7.10.0"
}
},
"node_modules/@types/node-fetch": {
@ -6433,13 +6574,13 @@
}
},
"node_modules/eslint-config-next": {
"version": "15.3.2",
"resolved": "https://registry.npmjs.org/eslint-config-next/-/eslint-config-next-15.3.2.tgz",
"integrity": "sha512-FerU4DYccO4FgeYFFglz0SnaKRe1ejXQrDb8kWUkTAg036YWi+jUsgg4sIGNCDhAsDITsZaL4MzBWKB6f4G1Dg==",
"version": "15.5.2",
"resolved": "https://registry.npmjs.org/eslint-config-next/-/eslint-config-next-15.5.2.tgz",
"integrity": "sha512-3hPZghsLupMxxZ2ggjIIrat/bPniM2yRpsVPVM40rp8ZMzKWOJp2CGWn7+EzoV2ddkUr5fxNfHpF+wU1hGt/3g==",
"dev": true,
"license": "MIT",
"dependencies": {
"@next/eslint-plugin-next": "15.3.2",
"@next/eslint-plugin-next": "15.5.2",
"@rushstack/eslint-patch": "^1.10.3",
"@typescript-eslint/eslint-plugin": "^5.4.2 || ^6.0.0 || ^7.0.0 || ^8.0.0",
"@typescript-eslint/parser": "^5.4.2 || ^6.0.0 || ^7.0.0 || ^8.0.0",
@ -7268,13 +7409,13 @@
}
},
"node_modules/framer-motion": {
"version": "11.18.2",
"resolved": "https://registry.npmjs.org/framer-motion/-/framer-motion-11.18.2.tgz",
"integrity": "sha512-5F5Och7wrvtLVElIpclDT0CBzMVg3dL22B64aZwHtsIY8RB4mXICLrkajK4G9R+ieSAGcgrLeae2SeUTg2pr6w==",
"version": "12.23.12",
"resolved": "https://registry.npmjs.org/framer-motion/-/framer-motion-12.23.12.tgz",
"integrity": "sha512-6e78rdVtnBvlEVgu6eFEAgG9v3wLnYEboM8I5O5EXvfKC8gxGQB8wXJdhkMy10iVcn05jl6CNw7/HTsTCfwcWg==",
"license": "MIT",
"dependencies": {
"motion-dom": "^11.18.1",
"motion-utils": "^11.18.1",
"motion-dom": "^12.23.12",
"motion-utils": "^12.23.6",
"tslib": "^2.4.0"
},
"peerDependencies": {
@ -10006,9 +10147,9 @@
"license": "MIT"
},
"node_modules/llama-stack-client": {
"version": "0.2.19",
"resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.19.tgz",
"integrity": "sha512-sDuAhUdEGlERZ3jlMUzPXcQTgMv/pGbDrPX0ifbE5S+gr7Q+7ohuQYrIXe+hXgIipFjq+y4b2c5laZ76tmAyEA==",
"version": "0.2.20",
"resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.20.tgz",
"integrity": "sha512-1vD5nizTX5JEW8TADxKgy/P1W8YZoPSpdnmfxbdYbWgpQ3BWtbvLS6jmDk7VwVA5fRC4895VfHsRDfS1liHarw==",
"license": "MIT",
"dependencies": {
"@types/node": "^18.11.18",
@ -11184,18 +11325,18 @@
}
},
"node_modules/motion-dom": {
"version": "11.18.1",
"resolved": "https://registry.npmjs.org/motion-dom/-/motion-dom-11.18.1.tgz",
"integrity": "sha512-g76KvA001z+atjfxczdRtw/RXOM3OMSdd1f4DL77qCTF/+avrRJiawSG4yDibEQ215sr9kpinSlX2pCTJ9zbhw==",
"version": "12.23.12",
"resolved": "https://registry.npmjs.org/motion-dom/-/motion-dom-12.23.12.tgz",
"integrity": "sha512-RcR4fvMCTESQBD/uKQe49D5RUeDOokkGRmz4ceaJKDBgHYtZtntC/s2vLvY38gqGaytinij/yi3hMcWVcEF5Kw==",
"license": "MIT",
"dependencies": {
"motion-utils": "^11.18.1"
"motion-utils": "^12.23.6"
}
},
"node_modules/motion-utils": {
"version": "11.18.1",
"resolved": "https://registry.npmjs.org/motion-utils/-/motion-utils-11.18.1.tgz",
"integrity": "sha512-49Kt+HKjtbJKLtgO/LKj9Ld+6vw9BjH5d9sc40R/kVyH8GLAXgT42M2NnuPcJNuA3s9ZfZBUcwIgpmZWGEE+hA==",
"version": "12.23.6",
"resolved": "https://registry.npmjs.org/motion-utils/-/motion-utils-12.23.6.tgz",
"integrity": "sha512-eAWoPgr4eFEOFfg2WjIsMoqJTW6Z8MTUCgn/GZ3VRpClWBdnbjryiA3ZSNLyxCTmCQx4RmYX6jX1iWHbenUPNQ==",
"license": "MIT"
},
"node_modules/ms": {
@ -12083,9 +12224,9 @@
}
},
"node_modules/prettier": {
"version": "3.5.3",
"resolved": "https://registry.npmjs.org/prettier/-/prettier-3.5.3.tgz",
"integrity": "sha512-QQtaxnoDJeAkDvDKWCLiwIXkTgRhwYDEQCghU9Z6q03iyek/rxRh/2lC3HB7P8sWT2xC/y5JDctPLBIGzHKbhw==",
"version": "3.6.2",
"resolved": "https://registry.npmjs.org/prettier/-/prettier-3.6.2.tgz",
"integrity": "sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ==",
"dev": true,
"license": "MIT",
"bin": {
@ -13986,9 +14127,9 @@
}
},
"node_modules/undici-types": {
"version": "6.19.8",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.19.8.tgz",
"integrity": "sha512-ve2KP6f/JnbPBFyobGHuerC9g1FYGn/F8n1LWTwNxCEzd6IfqTwUQcNXgEtmmQ6DlRrC1hrSrBnCZPokRrDHjw==",
"version": "7.10.0",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.10.0.tgz",
"integrity": "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag==",
"license": "MIT"
},
"node_modules/unified": {

View file

@ -19,11 +19,11 @@
"@radix-ui/react-select": "^2.2.5",
"@radix-ui/react-separator": "^1.1.7",
"@radix-ui/react-slot": "^1.2.3",
"@radix-ui/react-tooltip": "^1.2.6",
"@radix-ui/react-tooltip": "^1.2.8",
"class-variance-authority": "^0.7.1",
"clsx": "^2.1.1",
"framer-motion": "^11.18.2",
"llama-stack-client": "^0.2.19",
"framer-motion": "^12.23.12",
"llama-stack-client": "^0.2.20",
"lucide-react": "^0.510.0",
"next": "15.3.3",
"next-auth": "^4.24.11",
@ -44,16 +44,16 @@
"@testing-library/jest-dom": "^6.8.0",
"@testing-library/react": "^16.3.0",
"@types/jest": "^29.5.14",
"@types/node": "^20",
"@types/node": "^24",
"@types/react": "^19",
"@types/react-dom": "^19",
"eslint": "^9",
"eslint-config-next": "15.3.2",
"eslint-config-next": "15.5.2",
"eslint-config-prettier": "^10.1.8",
"eslint-plugin-prettier": "^5.5.4",
"jest": "^29.7.0",
"jest-environment-jsdom": "^29.7.0",
"prettier": "3.5.3",
"prettier": "3.6.2",
"tailwindcss": "^4",
"ts-node": "^10.9.2",
"tw-animate-css": "^1.2.9",

View file

@ -7,7 +7,7 @@ required-version = ">=0.7.0"
[project]
name = "llama_stack"
version = "0.2.19"
version = "0.2.20"
authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
description = "Llama Stack"
readme = "README.md"
@ -31,9 +31,8 @@ dependencies = [
"huggingface-hub>=0.34.0,<1.0",
"jinja2>=3.1.6",
"jsonschema",
"llama-stack-client>=0.2.19",
"llama-api-client>=0.1.2",
"openai>=1.99.6,<1.100.0",
"llama-stack-client>=0.2.20",
"openai>=1.99.6",
"prompt-toolkit",
"python-dotenv",
"python-jose[cryptography]",
@ -56,7 +55,7 @@ dependencies = [
ui = [
"streamlit",
"pandas",
"llama-stack-client>=0.2.19",
"llama-stack-client>=0.2.20",
"streamlit-option-menu",
]
@ -84,6 +83,7 @@ unit = [
"openai",
"aiosqlite",
"aiohttp",
"psycopg2-binary>=2.9.0",
"pypdf",
"mcp",
"chardet",
@ -92,7 +92,7 @@ unit = [
"sqlalchemy[asyncio]>=2.0.41",
"blobfile",
"faiss-cpu",
"pymilvus>=2.5.12",
"pymilvus>=2.6.1",
"milvus-lite>=2.5.0",
"litellm",
"together",
@ -105,12 +105,13 @@ unit = [
# separately. If you are using "uv" to execute your tests, you can use the "--group" flag to specify extra
# dependencies.
test = [
"openai",
"openai>=1.100.0", # for expires_after support
"aiosqlite",
"aiohttp",
"torch>=2.6.0",
"torchvision>=0.21.0",
"chardet",
"psycopg2-binary>=2.9.0",
"pypdf",
"mcp",
"datasets",
@ -119,7 +120,7 @@ test = [
"sqlalchemy",
"sqlalchemy[asyncio]>=2.0.41",
"requests",
"pymilvus>=2.5.12",
"pymilvus>=2.6.1",
"milvus-lite>=2.5.0",
"weaviate-client>=4.16.4",
]
@ -144,7 +145,7 @@ docs = [
]
codegen = ["rich", "pydantic", "jinja2>=3.1.6"]
benchmark = [
"locust>=2.37.14",
"locust>=2.39.1",
]
[project.urls]

View file

@ -15,7 +15,7 @@ set -euo pipefail
BRANCH=""
TEST_SUBDIRS=""
TEST_PROVIDER="ollama"
RUN_VISION_TESTS=false
TEST_SUITE="base"
TEST_PATTERN=""
# Help function
@ -27,9 +27,9 @@ Trigger the integration test recording workflow remotely. This way you do not ne
OPTIONS:
-b, --branch BRANCH Branch to run the workflow on (defaults to current branch)
-s, --test-subdirs DIRS Comma-separated list of test subdirectories to run (REQUIRED)
-p, --test-provider PROVIDER Test provider to use: vllm or ollama (default: ollama)
-v, --run-vision-tests Include vision tests in the recording
-t, --test-suite SUITE Test suite to use: base, responses, vision, etc. (default: base)
-s, --test-subdirs DIRS Comma-separated list of test subdirectories to run (overrides suite)
-k, --test-pattern PATTERN Regex pattern to pass to pytest -k
-h, --help Show this help message
@ -38,7 +38,7 @@ EXAMPLES:
$0 --test-subdirs "agents"
# Record tests for specific branch with vision tests
$0 -b my-feature-branch --test-subdirs "inference" --run-vision-tests
$0 -b my-feature-branch --test-suite vision
# Record multiple test subdirectories with specific provider
$0 --test-subdirs "agents,inference" --test-provider vllm
@ -71,9 +71,9 @@ while [[ $# -gt 0 ]]; do
TEST_PROVIDER="$2"
shift 2
;;
-v|--run-vision-tests)
RUN_VISION_TESTS=true
shift
-t|--test-suite)
TEST_SUITE="$2"
shift 2
;;
-k|--test-pattern)
TEST_PATTERN="$2"
@ -92,11 +92,11 @@ while [[ $# -gt 0 ]]; do
done
# Validate required parameters
if [[ -z "$TEST_SUBDIRS" ]]; then
echo "Error: --test-subdirs is required"
echo "Please specify which test subdirectories to run, e.g.:"
if [[ -z "$TEST_SUBDIRS" && -z "$TEST_SUITE" ]]; then
echo "Error: --test-subdirs or --test-suite is required"
echo "Please specify which test subdirectories to run or test suite to use, e.g.:"
echo " $0 --test-subdirs \"agents,inference\""
echo " $0 --test-subdirs \"inference\" --run-vision-tests"
echo " $0 --test-suite vision"
echo ""
exit 1
fi
@ -239,17 +239,19 @@ echo "Triggering integration test recording workflow..."
echo "Branch: $BRANCH"
echo "Test provider: $TEST_PROVIDER"
echo "Test subdirs: $TEST_SUBDIRS"
echo "Run vision tests: $RUN_VISION_TESTS"
echo "Test suite: $TEST_SUITE"
echo "Test pattern: ${TEST_PATTERN:-"(none)"}"
echo ""
# Prepare inputs for gh workflow run
INPUTS="-f test-subdirs='$TEST_SUBDIRS'"
if [[ -n "$TEST_SUBDIRS" ]]; then
INPUTS="-f test-subdirs='$TEST_SUBDIRS'"
fi
if [[ -n "$TEST_PROVIDER" ]]; then
INPUTS="$INPUTS -f test-provider='$TEST_PROVIDER'"
fi
if [[ "$RUN_VISION_TESTS" == "true" ]]; then
INPUTS="$INPUTS -f run-vision-tests=true"
if [[ -n "$TEST_SUITE" ]]; then
INPUTS="$INPUTS -f test-suite='$TEST_SUITE'"
fi
if [[ -n "$TEST_PATTERN" ]]; then
INPUTS="$INPUTS -f test-pattern='$TEST_PATTERN'"

View file

@ -16,7 +16,7 @@ STACK_CONFIG=""
PROVIDER=""
TEST_SUBDIRS=""
TEST_PATTERN=""
RUN_VISION_TESTS="false"
TEST_SUITE="base"
INFERENCE_MODE="replay"
EXTRA_PARAMS=""
@ -28,12 +28,16 @@ Usage: $0 [OPTIONS]
Options:
--stack-config STRING Stack configuration to use (required)
--provider STRING Provider to use (ollama, vllm, etc.) (required)
--test-subdirs STRING Comma-separated list of test subdirectories to run (default: 'inference')
--run-vision-tests Run vision tests instead of regular tests
--test-suite STRING Comma-separated list of test suites to run (default: 'base')
--inference-mode STRING Inference mode: record or replay (default: replay)
--test-subdirs STRING Comma-separated list of test subdirectories to run (overrides suite)
--test-pattern STRING Regex pattern to pass to pytest -k
--help Show this help message
Suites are defined in tests/integration/suites.py. They are used to narrow the collection of tests and provide default model options.
You can also specify subdirectories (of tests/integration) to select tests from, which will override the suite.
Examples:
# Basic inference tests with ollama
$0 --stack-config server:ci-tests --provider ollama
@ -42,7 +46,7 @@ Examples:
$0 --stack-config server:ci-tests --provider vllm --test-subdirs 'inference,agents'
# Vision tests with ollama
$0 --stack-config server:ci-tests --provider ollama --run-vision-tests
$0 --stack-config server:ci-tests --provider ollama --test-suite vision
# Record mode for updating test recordings
$0 --stack-config server:ci-tests --provider ollama --inference-mode record
@ -64,9 +68,9 @@ while [[ $# -gt 0 ]]; do
TEST_SUBDIRS="$2"
shift 2
;;
--run-vision-tests)
RUN_VISION_TESTS="true"
shift
--test-suite)
TEST_SUITE="$2"
shift 2
;;
--inference-mode)
INFERENCE_MODE="$2"
@ -92,22 +96,25 @@ done
# Validate required parameters
if [[ -z "$STACK_CONFIG" ]]; then
echo "Error: --stack-config is required"
usage
exit 1
fi
if [[ -z "$PROVIDER" ]]; then
echo "Error: --provider is required"
usage
exit 1
fi
if [[ -z "$TEST_SUITE" && -z "$TEST_SUBDIRS" ]]; then
echo "Error: --test-suite or --test-subdirs is required"
exit 1
fi
echo "=== Llama Stack Integration Test Runner ==="
echo "Stack Config: $STACK_CONFIG"
echo "Provider: $PROVIDER"
echo "Test Subdirs: $TEST_SUBDIRS"
echo "Vision Tests: $RUN_VISION_TESTS"
echo "Inference Mode: $INFERENCE_MODE"
echo "Test Suite: $TEST_SUITE"
echo "Test Subdirs: $TEST_SUBDIRS"
echo "Test Pattern: $TEST_PATTERN"
echo ""
@ -140,13 +147,6 @@ THIS_DIR=$(dirname "$0")
ROOT_DIR="$THIS_DIR/.."
cd $ROOT_DIR
# Set recording directory
if [[ "$RUN_VISION_TESTS" == "true" ]]; then
export LLAMA_STACK_TEST_RECORDING_DIR="tests/integration/recordings/vision"
else
export LLAMA_STACK_TEST_RECORDING_DIR="tests/integration/recordings"
fi
# check if "llama" and "pytest" are available. this script does not use `uv run` given
# it can be used in a pre-release environment where we have not been able to tell
# uv about pre-release dependencies properly (yet).
@ -201,84 +201,46 @@ if [[ -n "$TEST_PATTERN" ]]; then
PYTEST_PATTERN="${PYTEST_PATTERN} and $TEST_PATTERN"
fi
# Run vision tests if specified
if [[ "$RUN_VISION_TESTS" == "true" ]]; then
echo "Running vision tests..."
set +e
pytest -s -v tests/integration/inference/test_vision_inference.py \
--stack-config="$STACK_CONFIG" \
-k "$PYTEST_PATTERN" \
--vision-model=ollama/llama3.2-vision:11b \
--embedding-model=sentence-transformers/all-MiniLM-L6-v2 \
--color=yes $EXTRA_PARAMS \
--capture=tee-sys
exit_code=$?
set -e
if [ $exit_code -eq 0 ]; then
echo "✅ Vision tests completed successfully"
elif [ $exit_code -eq 5 ]; then
echo "⚠️ No vision tests collected (pattern matched no tests)"
else
echo "❌ Vision tests failed"
exit 1
fi
exit 0
fi
# Run regular tests
if [[ -z "$TEST_SUBDIRS" ]]; then
TEST_SUBDIRS=$(find tests/integration -maxdepth 1 -mindepth 1 -type d |
sed 's|tests/integration/||' |
grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" |
sort)
fi
echo "Test subdirs to run: $TEST_SUBDIRS"
# Collect all test files for the specified test types
TEST_FILES=""
for test_subdir in $(echo "$TEST_SUBDIRS" | tr ',' '\n'); do
# Skip certain test types for vllm provider
if [[ "$PROVIDER" == "vllm" ]]; then
if [[ "$test_subdir" == "safety" ]] || [[ "$test_subdir" == "post_training" ]] || [[ "$test_subdir" == "tool_runtime" ]]; then
echo "Skipping $test_subdir for vllm provider"
continue
if [[ -n "$TEST_SUBDIRS" ]]; then
# Collect all test files for the specified test types
TEST_FILES=""
for test_subdir in $(echo "$TEST_SUBDIRS" | tr ',' '\n'); do
if [[ -d "tests/integration/$test_subdir" ]]; then
# Find all Python test files in this directory
test_files=$(find tests/integration/$test_subdir -name "test_*.py" -o -name "*_test.py")
if [[ -n "$test_files" ]]; then
TEST_FILES="$TEST_FILES $test_files"
echo "Added test files from $test_subdir: $(echo $test_files | wc -w) files"
fi
else
echo "Warning: Directory tests/integration/$test_subdir does not exist"
fi
done
if [[ -z "$TEST_FILES" ]]; then
echo "No test files found for the specified test types"
exit 1
fi
if [[ "$STACK_CONFIG" != *"server:"* ]] && [[ "$test_subdir" == "batches" ]]; then
echo "Skipping $test_subdir for library client until types are supported"
continue
fi
echo ""
echo "=== Running all collected tests in a single pytest command ==="
echo "Total test files: $(echo $TEST_FILES | wc -w)"
if [[ -d "tests/integration/$test_subdir" ]]; then
# Find all Python test files in this directory
test_files=$(find tests/integration/$test_subdir -name "test_*.py" -o -name "*_test.py")
if [[ -n "$test_files" ]]; then
TEST_FILES="$TEST_FILES $test_files"
echo "Added test files from $test_subdir: $(echo $test_files | wc -w) files"
fi
else
echo "Warning: Directory tests/integration/$test_subdir does not exist"
fi
done
if [[ -z "$TEST_FILES" ]]; then
echo "No test files found for the specified test types"
exit 1
PYTEST_TARGET="$TEST_FILES"
EXTRA_PARAMS="$EXTRA_PARAMS --text-model=$TEXT_MODEL --embedding-model=sentence-transformers/all-MiniLM-L6-v2"
else
PYTEST_TARGET="tests/integration/"
EXTRA_PARAMS="$EXTRA_PARAMS --suite=$TEST_SUITE"
fi
echo ""
echo "=== Running all collected tests in a single pytest command ==="
echo "Total test files: $(echo $TEST_FILES | wc -w)"
set +e
pytest -s -v $TEST_FILES \
pytest -s -v $PYTEST_TARGET \
--stack-config="$STACK_CONFIG" \
-k "$PYTEST_PATTERN" \
--text-model="$TEXT_MODEL" \
--embedding-model=sentence-transformers/all-MiniLM-L6-v2 \
--color=yes $EXTRA_PARAMS \
$EXTRA_PARAMS \
--color=yes \
--capture=tee-sys
exit_code=$?
set -e
@ -298,5 +260,18 @@ echo "=== System Resources After Tests ==="
free -h 2>/dev/null || echo "free command not available"
df -h
# stop server
if [[ "$STACK_CONFIG" == *"server:"* ]]; then
echo "Stopping Llama Stack Server..."
pids=$(lsof -i :8321 | awk 'NR>1 {print $2}')
if [[ -n "$pids" ]]; then
echo "Killing Llama Stack Server processes: $pids"
kill -9 $pids
else
echo "No Llama Stack Server processes found ?!"
fi
echo "Llama Stack Server stopped"
fi
echo ""
echo "=== Integration Tests Complete ==="

View file

@ -38,26 +38,15 @@ For running integration tests, you must provide a few things:
- a distribution name (e.g., `starter`) or a path to a `run.yaml` file
- a comma-separated list of api=provider pairs, e.g. `inference=fireworks,safety=llama-guard,agents=meta-reference`. This is most useful for testing a single API surface.
- Whether you are using replay or live mode for inference. This is specified with the LLAMA_STACK_TEST_INFERENCE_MODE environment variable. The default mode currently is "live" -- that is certainly surprising, but we will fix this soon.
- Any API keys you need to use should be set in the environment, or can be passed in with the --env option.
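For example, an API key can be passed through the `--env` option instead of being exported (the key name below is illustrative):

```bash
pytest -sv tests/integration/inference --stack-config=starter --env FIREWORKS_API_KEY=your_key
```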
You can run the integration tests in replay mode with:
```bash
# Run all tests with existing recordings
LLAMA_STACK_TEST_INFERENCE_MODE=replay \
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
uv run --group test \
pytest -sv tests/integration/ --stack-config=starter
```
If you don't specify LLAMA_STACK_TEST_INFERENCE_MODE, by default it will be in "live" mode -- that is, it will make real API calls.
```bash
# Test against live APIs
FIREWORKS_API_KEY=your_key pytest -sv tests/integration/inference --stack-config=starter
```
### Re-recording tests
#### Local Re-recording (Manual Setup Required)
@ -66,7 +55,6 @@ If you want to re-record tests locally, you can do so with:
```bash
LLAMA_STACK_TEST_INFERENCE_MODE=record \
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
uv run --group test \
pytest -sv tests/integration/ --stack-config=starter -k "<appropriate test name>"
```
@ -89,7 +77,7 @@ You must be careful when re-recording. CI workflows assume a specific setup for
./scripts/github/schedule-record-workflow.sh --test-subdirs "agents,inference"
# Record with vision tests enabled
./scripts/github/schedule-record-workflow.sh --test-subdirs "inference" --run-vision-tests
./scripts/github/schedule-record-workflow.sh --test-suite vision
# Record with specific provider
./scripts/github/schedule-record-workflow.sh --test-subdirs "agents" --test-provider vllm

View file

@ -42,6 +42,27 @@ Model parameters can be influenced by the following options:
Each of these accepts a comma-separated list and can be used to generate multiple parameter combinations. Note that tests will be skipped
if no model is specified.
### Suites (fast selection + sane defaults)
- `--suite`: comma-separated list of named suites that both narrow which tests are collected and prefill common model options (unless you pass them explicitly).
- Available suites:
- `responses`: collects tests under `tests/integration/responses`; this is a separate suite because it needs a strong tool-calling model.
- `vision`: collects only `tests/integration/inference/test_vision_inference.py`; sets `--vision-model=ollama/llama3.2-vision:11b` and `--embedding-model=sentence-transformers/all-MiniLM-L6-v2` as defaults.
- Explicit flags always win. For example, `--suite=responses --text-model=<X>` overrides the suite's text model.
Examples:
```bash
# Fast responses run with defaults
pytest -s -v tests/integration --stack-config=server:starter --suite=responses
# Fast single-file vision run with defaults
pytest -s -v tests/integration --stack-config=server:starter --suite=vision
# Combine suites and override a default
pytest -s -v tests/integration --stack-config=server:starter --suite=responses,vision --embedding-model=text-embedding-3-small
```
## Examples
### Testing against a Server
@ -98,29 +119,25 @@ sentence-transformers/all-MiniLM-L6-v2
The testing system supports three modes controlled by environment variables:
### LIVE Mode (Default)
Tests make real API calls:
### REPLAY Mode (Default)
Uses cached responses instead of making API calls:
```bash
LLAMA_STACK_TEST_INFERENCE_MODE=live pytest tests/integration/
pytest tests/integration/
```
### RECORD Mode
Captures API interactions for later replay:
```bash
LLAMA_STACK_TEST_INFERENCE_MODE=record \
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
pytest tests/integration/inference/test_new_feature.py
```
### REPLAY Mode
Uses cached responses instead of making API calls:
### LIVE Mode
Tests make real API calls; nothing is recorded:
```bash
LLAMA_STACK_TEST_INFERENCE_MODE=replay \
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
pytest tests/integration/
LLAMA_STACK_TEST_INFERENCE_MODE=live pytest tests/integration/
```
Note that right now you must specify the recording directory. This is because different tests use different recording directories and we don't (yet) have a fool-proof way to map a test to a recording directory. We are working on this.
By default, the recording directory is `tests/integration/recordings`. You can override this by setting the `LLAMA_STACK_TEST_RECORDING_DIR` environment variable.
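For example, to record into a dedicated directory for a new feature (the directory name here is illustrative):

```bash
LLAMA_STACK_TEST_INFERENCE_MODE=record \
  LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings/my_feature \
  pytest tests/integration/inference/test_new_feature.py
```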
## Managing Recordings
@ -146,7 +163,6 @@ See the [main testing guide](../README.md#remote-re-recording-recommended) for f
```bash
# Re-record specific tests
LLAMA_STACK_TEST_INFERENCE_MODE=record \
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
pytest -s -v --stack-config=server:starter tests/integration/inference/test_modified.py
```

View file

@ -268,3 +268,58 @@ class TestBatchesIntegration:
deleted_error_file = openai_client.files.delete(final_batch.error_file_id)
assert deleted_error_file.deleted, f"Error file {final_batch.error_file_id} was not deleted successfully"
def test_batch_e2e_completions(self, openai_client, batch_helper, text_model_id):
"""Run an end-to-end batch with a single successful text completion request."""
request_body = {"model": text_model_id, "prompt": "Say completions", "max_tokens": 20}
batch_requests = [
{
"custom_id": "success-1",
"method": "POST",
"url": "/v1/completions",
"body": request_body,
}
]
with batch_helper.create_file(batch_requests) as uploaded_file:
batch = openai_client.batches.create(
input_file_id=uploaded_file.id,
endpoint="/v1/completions",
completion_window="24h",
metadata={"test": "e2e_completions_success"},
)
final_batch = batch_helper.wait_for(
batch.id,
max_wait_time=3 * 60,
expected_statuses={"completed"},
timeout_action="skip",
)
assert final_batch.status == "completed"
assert final_batch.request_counts is not None
assert final_batch.request_counts.total == 1
assert final_batch.request_counts.completed == 1
assert final_batch.output_file_id is not None
output_content = openai_client.files.content(final_batch.output_file_id)
if isinstance(output_content, str):
output_text = output_content
else:
output_text = output_content.content.decode("utf-8")
output_lines = output_text.strip().split("\n")
assert len(output_lines) == 1
result = json.loads(output_lines[0])
assert result["custom_id"] == "success-1"
assert "response" in result
assert result["response"]["status_code"] == 200
deleted_output_file = openai_client.files.delete(final_batch.output_file_id)
assert deleted_output_file.deleted
if final_batch.error_file_id is not None:
deleted_error_file = openai_client.files.delete(final_batch.error_file_id)
assert deleted_error_file.deleted

View file

@ -6,15 +6,17 @@
import inspect
import itertools
import os
import platform
import textwrap
import time
from pathlib import Path
import pytest
from dotenv import load_dotenv
from llama_stack.log import get_logger
from .suites import SUITE_DEFINITIONS
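# NOTE: illustrative shape only; see tests/integration/suites.py for the real entries.
# Each SUITE_DEFINITIONS value is assumed to be a dict with optional "roots"
# (paths under tests/integration to collect) and "defaults" (pytest option dest
# names mapped to default values); pytest_configure and pytest_ignore_collect
# below read those two keys.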
logger = get_logger(__name__, category="tests")
@ -30,6 +32,8 @@ def pytest_runtest_makereport(item, call):
def pytest_sessionstart(session):
# stop macOS from complaining about duplicate OpenMP libraries
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
if "LLAMA_STACK_TEST_INFERENCE_MODE" not in os.environ:
os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = "replay"
def pytest_runtest_teardown(item):
@ -59,9 +63,22 @@ def pytest_configure(config):
key, value = env_var.split("=", 1)
os.environ[key] = value
if platform.system() == "Darwin": # Darwin is the system name for macOS
os.environ["DISABLE_CODE_SANDBOX"] = "1"
logger.info("Setting DISABLE_CODE_SANDBOX=1 for macOS")
suites_raw = config.getoption("--suite")
suites: list[str] = []
if suites_raw:
suites = [p.strip() for p in str(suites_raw).split(",") if p.strip()]
unknown = [p for p in suites if p not in SUITE_DEFINITIONS]
if unknown:
raise pytest.UsageError(
f"Unknown suite(s): {', '.join(unknown)}. Available: {', '.join(sorted(SUITE_DEFINITIONS.keys()))}"
)
for suite in suites:
suite_def = SUITE_DEFINITIONS.get(suite, {})
defaults: dict = suite_def.get("defaults", {})
for dest, value in defaults.items():
current = getattr(config.option, dest, None)
if not current:
setattr(config.option, dest, value)
def pytest_addoption(parser):
@ -103,16 +120,21 @@ def pytest_addoption(parser):
default=384,
help="Output dimensionality of the embedding model to use for testing. Default: 384",
)
parser.addoption(
"--record-responses",
action="store_true",
help="Record new API responses instead of using cached ones.",
)
parser.addoption(
"--report",
help="Path where the test report should be written, e.g. --report=/path/to/report.md",
)
available_suites = ", ".join(sorted(SUITE_DEFINITIONS.keys()))
suite_help = (
"Comma-separated integration test suites to narrow collection and prefill defaults. "
"Available: "
f"{available_suites}. "
"Explicit CLI flags (e.g., --text-model) override suite defaults. "
"Examples: --suite=responses or --suite=responses,vision."
)
parser.addoption("--suite", help=suite_help)
MODEL_SHORT_IDS = {
"meta-llama/Llama-3.2-3B-Instruct": "3B",
@ -195,3 +217,40 @@ def pytest_generate_tests(metafunc):
pytest_plugins = ["tests.integration.fixtures.common"]
def pytest_ignore_collect(path: str, config: pytest.Config) -> bool:
"""Skip collecting paths outside the selected suite roots for speed."""
suites_raw = config.getoption("--suite")
if not suites_raw:
return False
names = [p.strip() for p in str(suites_raw).split(",") if p.strip()]
roots: list[str] = []
for name in names:
suite_def = SUITE_DEFINITIONS.get(name)
if suite_def:
roots.extend(suite_def.get("roots", []))
if not roots:
return False
p = Path(str(path)).resolve()
# Only constrain within tests/integration to avoid ignoring unrelated tests
integration_root = (Path(str(config.rootpath)) / "tests" / "integration").resolve()
if not p.is_relative_to(integration_root):
return False
for r in roots:
rp = (Path(str(config.rootpath)) / r).resolve()
if rp.is_file():
# Allow the exact file and any ancestor directories so pytest can walk into it.
if p == rp:
return False
if p.is_dir() and rp.is_relative_to(p):
return False
else:
# Allow anything inside an allowed directory
if p.is_relative_to(rp):
return False
return True

View file

@ -8,6 +8,7 @@ from io import BytesIO
from unittest.mock import patch
import pytest
import requests
from llama_stack.core.datatypes import User
@ -79,6 +80,88 @@ def test_openai_client_basic_operations(openai_client):
pass # ignore 404
@pytest.mark.xfail(message="expires_after not available on all providers")
def test_expires_after(openai_client):
"""Test uploading a file with expires_after parameter."""
client = openai_client
uploaded_file = None
try:
with BytesIO(b"expires_after test") as file_buffer:
file_buffer.name = "expires_after.txt"
uploaded_file = client.files.create(
file=file_buffer,
purpose="assistants",
expires_after={"anchor": "created_at", "seconds": 4545},
)
assert uploaded_file.expires_at is not None
assert uploaded_file.expires_at == uploaded_file.created_at + 4545
listed = client.files.list()
ids = [f.id for f in listed.data]
assert uploaded_file.id in ids
retrieved = client.files.retrieve(uploaded_file.id)
assert retrieved.id == uploaded_file.id
finally:
if uploaded_file is not None:
try:
client.files.delete(uploaded_file.id)
except Exception:
pass
@pytest.mark.xfail(message="expires_after not available on all providers")
def test_expires_after_requests(openai_client):
"""Upload a file using requests multipart/form-data and bracketed expires_after fields.
This ensures clients that send form fields like `expires_after[anchor]` and
`expires_after[seconds]` are handled by the server.
"""
base_url = f"{openai_client.base_url}files"
uploaded_id = None
try:
files = {"file": ("expires_after_with_requests.txt", BytesIO(b"expires_after via requests"))}
data = {
"purpose": "assistants",
"expires_after[anchor]": "created_at",
"expires_after[seconds]": "4545",
}
session = requests.Session()
request = requests.Request("POST", base_url, files=files, data=data)
prepared = session.prepare_request(request)
resp = session.send(prepared, timeout=30)
resp.raise_for_status()
result = resp.json()
assert result.get("id", "").startswith("file-")
uploaded_id = result["id"]
assert result.get("created_at") is not None
assert result.get("expires_at") == result["created_at"] + 4545
list_resp = requests.get(base_url, timeout=30)
list_resp.raise_for_status()
listed = list_resp.json()
ids = [f["id"] for f in listed.get("data", [])]
assert uploaded_id in ids
retrieve_resp = requests.get(f"{base_url}/{uploaded_id}", timeout=30)
retrieve_resp.raise_for_status()
retrieved = retrieve_resp.json()
assert retrieved["id"] == uploaded_id
finally:
if uploaded_id:
try:
requests.delete(f"{base_url}/{uploaded_id}", timeout=30)
except Exception:
pass
@pytest.mark.xfail(message="User isolation broken for current providers, must be fixed.")
@patch("llama_stack.providers.utils.sqlstore.authorized_sqlstore.get_authenticated_user")
def test_files_authentication_isolation(mock_get_authenticated_user, llama_stack_client):

View file

@ -5,6 +5,8 @@
# the root directory of this source tree.
import time
import pytest
from ..test_cases.test_case import TestCase
@ -35,6 +37,10 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
"remote::sambanova",
"remote::tgi",
"remote::vertexai",
# {"error":{"message":"Unknown request URL: GET /openai/v1/completions. Please check the URL for typos,
# or see the docs at https://console.groq.com/docs/","type":"invalid_request_error","code":"unknown_url"}}
"remote::groq",
"remote::gemini", # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404
):
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
@ -56,6 +62,21 @@ def skip_if_model_doesnt_support_suffix(client_with_models, model_id):
pytest.skip(f"Provider {provider.provider_type} doesn't support suffix.")
def skip_if_doesnt_support_n(client_with_models, model_id):
provider = provider_from_model(client_with_models, model_id)
if provider.provider_type in (
"remote::sambanova",
"remote::ollama",
# https://console.groq.com/docs/openai#currently-unsupported-openai-features
# -> Error code: 400 - {'error': {'message': "'n' : number must be at most 1", 'type': 'invalid_request_error'}}
"remote::groq",
# Error code: 400 - [{'error': {'code': 400, 'message': 'Only one candidate can be specified in the
# current model', 'status': 'INVALID_ARGUMENT'}}]
"remote::gemini",
):
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support n param.")
def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
provider = provider_from_model(client_with_models, model_id)
if provider.provider_type in (
@ -260,10 +281,7 @@ def test_openai_chat_completion_streaming(compat_client, client_with_models, tex
)
def test_openai_chat_completion_streaming_with_n(compat_client, client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
provider = provider_from_model(client_with_models, text_model_id)
if provider.provider_type == "remote::ollama":
pytest.skip(f"Model {text_model_id} hosted by {provider.provider_type} doesn't support n > 1.")
skip_if_doesnt_support_n(client_with_models, text_model_id)
tc = TestCase(test_case)
question = tc["question"]
@ -323,8 +341,15 @@ def test_inference_store(compat_client, client_with_models, text_model_id, strea
response_id = response.id
content = response.choices[0].message.content
responses = client.chat.completions.list(limit=1000)
assert response_id in [r.id for r in responses.data]
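# wait for the response to be stored before asserting on the list results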
tries = 0
while tries < 10:
responses = client.chat.completions.list(limit=1000)
if response_id in [r.id for r in responses.data]:
break
else:
tries += 1
time.sleep(0.1)
assert tries < 10, f"Response {response_id} not found after 1 second"
retrieved_response = client.chat.completions.retrieve(response_id)
assert retrieved_response.id == response_id
@ -388,6 +413,18 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode
response_id = response.id
content = response.choices[0].message.content
# wait for the response to be stored
tries = 0
while tries < 10:
responses = client.chat.completions.list(limit=1000)
if response_id in [r.id for r in responses.data]:
break
else:
tries += 1
time.sleep(0.1)
assert tries < 10, f"Response {response_id} not found after 1 second"
responses = client.chat.completions.list(limit=1000)
assert response_id in [r.id for r in responses.data]

View file

@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -20,15 +20,15 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama-guard3:1b",
"created_at": "2025-08-01T23:12:53.860911Z",
"created_at": "2025-09-03T17:37:35.23084Z",
"done": true,
"done_reason": "stop",
"total_duration": 249137667,
"load_duration": 152509542,
"total_duration": 195981375,
"load_duration": 110522917,
"prompt_eval_count": 216,
"prompt_eval_duration": 71000000,
"prompt_eval_duration": 72393958,
"eval_count": 2,
"eval_duration": 24000000,
"eval_duration": 11843000,
"response": "safe",
"thinking": null,
"context": null

View file

@ -21,7 +21,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:18.033900164Z",
"created_at": "2025-09-03T17:41:43.950283Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -39,7 +39,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:18.213371151Z",
"created_at": "2025-09-03T17:41:43.991122Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -57,7 +57,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:18.387513976Z",
"created_at": "2025-09-03T17:41:44.031378Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -75,7 +75,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:18.564344287Z",
"created_at": "2025-09-03T17:41:44.073098Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -93,7 +93,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:18.746579415Z",
"created_at": "2025-09-03T17:41:44.115961Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -111,7 +111,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:18.923276047Z",
"created_at": "2025-09-03T17:41:44.156517Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -129,7 +129,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:19.099961963Z",
"created_at": "2025-09-03T17:41:44.197079Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -147,7 +147,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:19.275621884Z",
"created_at": "2025-09-03T17:41:44.237565Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -165,7 +165,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:19.452204196Z",
"created_at": "2025-09-03T17:41:44.277755Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -183,7 +183,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:19.626937514Z",
"created_at": "2025-09-03T17:41:44.318476Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -201,7 +201,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:19.805566767Z",
"created_at": "2025-09-03T17:41:44.358628Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -219,7 +219,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:19.985987477Z",
"created_at": "2025-09-03T17:41:44.398984Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -237,7 +237,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:20.166458601Z",
"created_at": "2025-09-03T17:41:44.439232Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -255,7 +255,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:20.343346795Z",
"created_at": "2025-09-03T17:41:44.479478Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -273,7 +273,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:20.525008091Z",
"created_at": "2025-09-03T17:41:44.520202Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -291,7 +291,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:20.709087695Z",
"created_at": "2025-09-03T17:41:44.560517Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -309,7 +309,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:20.887074305Z",
"created_at": "2025-09-03T17:41:44.601592Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -327,15 +327,15 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:21.065244925Z",
"created_at": "2025-09-03T17:41:44.642064Z",
"done": true,
"done_reason": "stop",
"total_duration": 4373531496,
"load_duration": 44438132,
"total_duration": 887142667,
"load_duration": 119331417,
"prompt_eval_count": 56,
"prompt_eval_duration": 1296273199,
"prompt_eval_duration": 74294709,
"eval_count": 18,
"eval_duration": 3032321735,
"eval_duration": 692842791,
"response": "",
"thinking": null,
"context": null

View file

@ -20,15 +20,15 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama-guard3:1b",
"created_at": "2025-08-01T23:13:57.556416Z",
"created_at": "2025-09-03T17:37:47.461886Z",
"done": true,
"done_reason": "stop",
"total_duration": 432363250,
"load_duration": 159296417,
"total_duration": 338927833,
"load_duration": 100895125,
"prompt_eval_count": 223,
"prompt_eval_duration": 257000000,
"prompt_eval_duration": 221583042,
"eval_count": 2,
"eval_duration": 14000000,
"eval_duration": 12341416,
"response": "safe",
"thinking": null,
"context": null

View file

@ -24,7 +24,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-29",
"id": "chatcmpl-414",
"choices": [
{
"delta": {
@ -39,7 +39,7 @@
"logprobs": null
}
],
"created": 1754090031,
"created": 1756921333,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -50,7 +50,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-29",
"id": "chatcmpl-414",
"choices": [
{
"delta": {
@ -65,7 +65,7 @@
"logprobs": null
}
],
"created": 1754090031,
"created": 1756921333,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -76,7 +76,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-29",
"id": "chatcmpl-414",
"choices": [
{
"delta": {
@ -91,7 +91,7 @@
"logprobs": null
}
],
"created": 1754090031,
"created": 1756921333,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -102,7 +102,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-29",
"id": "chatcmpl-414",
"choices": [
{
"delta": {
@ -117,7 +117,7 @@
"logprobs": null
}
],
"created": 1754090031,
"created": 1756921333,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -128,7 +128,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-29",
"id": "chatcmpl-414",
"choices": [
{
"delta": {
@ -143,7 +143,7 @@
"logprobs": null
}
],
"created": 1754090031,
"created": 1756921334,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -154,7 +154,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-29",
"id": "chatcmpl-414",
"choices": [
{
"delta": {
@ -169,7 +169,7 @@
"logprobs": null
}
],
"created": 1754090031,
"created": 1756921334,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -180,7 +180,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-29",
"id": "chatcmpl-414",
"choices": [
{
"delta": {
@ -195,7 +195,7 @@
"logprobs": null
}
],
"created": 1754090031,
"created": 1756921334,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -206,7 +206,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-29",
"id": "chatcmpl-414",
"choices": [
{
"delta": {
@ -221,7 +221,7 @@
"logprobs": null
}
],
"created": 1754090031,
"created": 1756921334,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,

View file

@ -1,7 +1,7 @@
{
"request": {
"method": "POST",
"url": "http://localhost:11434/v1/v1/chat/completions",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
@ -20,14 +20,14 @@
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-368",
"id": "chatcmpl-161",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "Saturn is known for its extensive ring system.",
"content": "The answer is Saturn.",
"refusal": null,
"role": "assistant",
"annotations": null,
@ -37,15 +37,15 @@
}
}
],
"created": 1754081853,
"created": 1756921364,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 11,
"completion_tokens": 6,
"prompt_tokens": 39,
"total_tokens": 50,
"total_tokens": 45,
"completion_tokens_details": null,
"prompt_tokens_details": null
}

View file

@ -20,15 +20,15 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama-guard3:1b",
"created_at": "2025-08-01T23:12:51.682357Z",
"created_at": "2025-09-03T17:37:33.473237Z",
"done": true,
"done_reason": "stop",
"total_duration": 238161000,
"load_duration": 72494750,
"total_duration": 279025042,
"load_duration": 162673250,
"prompt_eval_count": 212,
"prompt_eval_duration": 87000000,
"prompt_eval_duration": 73595834,
"eval_count": 5,
"eval_duration": 74000000,
"eval_duration": 41950291,
"response": "unsafe\nS8",
"thinking": null,
"context": null

View file

@ -1,7 +1,7 @@
{
"request": {
"method": "POST",
"url": "http://localhost:11434/v1/v1/chat/completions",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
@ -20,14 +20,14 @@
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-876",
"id": "chatcmpl-974",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "I'm afraid I don't have a built-in ability to directly interface with or \"test\" OpenAI models, including the original GPT-1 model. However, I can explain how you might approach this task:\n\nThe OpenAI GPT-1 is a large transformer-based language model that was trained on a massive dataset of text and achieved state-of-the-art results in various natural language processing tasks.\n\nTo test or evaluate the performance of a model like GPT-1, you would typically follow these steps:\n\n1. **Get access to the OpenAI API**: The OpenAI API provides a way for developers to interact with the GPT-1 model programmatically. You can sign up for an API key on the OpenAI website.\n2. **Choose a testing platform or environment**: You'll need a compute platform that supports the necessary algorithms and data structures to run inference on the GPT-1 model. Some popular options include AWS, Google Cloud, or Azure Compute Virtual Machines.\n3. **Prepare your test input data**: This will involve creating text inputs in the format expected by the OpenAI API (i.e., a JSON object containing the text to be processed).\n4. **Use the OpenAI Python library or SDK**: The OpenAI Python library provides an easy-to-use interface for interacting with the GPT-1 model through the API.\n\nHere's some example code that demonstrates how you might use the OpenAI Flask API to test a single input:\n\n```python\nfrom flask import Flask, request, jsonify\nimport json\n\napp = Flask(__name__)\n\n@ app . route ( '/ /gpt-en ', ' Text ', methods = ['POST'])\ndef gpt_en () -> Json :\n data = request . get_json ()\n if not data or \"message\" in ( data ):\n return None , 400 , { ' error' : \"Input must be a text string.\" }\n response = []\n while True:\n message = \"\"\n for token in data [\"input\"]:\n response_text = f\"{data['prompt']} {token}\"\n data[\"input\"] = [response_text]\n new_response = gpt_en()(data)\n if all([not item or not isinstance(item, dict) for item in new_response]):\n break\n\n message = json . dumps ({}\"text\": response_text})\n response.append(message)\n\n return jsonify ({\"output\": response}), 200 , {}\n\nif __name__ == \"__main__\":\n app.run(debug=True)\n```\n\n5. **Evaluate the output**: Once you have processed your test input data using the GPT-1 model, you can evaluate the accuracy of the generated responses.\n\nKeep in mind that this is just a basic example to illustrate how you might approach testing the OpenAI GPT-1 model.",
"content": "I'm happy to help you test the OpenAI API, however I can not access the API.\n\nInstead why don't we follow these steps:\n\n* Check documentation\n* Contact support\n* Reach out to their community forum. \n\nLet me know if I can be of any additional assistance",
"refusal": null,
"role": "assistant",
"annotations": null,
@ -37,15 +37,15 @@
}
}
],
"created": 1754510050,
"created": 1756921202,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 567,
"completion_tokens": 61,
"prompt_tokens": 31,
"total_tokens": 598,
"total_tokens": 92,
"completion_tokens_details": null,
"prompt_tokens_details": null
}

View file

@ -20,15 +20,15 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama-guard3:1b",
"created_at": "2025-08-01T23:12:52.919624Z",
"created_at": "2025-09-03T17:37:34.308033Z",
"done": true,
"done_reason": "stop",
"total_duration": 201956834,
"load_duration": 105132584,
"total_duration": 200296000,
"load_duration": 115974708,
"prompt_eval_count": 212,
"prompt_eval_duration": 75000000,
"prompt_eval_duration": 72173459,
"eval_count": 2,
"eval_duration": 20000000,
"eval_duration": 11536750,
"response": "safe",
"thinking": null,
"context": null

View file

@ -40,7 +40,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-457",
"id": "chatcmpl-921",
"choices": [
{
"delta": {
@ -55,7 +55,7 @@
"logprobs": null
}
],
"created": 1754090032,
"created": 1756920971,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -66,7 +66,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-457",
"id": "chatcmpl-921",
"choices": [
{
"delta": {
@ -81,7 +81,7 @@
"logprobs": null
}
],
"created": 1754090032,
"created": 1756920971,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -92,7 +92,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-457",
"id": "chatcmpl-921",
"choices": [
{
"delta": {
@ -107,7 +107,7 @@
"logprobs": null
}
],
"created": 1754090032,
"created": 1756920971,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -118,7 +118,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-457",
"id": "chatcmpl-921",
"choices": [
{
"delta": {
@ -133,7 +133,7 @@
"logprobs": null
}
],
"created": 1754090032,
"created": 1756920971,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -144,7 +144,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-457",
"id": "chatcmpl-921",
"choices": [
{
"delta": {
@ -159,7 +159,7 @@
"logprobs": null
}
],
"created": 1754090032,
"created": 1756920971,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -170,7 +170,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-457",
"id": "chatcmpl-921",
"choices": [
{
"delta": {
@ -185,7 +185,7 @@
"logprobs": null
}
],
"created": 1754090032,
"created": 1756920971,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -196,7 +196,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-457",
"id": "chatcmpl-921",
"choices": [
{
"delta": {
@ -211,7 +211,7 @@
"logprobs": null
}
],
"created": 1754090032,
"created": 1756920971,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
@ -222,7 +222,7 @@
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "chatcmpl-457",
"id": "chatcmpl-921",
"choices": [
{
"delta": {
@ -237,7 +237,7 @@
"logprobs": null
}
],
"created": 1754090032,
"created": 1756920971,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,

View file

@ -20,15 +20,15 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama-guard3:1b",
"created_at": "2025-08-01T23:12:53.580806Z",
"created_at": "2025-09-03T17:37:34.994704Z",
"done": true,
"done_reason": "stop",
"total_duration": 205732750,
"load_duration": 98967000,
"total_duration": 339570875,
"load_duration": 262794125,
"prompt_eval_count": 213,
"prompt_eval_duration": 86000000,
"prompt_eval_duration": 64061000,
"eval_count": 2,
"eval_duration": 18000000,
"eval_duration": 11839042,
"response": "safe",
"thinking": null,
"context": null

View file

@ -20,15 +20,15 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama-guard3:1b",
"created_at": "2025-08-01T23:12:52.354566Z",
"created_at": "2025-09-03T17:37:33.769233Z",
"done": true,
"done_reason": "stop",
"total_duration": 605192500,
"load_duration": 457087166,
"total_duration": 253836584,
"load_duration": 138624959,
"prompt_eval_count": 210,
"prompt_eval_duration": 63000000,
"prompt_eval_duration": 69496125,
"eval_count": 5,
"eval_duration": 84000000,
"eval_duration": 45062833,
"response": "unsafe\nS12",
"thinking": null,
"context": null

View file

@ -20,15 +20,15 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama-guard3:1b",
"created_at": "2025-08-01T23:12:52.686478Z",
"created_at": "2025-09-03T17:37:34.074233Z",
"done": true,
"done_reason": "stop",
"total_duration": 304136208,
"load_duration": 155977000,
"total_duration": 270746375,
"load_duration": 156423042,
"prompt_eval_count": 213,
"prompt_eval_duration": 71000000,
"prompt_eval_duration": 70338083,
"eval_count": 5,
"eval_duration": 76000000,
"eval_duration": 43379167,
"response": "unsafe\nS2",
"thinking": null,
"context": null

View file

@ -20,15 +20,15 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama-guard3:1b",
"created_at": "2025-08-01T23:12:51.186501Z",
"created_at": "2025-09-03T17:37:32.84197Z",
"done": true,
"done_reason": "stop",
"total_duration": 3146184459,
"load_duration": 2533467917,
"total_duration": 21572898667,
"load_duration": 21155275042,
"prompt_eval_count": 212,
"prompt_eval_duration": 526000000,
"prompt_eval_duration": 371898125,
"eval_count": 5,
"eval_duration": 83000000,
"eval_duration": 43290458,
"response": "unsafe\nS1",
"thinking": null,
"context": null

View file

@ -20,15 +20,15 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama-guard3:1b",
"created_at": "2025-08-01T23:12:53.332041Z",
"created_at": "2025-09-03T17:37:34.607413Z",
"done": true,
"done_reason": "stop",
"total_duration": 365895333,
"load_duration": 257825208,
"total_duration": 267812042,
"load_duration": 181570000,
"prompt_eval_count": 213,
"prompt_eval_duration": 78000000,
"prompt_eval_duration": 73947375,
"eval_count": 2,
"eval_duration": 28000000,
"eval_duration": 11708000,
"response": "safe",
"thinking": null,
"context": null

View file

@ -22,15 +22,15 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-04T22:55:05.685988Z",
"created_at": "2025-09-03T17:36:13.821929Z",
"done": true,
"done_reason": "stop",
"total_duration": 14128980625,
"load_duration": 7220159208,
"total_duration": 1907912167,
"load_duration": 90979292,
"prompt_eval_count": 18,
"prompt_eval_duration": 4658000000,
"prompt_eval_duration": 77350291,
"eval_count": 43,
"eval_duration": 2224000000,
"eval_duration": 1738568334,
"response": " _______.\n\nThe best answer is blue. The traditional nursery rhyme goes like this:\n\nRoses are red,\nViolets are blue,\nSugar is sweet,\nAnd so are you! (Or something similar.)",
"thinking": null,
"context": null

View file

@ -20,15 +20,15 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:50:06.140190726Z",
"created_at": "2025-09-03T17:39:38.236797Z",
"done": true,
"done_reason": "stop",
"total_duration": 5213341378,
"load_duration": 43943569,
"total_duration": 1296281500,
"load_duration": 283393917,
"prompt_eval_count": 23,
"prompt_eval_duration": 1049424427,
"prompt_eval_duration": 75453042,
"eval_count": 24,
"eval_duration": 4119422888,
"eval_duration": 936860125,
"response": "Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004.",
"thinking": null,
"context": null

View file

@ -0,0 +1,422 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/embeddings",
"headers": {},
"body": {
"model": "all-minilm:l6-v2",
"input": [
"How do systems learn automatically?"
],
"encoding_format": "float"
},
"endpoint": "/v1/embeddings",
"model": "all-minilm:l6-v2"
},
"response": {
"body": {
"__type__": "openai.types.create_embedding_response.CreateEmbeddingResponse",
"__data__": {
"data": [
{
"embedding": [
0.042460807,
-0.06189971,
-0.0784711,
0.0064329687,
0.03129365,
0.00807445,
0.05801836,
0.025447326,
0.016402787,
0.045995634,
-0.028924342,
0.04451832,
0.05686613,
-0.015340794,
-0.07020505,
-0.057178136,
-0.07683263,
0.006748679,
0.0043323045,
-0.123651944,
0.0031534543,
-0.03258051,
-0.02936216,
0.024140852,
-0.028559243,
0.10224467,
0.0021632623,
-0.006975691,
0.025292527,
-0.055500276,
0.031231727,
-0.0070274337,
0.08430815,
-0.028431177,
-0.083029,
0.009555893,
-0.020029299,
-0.00243229,
-0.00768719,
-0.023077851,
-0.09293533,
-0.042625993,
-0.020000124,
0.008240663,
0.060970567,
0.050315727,
-0.0510085,
-0.008543903,
-0.030227834,
-0.03582846,
-0.17836656,
-0.047279052,
0.033892106,
0.031623542,
-0.008832113,
0.10480918,
0.033559043,
0.090348184,
-0.015757555,
-0.0125672715,
-0.084686965,
-0.114781834,
-0.13755985,
0.021652374,
0.047834594,
0.043243896,
0.008659893,
0.038724966,
0.046716973,
-0.077413626,
-0.04887495,
0.031287406,
0.022356613,
0.00043283988,
0.052321073,
-0.012254071,
-0.035172574,
-0.00825216,
-0.008866574,
-0.034267236,
-0.04576201,
0.002467568,
-0.040877618,
0.08047682,
0.09472728,
0.0413438,
0.0057974122,
0.044982508,
0.025369909,
0.006618073,
0.010467276,
-0.07960384,
-0.03108485,
-0.03528749,
0.01831391,
0.053473305,
0.06568304,
-0.07259002,
0.02523736,
0.10520362,
0.035732146,
0.028157586,
0.011687256,
0.044207197,
0.012604437,
0.0018819098,
0.03926183,
0.043135095,
0.09784739,
-0.08801336,
-0.06060836,
0.02681984,
0.0041358666,
0.033492945,
0.011799116,
0.009551661,
-0.0095491735,
-0.021212189,
-0.008917248,
0.029352615,
-0.012693442,
-0.019269384,
0.009901157,
-0.00812101,
0.018603146,
-0.0007501193,
-0.056115113,
-3.8018077e-33,
0.020848714,
0.0047160466,
0.019726405,
0.06024251,
-0.0685974,
-0.07497267,
0.007997452,
-0.047339544,
0.057801835,
0.049544968,
0.01878086,
0.03274472,
0.017663997,
0.07483022,
0.02496901,
-0.011843339,
-0.11212756,
0.0070379525,
0.028099466,
-0.01746246,
0.08173482,
-0.007920462,
0.032095373,
-0.12300146,
0.033773854,
0.025873141,
-0.0045020077,
0.079493225,
0.0040725255,
0.03305898,
0.008061117,
0.0134422695,
-0.03292251,
0.031554114,
0.04013794,
0.0014983519,
0.030762345,
0.029481992,
0.041350223,
-0.047438618,
0.03944708,
-0.07526981,
0.037927423,
-0.026016014,
0.016933467,
0.0136799775,
0.0071263947,
-0.05386736,
-0.07443268,
-0.006070775,
0.024427462,
-0.039844982,
-0.020661902,
-0.033354662,
0.009005565,
0.12111172,
-0.028260944,
-0.036192853,
-0.021332363,
0.05333571,
0.05161245,
-0.01204843,
0.035563566,
0.05408247,
0.060722187,
0.07159865,
0.04299143,
0.008544481,
0.07421879,
0.00841512,
-0.036342908,
-0.008549791,
-0.08816386,
-0.049075164,
0.00029373015,
-0.05127952,
0.03586739,
-0.030380003,
-0.012642127,
0.018771531,
0.01711824,
-0.06644723,
0.023793438,
0.0010271219,
-0.01939443,
-0.053452212,
-0.017060323,
-0.062207118,
-0.05962535,
-0.012172617,
-0.013190802,
-0.037036054,
0.00082622556,
0.098088354,
0.024690514,
2.1767905e-33,
-0.010088812,
-0.016811697,
-0.042140447,
0.08837209,
-0.028899776,
-0.0048947735,
-0.082139015,
0.029238816,
-0.043079354,
-0.014153092,
-0.028387645,
0.025998218,
-0.017625,
0.046511114,
-0.005768211,
0.030010609,
0.011375536,
0.017426634,
0.055062976,
0.032230247,
-0.07995765,
0.032486655,
-0.060016844,
-0.011561194,
0.010211269,
0.046528235,
0.001191399,
0.0786961,
-0.0446158,
0.032789085,
0.0023115936,
-0.03886269,
-0.017663589,
0.07913024,
-0.004583343,
0.043521065,
-0.031589273,
0.008867868,
-0.05013296,
0.068929516,
0.043675046,
0.019968731,
-0.08471742,
-0.046864275,
-0.0068198936,
-0.026138468,
-0.05107216,
0.054374695,
0.03069186,
-0.010925094,
0.04721093,
-0.017387696,
-0.020754937,
-0.081763394,
-0.027709637,
0.035980806,
0.05396534,
0.044874854,
0.059699643,
0.041227758,
-0.06664364,
-0.09201654,
0.008915574,
0.025849758,
-0.038651932,
-0.0044070315,
-0.052066546,
0.027435115,
0.012089562,
0.048306923,
0.059854515,
0.097325735,
-0.053612895,
-0.07639326,
0.015773866,
-0.0444848,
-0.13214406,
-0.0702488,
-0.10134438,
-0.11905995,
-0.027714504,
0.006891868,
-0.0053650527,
0.054135524,
-0.111159205,
0.07835098,
0.03506018,
0.016036613,
0.021490784,
-0.061526407,
0.007425222,
0.04833579,
-0.01361202,
0.012450488,
-0.12729599,
-1.4009424e-08,
-0.040908325,
-0.01596458,
0.060048707,
0.03804525,
0.0663794,
0.04727275,
-0.016112225,
0.09687414,
-0.04424251,
-0.028799534,
-0.01294642,
0.013026413,
0.022404836,
0.04713173,
0.06402557,
0.12130648,
0.06062839,
0.10218965,
-0.0757528,
-0.023806982,
0.12489501,
-0.045460615,
0.09545599,
0.021262301,
0.03731495,
-0.075220875,
-0.0026194793,
0.0472452,
0.048499025,
0.12358729,
0.017998053,
0.013811017,
-0.035893846,
-0.051789004,
0.06182457,
0.05160056,
0.008895317,
-0.12500942,
0.016453298,
-0.08590811,
-0.071096726,
0.06987216,
-0.036072273,
-0.0053715096,
-0.048762616,
0.00081640907,
-0.021502526,
-0.061078615,
0.002485032,
-0.032720752,
0.045743283,
0.038934175,
-0.024666062,
0.025897244,
0.10301431,
-0.013001504,
0.04783332,
-0.07114252,
0.046031926,
0.080549754,
-0.10302451,
0.08449227,
0.028010191,
-0.03697792
],
"index": 0,
"object": "embedding"
}
],
"model": "all-minilm:l6-v2",
"object": "list",
"usage": {
"prompt_tokens": 6,
"total_tokens": 6
}
}
},
"is_streaming": false
}
}

View file

@ -20,15 +20,15 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-04T22:55:11.15982Z",
"created_at": "2025-09-03T17:36:17.894986Z",
"done": true,
"done_reason": "stop",
"total_duration": 498612042,
"load_duration": 71411834,
"total_duration": 363397458,
"load_duration": 86692791,
"prompt_eval_count": 23,
"prompt_eval_duration": 102000000,
"prompt_eval_duration": 68658541,
"eval_count": 6,
"eval_duration": 323000000,
"eval_duration": 207389084,
"response": "Humans live on Earth.",
"thinking": null,
"context": null

View file

@ -0,0 +1,422 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/embeddings",
"headers": {},
"body": {
"model": "all-minilm:l6-v2",
"input": [
"This is a test file 1"
],
"encoding_format": "float"
},
"endpoint": "/v1/embeddings",
"model": "all-minilm:l6-v2"
},
"response": {
"body": {
"__type__": "openai.types.create_embedding_response.CreateEmbeddingResponse",
"__data__": {
"data": [
{
"embedding": [
-0.055990793,
0.076004684,
-0.09247725,
0.014340361,
0.058780864,
-0.032434482,
0.020954052,
0.028818125,
-0.06591213,
0.013541593,
0.12999941,
0.004603084,
-0.0069239275,
-0.055457443,
-0.047553156,
-0.029139794,
-0.12236376,
-0.05360872,
-0.014706594,
0.05984688,
0.034442738,
0.02076038,
-0.048697792,
0.0135388365,
0.058592733,
-0.003076384,
-0.031565297,
0.082541116,
-0.031259205,
-0.12057633,
0.038319625,
0.06574785,
0.06415721,
0.038382582,
0.12570712,
0.03108174,
0.10821103,
-0.0019794356,
-0.024704305,
0.028765837,
0.01268161,
-0.039844505,
0.043253522,
-0.015898596,
-0.0135526005,
-0.0050831717,
-0.007911988,
0.039783813,
0.0036548872,
-0.033632487,
-0.058547974,
0.0048877494,
-0.089586094,
-0.010457663,
0.059202507,
-0.020414542,
0.014278556,
0.013986488,
-0.0046022516,
0.0383391,
0.0048145773,
0.029772853,
-0.020863408,
0.018640704,
0.12422993,
-0.023236223,
-0.040323637,
-0.023598222,
-0.007448043,
-0.09083128,
-0.16859712,
0.01012451,
-0.035808884,
0.010595173,
-0.02050494,
0.0020821376,
-0.10925222,
0.00793264,
0.048889533,
-0.11391199,
-0.06072707,
-0.13435508,
0.0063265716,
-0.008838073,
-0.03153269,
0.099169336,
0.055310693,
0.0068571265,
-0.023463152,
-0.0031599961,
0.036782328,
0.014336826,
0.022220163,
0.047114056,
0.007079763,
0.06806425,
0.01851431,
0.040882625,
0.055058856,
0.09488346,
-0.015833577,
-7.924328e-05,
0.010821554,
0.09177704,
-0.07464829,
-0.06471165,
0.07013805,
-0.04499751,
0.057702336,
-0.0260911,
0.006323043,
-0.09500501,
-0.010549514,
-0.07887475,
0.039744847,
-0.04154404,
-0.055268157,
0.07540271,
-0.04667509,
0.036143072,
0.080297194,
-0.036381353,
-0.03477274,
0.01701203,
-0.047007203,
-0.06519774,
0.062141683,
-4.222482e-33,
-0.0017580023,
-0.09383388,
-0.02982657,
0.1257841,
0.03802007,
-0.03654342,
0.0060920226,
0.05906885,
-0.11074452,
0.005664566,
-0.0259852,
-0.074819505,
0.008342821,
0.027451068,
-0.05248069,
0.02401768,
-0.004380289,
0.039321493,
-0.04213744,
-0.027290314,
0.054677974,
0.02707243,
-0.03329442,
-0.060589895,
-0.050737355,
0.017969057,
-0.0035060972,
-0.04666249,
0.073946096,
0.01333894,
-0.0033873583,
-0.046544433,
-0.060105033,
0.03406923,
0.001542676,
0.039177947,
0.03989323,
-0.012346489,
-0.030511485,
-0.0019157606,
-0.014608986,
-0.012997742,
0.019522104,
-0.022349002,
0.074362256,
-0.053366993,
-0.023993475,
0.029225096,
0.027534606,
0.015111057,
-0.020442221,
0.043327376,
0.019660354,
0.017330697,
-0.0035011724,
0.019482937,
-0.0003428041,
0.0004143988,
-0.005117252,
0.06624799,
0.027922852,
0.041020587,
-0.067166425,
0.028737254,
-0.03478325,
-0.055551115,
-0.032713737,
-0.08099247,
0.09216284,
0.06395264,
-0.049168136,
-0.039908994,
0.036915958,
-0.001602359,
0.00033041168,
-0.026015632,
-0.005999889,
0.05474541,
-0.09568287,
-0.05186289,
-0.048838183,
-0.08639551,
-0.034023147,
-0.033257127,
-0.05651867,
-0.051131375,
0.00809173,
-0.08581851,
0.06507323,
-0.085427366,
0.027997404,
0.029847065,
-0.031673994,
-0.08560956,
0.1017672,
2.1855676e-33,
0.01160785,
0.077607885,
-0.017380483,
0.005239329,
0.0009684126,
0.06543702,
0.07256893,
-0.044318836,
-0.04749324,
0.14031002,
-0.025741624,
0.0057860985,
0.040946104,
-0.054880083,
0.074413285,
-0.023610368,
0.018364722,
-0.060585637,
-0.044149306,
0.0027854694,
-0.04580664,
0.1172219,
0.10268574,
0.07907412,
-0.0466143,
0.018618405,
0.029834948,
0.037265483,
0.02273822,
-0.0026589038,
0.041726097,
0.06439532,
-0.089163445,
0.018188318,
0.024064727,
-0.096389584,
0.08642254,
-0.05389359,
0.01923105,
0.045092683,
0.045125954,
0.09655961,
0.014908797,
0.059611585,
0.03066662,
0.05882299,
0.111484826,
0.016632542,
0.011590394,
-0.023702666,
-0.008617484,
-0.055030316,
0.047606383,
-0.014632687,
-0.014156344,
0.069926,
0.032047603,
0.042642817,
-0.053942375,
0.031047028,
0.009216673,
0.033024028,
-0.019033706,
0.005568194,
-0.014985451,
-0.09193244,
-0.03210824,
0.015367608,
0.029150328,
0.01250386,
-0.004827391,
0.023345906,
-0.028271332,
-0.08454125,
0.051068563,
-0.0133641455,
-0.029022738,
-0.02258452,
0.010884119,
-0.009810021,
0.049751773,
-0.0032637494,
-0.038813565,
0.027924104,
0.017925078,
0.005337612,
0.058691237,
0.09577674,
-0.014308608,
0.006972794,
-0.02733344,
0.06912433,
0.05727631,
0.03206042,
0.0042422824,
-1.6766318e-08,
-0.036354303,
-0.09146416,
-0.026319364,
-0.007941995,
-0.024127059,
0.09896698,
-0.04723083,
-0.03767135,
-0.029419973,
-0.022513283,
0.04125822,
-0.0011487947,
-0.05570366,
0.020679709,
-0.038118906,
-0.0524994,
-0.02624128,
-0.05336954,
-0.040593866,
-0.0073642326,
-0.0014442836,
0.02714257,
0.027141048,
0.00932513,
-0.00026505854,
0.038233075,
0.037096914,
0.08405413,
-0.06340637,
-0.014856458,
0.05038612,
0.06703033,
0.027668556,
-0.04360097,
-0.012041474,
0.08500689,
0.111594744,
0.1046117,
0.019726463,
-0.0003025109,
-0.04110389,
0.009575226,
-0.05285304,
-0.0026365265,
-0.031144748,
-0.08860188,
-0.06762232,
-0.07451522,
-0.053012833,
-0.09560941,
-0.05273455,
0.013032144,
0.0029190276,
0.041905046,
-0.04522114,
0.016730292,
0.017214278,
0.021578068,
-0.03718778,
0.02353425,
0.052041385,
0.06444499,
0.02387539,
-0.025236009
],
"index": 0,
"object": "embedding"
}
],
"model": "all-minilm:l6-v2",
"object": "list",
"usage": {
"prompt_tokens": 6,
"total_tokens": 6
}
}
},
"is_streaming": false
}
}

View file

@ -22,7 +22,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:01.887809Z",
"created_at": "2025-09-03T17:37:50.436472Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -40,7 +40,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:01.942369Z",
"created_at": "2025-09-03T17:37:50.478138Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -58,7 +58,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:01.99605Z",
"created_at": "2025-09-03T17:37:50.519952Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -76,7 +76,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:02.049974Z",
"created_at": "2025-09-03T17:37:50.561433Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -94,7 +94,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:02.102027Z",
"created_at": "2025-09-03T17:37:50.603624Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -112,7 +112,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:02.158416Z",
"created_at": "2025-09-03T17:37:50.645851Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -130,7 +130,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:02.211753Z",
"created_at": "2025-09-03T17:37:50.688403Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -148,7 +148,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:02.265564Z",
"created_at": "2025-09-03T17:37:50.72991Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -166,7 +166,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:02.31618Z",
"created_at": "2025-09-03T17:37:50.771635Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -184,7 +184,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:02.370325Z",
"created_at": "2025-09-03T17:37:50.813711Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -202,7 +202,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:02.424667Z",
"created_at": "2025-09-03T17:37:50.856201Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -220,7 +220,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:02.47913Z",
"created_at": "2025-09-03T17:37:50.899048Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -238,15 +238,15 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:02.536984Z",
"created_at": "2025-09-03T17:37:50.94069Z",
"done": true,
"done_reason": "stop",
"total_duration": 1042724125,
"load_duration": 86161375,
"total_duration": 688370708,
"load_duration": 107469833,
"prompt_eval_count": 399,
"prompt_eval_duration": 305000000,
"prompt_eval_duration": 74988334,
"eval_count": 13,
"eval_duration": 650000000,
"eval_duration": 505216458,
"response": "",
"thinking": null,
"context": null

View file

@ -22,7 +22,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:11.938867Z",
"created_at": "2025-09-03T17:37:56.566151Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -40,7 +40,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:11.991247Z",
"created_at": "2025-09-03T17:37:56.609308Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -58,7 +58,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.043953Z",
"created_at": "2025-09-03T17:37:56.651314Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -76,7 +76,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.096001Z",
"created_at": "2025-09-03T17:37:56.693185Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -94,7 +94,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.150454Z",
"created_at": "2025-09-03T17:37:56.734643Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -112,7 +112,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.201249Z",
"created_at": "2025-09-03T17:37:56.776343Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -130,7 +130,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.252534Z",
"created_at": "2025-09-03T17:37:56.81705Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -148,7 +148,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.30063Z",
"created_at": "2025-09-03T17:37:56.857959Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -166,7 +166,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.351034Z",
"created_at": "2025-09-03T17:37:56.899424Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -184,7 +184,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.405032Z",
"created_at": "2025-09-03T17:37:56.939218Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -202,7 +202,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.462645Z",
"created_at": "2025-09-03T17:37:56.980065Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -220,7 +220,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.520337Z",
"created_at": "2025-09-03T17:37:57.02214Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -238,7 +238,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.575809Z",
"created_at": "2025-09-03T17:37:57.0628Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -256,7 +256,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.633724Z",
"created_at": "2025-09-03T17:37:57.106061Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -274,7 +274,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.683133Z",
"created_at": "2025-09-03T17:37:57.1492Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -292,7 +292,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.734309Z",
"created_at": "2025-09-03T17:37:57.190075Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -310,7 +310,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.785917Z",
"created_at": "2025-09-03T17:37:57.23178Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -328,7 +328,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.835705Z",
"created_at": "2025-09-03T17:37:57.272738Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -346,7 +346,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.886509Z",
"created_at": "2025-09-03T17:37:57.313855Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -364,7 +364,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.937134Z",
"created_at": "2025-09-03T17:37:57.354964Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -382,7 +382,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:12.988532Z",
"created_at": "2025-09-03T17:37:57.395971Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -400,7 +400,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.041798Z",
"created_at": "2025-09-03T17:37:57.438471Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -418,7 +418,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.095443Z",
"created_at": "2025-09-03T17:37:57.479796Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -436,7 +436,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.151402Z",
"created_at": "2025-09-03T17:37:57.520641Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -454,7 +454,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.203462Z",
"created_at": "2025-09-03T17:37:57.561511Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -472,7 +472,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.254567Z",
"created_at": "2025-09-03T17:37:57.602875Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -490,7 +490,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.305865Z",
"created_at": "2025-09-03T17:37:57.643406Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -508,7 +508,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.357658Z",
"created_at": "2025-09-03T17:37:57.684279Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -526,7 +526,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.407773Z",
"created_at": "2025-09-03T17:37:57.725699Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -544,7 +544,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.458919Z",
"created_at": "2025-09-03T17:37:57.766658Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -562,7 +562,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.510456Z",
"created_at": "2025-09-03T17:37:57.80738Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -580,7 +580,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.565948Z",
"created_at": "2025-09-03T17:37:57.848466Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -598,7 +598,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.619155Z",
"created_at": "2025-09-03T17:37:57.889056Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -616,7 +616,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.672754Z",
"created_at": "2025-09-03T17:37:57.931554Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -634,7 +634,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.729473Z",
"created_at": "2025-09-03T17:37:57.974754Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -652,7 +652,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.788666Z",
"created_at": "2025-09-03T17:37:58.016978Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -670,7 +670,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.850575Z",
"created_at": "2025-09-03T17:37:58.057942Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -688,7 +688,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.904807Z",
"created_at": "2025-09-03T17:37:58.099015Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -706,7 +706,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:13.958524Z",
"created_at": "2025-09-03T17:37:58.140531Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -724,7 +724,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.011742Z",
"created_at": "2025-09-03T17:37:58.181382Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -742,7 +742,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.064933Z",
"created_at": "2025-09-03T17:37:58.223318Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -760,7 +760,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.116454Z",
"created_at": "2025-09-03T17:37:58.26358Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -778,7 +778,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.172682Z",
"created_at": "2025-09-03T17:37:58.305496Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -796,7 +796,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.227654Z",
"created_at": "2025-09-03T17:37:58.347254Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -814,7 +814,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.282068Z",
"created_at": "2025-09-03T17:37:58.390044Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -832,7 +832,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.334565Z",
"created_at": "2025-09-03T17:37:58.430867Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -850,7 +850,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.383532Z",
"created_at": "2025-09-03T17:37:58.471376Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -868,7 +868,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.432138Z",
"created_at": "2025-09-03T17:37:58.51208Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -886,7 +886,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.480995Z",
"created_at": "2025-09-03T17:37:58.553226Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -904,7 +904,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.531968Z",
"created_at": "2025-09-03T17:37:58.594787Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -922,7 +922,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.584044Z",
"created_at": "2025-09-03T17:37:58.63466Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -940,7 +940,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.635691Z",
"created_at": "2025-09-03T17:37:58.674628Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -958,7 +958,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.68837Z",
"created_at": "2025-09-03T17:37:58.714616Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -976,7 +976,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.73985Z",
"created_at": "2025-09-03T17:37:58.754906Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -994,7 +994,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.792412Z",
"created_at": "2025-09-03T17:37:58.795048Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1012,7 +1012,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.845872Z",
"created_at": "2025-09-03T17:37:58.835297Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1030,7 +1030,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.900102Z",
"created_at": "2025-09-03T17:37:58.875738Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1048,7 +1048,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:14.954589Z",
"created_at": "2025-09-03T17:37:58.91604Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1066,7 +1066,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.006629Z",
"created_at": "2025-09-03T17:37:58.956596Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1084,7 +1084,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.058561Z",
"created_at": "2025-09-03T17:37:58.996664Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1102,7 +1102,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.111954Z",
"created_at": "2025-09-03T17:37:59.037796Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1120,7 +1120,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.169173Z",
"created_at": "2025-09-03T17:37:59.078586Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1138,7 +1138,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.222569Z",
"created_at": "2025-09-03T17:37:59.119448Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1156,7 +1156,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.275795Z",
"created_at": "2025-09-03T17:37:59.160318Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1174,7 +1174,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.3327Z",
"created_at": "2025-09-03T17:37:59.201852Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1192,7 +1192,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.389931Z",
"created_at": "2025-09-03T17:37:59.243763Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1210,7 +1210,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.442349Z",
"created_at": "2025-09-03T17:37:59.284948Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1228,7 +1228,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.494175Z",
"created_at": "2025-09-03T17:37:59.325598Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1246,7 +1246,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.545764Z",
"created_at": "2025-09-03T17:37:59.366289Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1264,7 +1264,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.599099Z",
"created_at": "2025-09-03T17:37:59.406764Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1282,7 +1282,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.649852Z",
"created_at": "2025-09-03T17:37:59.447922Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1300,7 +1300,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.698222Z",
"created_at": "2025-09-03T17:37:59.488486Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1318,7 +1318,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.747168Z",
"created_at": "2025-09-03T17:37:59.529Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1336,7 +1336,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.797196Z",
"created_at": "2025-09-03T17:37:59.569417Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1354,7 +1354,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.845587Z",
"created_at": "2025-09-03T17:37:59.610542Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1372,7 +1372,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.897171Z",
"created_at": "2025-09-03T17:37:59.651411Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1390,7 +1390,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.944524Z",
"created_at": "2025-09-03T17:37:59.69241Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1408,7 +1408,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:15.994467Z",
"created_at": "2025-09-03T17:37:59.732339Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1426,7 +1426,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.045224Z",
"created_at": "2025-09-03T17:37:59.772462Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1444,7 +1444,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.093853Z",
"created_at": "2025-09-03T17:37:59.812507Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1462,7 +1462,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.144847Z",
"created_at": "2025-09-03T17:37:59.852762Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1480,7 +1480,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.197888Z",
"created_at": "2025-09-03T17:37:59.892984Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1498,7 +1498,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.250854Z",
"created_at": "2025-09-03T17:37:59.933555Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1516,7 +1516,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.301995Z",
"created_at": "2025-09-03T17:37:59.973778Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1534,7 +1534,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.352508Z",
"created_at": "2025-09-03T17:38:00.014923Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1552,7 +1552,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.40259Z",
"created_at": "2025-09-03T17:38:00.057464Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1570,7 +1570,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.453514Z",
"created_at": "2025-09-03T17:38:00.09902Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1588,7 +1588,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.50378Z",
"created_at": "2025-09-03T17:38:00.140492Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1606,7 +1606,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.554395Z",
"created_at": "2025-09-03T17:38:00.180239Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1624,7 +1624,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.605795Z",
"created_at": "2025-09-03T17:38:00.220364Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1642,7 +1642,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.656313Z",
"created_at": "2025-09-03T17:38:00.26097Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1660,7 +1660,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.706438Z",
"created_at": "2025-09-03T17:38:00.301228Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1678,7 +1678,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.756444Z",
"created_at": "2025-09-03T17:38:00.341631Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1696,7 +1696,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.807687Z",
"created_at": "2025-09-03T17:38:00.383006Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1714,7 +1714,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.85835Z",
"created_at": "2025-09-03T17:38:00.423509Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1732,7 +1732,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.909311Z",
"created_at": "2025-09-03T17:38:00.464702Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1750,7 +1750,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:16.959327Z",
"created_at": "2025-09-03T17:38:00.505914Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1768,7 +1768,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:17.010211Z",
"created_at": "2025-09-03T17:38:00.546505Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1786,7 +1786,7 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:17.061365Z",
"created_at": "2025-09-03T17:38:00.587839Z",
"done": false,
"done_reason": null,
"total_duration": null,
@ -1804,15 +1804,15 @@
"__type__": "ollama._types.GenerateResponse",
"__data__": {
"model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-08-01T23:14:17.111956Z",
"created_at": "2025-09-03T17:38:00.629018Z",
"done": true,
"done_reason": "stop",
"total_duration": 5499672375,
"load_duration": 58161750,
"total_duration": 4303339291,
"load_duration": 156231250,
"prompt_eval_count": 36,
"prompt_eval_duration": 266000000,
"prompt_eval_duration": 81909875,
"eval_count": 100,
"eval_duration": 5174000000,
"eval_duration": 4064559292,
"response": "",
"thinking": null,
"context": null

File diff suppressed because it is too large

View file

@ -1,7 +1,7 @@
{
"request": {
"method": "POST",
"url": "http://localhost:11434/v1/v1/chat/completions",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
@ -22,14 +22,14 @@
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "chatcmpl-339",
"id": "chatcmpl-442",
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"message": {
"content": "I can guide you through the process, but please note that this is not an official OpenAI API call. OpenAI's API terms and conditions prohibit using their models for malicious purposes.\n\nTo test a model like \"text-temperature\" with a temperature of 0 (i.e., no noise or randomness), we'll need to use a third-party library that connects to the OpenAI API. One such library is `transformers`.\n\nFirst, you need to install the `transformers` and `",
"content": "I can guide you on how to use the `test-temperature` parameter with OpenAI's API, but please note that using a temperature of 0 may not produce meaningful results. Temperature is a hyperparameter that controls the level of randomness in the model's output.\n\nOpenAI's API uses a variant of the GPT-3 model, which is trained on a large corpus of text data. The `test-temperature` parameter allows you to adjust the level of randomness in the model's output",
"refusal": null,
"role": "assistant",
"annotations": null,
@ -39,7 +39,7 @@
}
}
],
"created": 1754510065,
"created": 1756921254,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion",
"service_tier": null,

Some files were not shown because too many files have changed in this diff