Merge remote-tracking branch 'upstream/main' into elasticsearch-integration

Enrico Zimuel 2025-10-31 18:23:42 +01:00
commit 2407115ee8
1050 changed files with 65153 additions and 2821 deletions


@@ -0,0 +1,64 @@
name: Install llama-stack-client
description: Install llama-stack-client based on branch context and client-version input
inputs:
client-version:
description: 'Client version to install on non-release branches (latest or published). Ignored on release branches.'
required: false
default: ""
outputs:
uv-index-url:
description: 'UV_INDEX_URL to use (set for release branches)'
value: ${{ steps.configure.outputs.uv-index-url }}
uv-extra-index-url:
description: 'UV_EXTRA_INDEX_URL to use (set for release branches)'
value: ${{ steps.configure.outputs.uv-extra-index-url }}
install-after-sync:
description: 'Whether to install client after uv sync'
value: ${{ steps.configure.outputs.install-after-sync }}
install-source:
description: 'Where to install client from after sync'
value: ${{ steps.configure.outputs.install-source }}
runs:
using: "composite"
steps:
- name: Configure client installation
id: configure
shell: bash
run: |
# Determine the branch we're working with
BRANCH="${{ github.base_ref || github.ref }}"
BRANCH="${BRANCH#refs/heads/}"
echo "Working with branch: $BRANCH"
# On release branches: use test.pypi for uv sync, then install from git
# On non-release branches: install based on client-version after sync
if [[ "$BRANCH" =~ ^release-[0-9]+\.[0-9]+\.x$ ]]; then
echo "Detected release branch: $BRANCH"
# Check if matching branch exists in client repo
if ! git ls-remote --exit-code --heads https://github.com/llamastack/llama-stack-client-python.git "$BRANCH" > /dev/null 2>&1; then
echo "::error::Branch $BRANCH not found in llama-stack-client-python repository"
echo "::error::Please create the matching release branch in llama-stack-client-python before testing"
exit 1
fi
# Configure to use test.pypi for sync (to resolve RC versions)
echo "uv-index-url=https://test.pypi.org/simple/" >> $GITHUB_OUTPUT
echo "uv-extra-index-url=https://pypi.org/simple/" >> $GITHUB_OUTPUT
echo "install-after-sync=true" >> $GITHUB_OUTPUT
echo "install-source=git+https://github.com/llamastack/llama-stack-client-python.git@$BRANCH" >> $GITHUB_OUTPUT
elif [ "${{ inputs.client-version }}" = "latest" ]; then
# Install from main git after sync
echo "install-after-sync=true" >> $GITHUB_OUTPUT
echo "install-source=git+https://github.com/llamastack/llama-stack-client-python.git@main" >> $GITHUB_OUTPUT
elif [ "${{ inputs.client-version }}" = "published" ]; then
# Use published version from PyPI (installed by sync)
echo "install-after-sync=false" >> $GITHUB_OUTPUT
elif [ -n "${{ inputs.client-version }}" ]; then
echo "::error::Invalid client-version: ${{ inputs.client-version }}"
exit 1
fi
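
A minimal sketch of how a workflow step could consume this composite action, assuming the surrounding job has already checked out the repo and installed uv; the step names are illustrative, but the action path, input, and outputs match the definition above (the setup-runner and pre-commit changes later in this commit follow the same pattern):

```yaml
- name: Configure client installation
  id: client-config
  uses: ./.github/actions/install-llama-stack-client
  with:
    client-version: latest   # "latest", "published", or empty

- name: Install dependencies
  shell: bash
  env:
    # Only set on release branches, where uv sync must resolve RC versions from test.pypi
    UV_INDEX_URL: ${{ steps.client-config.outputs.uv-index-url }}
    UV_EXTRA_INDEX_URL: ${{ steps.client-config.outputs.uv-extra-index-url }}
  run: |
    uv sync --all-groups
    # Override the synced client with a git build when the action asks for it
    if [ "${{ steps.client-config.outputs.install-after-sync }}" = "true" ]; then
      uv pip install ${{ steps.client-config.outputs.install-source }}
    fi
```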


@@ -94,7 +94,7 @@ runs:
  if: ${{ always() }}
  uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
  with:
- name: logs-${{ github.run_id }}-${{ github.run_attempt || '' }}-${{ strategy.job-index }}
+ name: logs-${{ github.run_id }}-${{ github.run_attempt || '1' }}-${{ strategy.job-index || github.job }}-${{ github.action }}
  path: |
  *.log
  retention-days: 1


@@ -18,8 +18,17 @@ runs:
  python-version: ${{ inputs.python-version }}
  version: 0.7.6
+ - name: Configure client installation
+ id: client-config
+ uses: ./.github/actions/install-llama-stack-client
+ with:
+ client-version: ${{ inputs.client-version }}
  - name: Install dependencies
  shell: bash
+ env:
+ UV_INDEX_URL: ${{ steps.client-config.outputs.uv-index-url }}
+ UV_EXTRA_INDEX_URL: ${{ steps.client-config.outputs.uv-extra-index-url }}
  run: |
  echo "Updating project dependencies via uv sync"
  uv sync --all-groups
@@ -27,16 +36,10 @@ runs:
  echo "Installing ad-hoc dependencies"
  uv pip install faiss-cpu
- # Install llama-stack-client-python based on the client-version input
+ # Install specific client version after sync if needed
- if [ "${{ inputs.client-version }}" = "latest" ]; then
+ if [ "${{ steps.client-config.outputs.install-after-sync }}" = "true" ]; then
- echo "Installing latest llama-stack-client-python from main branch"
+ echo "Installing llama-stack-client from: ${{ steps.client-config.outputs.install-source }}"
- uv pip install git+https://github.com/llamastack/llama-stack-client-python.git@main
+ uv pip install ${{ steps.client-config.outputs.install-source }}
- elif [ "${{ inputs.client-version }}" = "published" ]; then
- echo "Installing published llama-stack-client-python from PyPI"
- uv pip install llama-stack-client
- else
- echo "Invalid client-version: ${{ inputs.client-version }}"
- exit 1
  fi
  echo "Installed llama packages"


@@ -42,18 +42,7 @@ runs:
  - name: Build Llama Stack
  shell: bash
  run: |
- # Install llama-stack-client-python based on the client-version input
+ # Client is already installed by setup-runner (handles both main and release branches)
- if [ "${{ inputs.client-version }}" = "latest" ]; then
- echo "Installing latest llama-stack-client-python from main branch"
- export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@main
- elif [ "${{ inputs.client-version }}" = "published" ]; then
- echo "Installing published llama-stack-client-python from PyPI"
- unset LLAMA_STACK_CLIENT_DIR
- else
- echo "Invalid client-version: ${{ inputs.client-version }}"
- exit 1
- fi
  echo "Building Llama Stack"
  LLAMA_STACK_DIR=. \


@@ -4,6 +4,7 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
  | Name | File | Purpose |
  | ---- | ---- | ------- |
+ | Backward Compatibility Check | [backward-compat.yml](backward-compat.yml) | Check backward compatibility for run.yaml configs |
  | Update Changelog | [changelog.yml](changelog.yml) | Creates PR for updating the CHANGELOG.md |
  | API Conformance Tests | [conformance.yml](conformance.yml) | Run the API Conformance test suite on the changes. |
  | Installer CI | [install-script-ci.yml](install-script-ci.yml) | Test the installation script |

.github/workflows/backward-compat.yml

@@ -0,0 +1,578 @@
name: Backward Compatibility Check
run-name: Check backward compatibility for run.yaml configs
on:
pull_request:
branches:
- main
- 'release-[0-9]+.[0-9]+.[0-9]+.[0-9]+'
- 'release-[0-9]+.[0-9]+.[0-9]+'
- 'release-[0-9]+.[0-9]+'
paths:
- 'src/llama_stack/core/datatypes.py'
- 'src/llama_stack/providers/datatypes.py'
- 'src/llama_stack/distributions/**/run.yaml'
- 'tests/backward_compat/**'
- '.github/workflows/backward-compat.yml'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
check-main-compatibility:
name: Check Compatibility with main
runs-on: ubuntu-latest
steps:
- name: Checkout PR branch
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
fetch-depth: 0 # Need full history to access main branch
- name: Set up Python
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: '3.12'
- name: Install uv
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with:
enable-cache: true
- name: Install dependencies
run: |
uv sync --group dev
- name: Extract run.yaml files from main branch
id: extract_configs
run: |
# Get list of run.yaml paths from main
git fetch origin main
CONFIG_PATHS=$(git ls-tree -r --name-only origin/main | grep "src/llama_stack/distributions/.*/run.yaml$" || true)
if [ -z "$CONFIG_PATHS" ]; then
echo "No run.yaml files found in main branch"
exit 1
fi
# Extract all configs to a temp directory
mkdir -p /tmp/main_configs
echo "Extracting configs from main branch:"
while IFS= read -r config_path; do
if [ -z "$config_path" ]; then
continue
fi
# Extract filename for storage
filename=$(basename $(dirname "$config_path"))
echo " - $filename (from $config_path)"
git show origin/main:"$config_path" > "/tmp/main_configs/${filename}.yaml"
done <<< "$CONFIG_PATHS"
echo ""
echo "Extracted $(ls /tmp/main_configs/*.yaml | wc -l) config files"
- name: Test all configs from main
id: test_configs
continue-on-error: true
run: |
# Run pytest once with all configs parameterized
if COMPAT_TEST_CONFIGS_DIR=/tmp/main_configs uv run pytest tests/backward_compat/test_run_config.py -v; then
echo "failed=false" >> $GITHUB_OUTPUT
else
echo "failed=true" >> $GITHUB_OUTPUT
exit 1
fi
- name: Check for breaking change acknowledgment
id: check_ack
if: steps.test_configs.outputs.failed == 'true'
run: |
echo "Breaking changes detected. Checking for acknowledgment..."
# Check PR title for '!:' marker (conventional commits)
PR_TITLE="${{ github.event.pull_request.title }}"
if [[ "$PR_TITLE" =~ ^[a-z]+\!: ]]; then
echo "✓ Breaking change acknowledged in PR title"
echo "acknowledged=true" >> $GITHUB_OUTPUT
exit 0
fi
# Check commit messages for BREAKING CHANGE:
if git log origin/main..HEAD --format=%B | grep -q "BREAKING CHANGE:"; then
echo "✓ Breaking change acknowledged in commit message"
echo "acknowledged=true" >> $GITHUB_OUTPUT
exit 0
fi
echo "✗ Breaking change NOT acknowledged"
echo "acknowledged=false" >> $GITHUB_OUTPUT
env:
GH_TOKEN: ${{ github.token }}
- name: Evaluate results
if: always()
run: |
FAILED="${{ steps.test_configs.outputs.failed }}"
ACKNOWLEDGED="${{ steps.check_ack.outputs.acknowledged }}"
if [[ "$FAILED" == "true" ]]; then
if [[ "$ACKNOWLEDGED" == "true" ]]; then
echo ""
echo "⚠️ WARNING: Breaking changes detected but acknowledged"
echo ""
echo "This PR introduces backward-incompatible changes to run.yaml."
echo "The changes have been properly acknowledged."
echo ""
exit 0 # Pass the check
else
echo ""
echo "❌ ERROR: Breaking changes detected without acknowledgment"
echo ""
echo "This PR introduces backward-incompatible changes to run.yaml"
echo "that will break existing user configurations."
echo ""
echo "To acknowledge this breaking change, do ONE of:"
echo " 1. Add '!:' to your PR title (e.g., 'feat!: change xyz')"
echo " 2. Add the 'breaking-change' label to this PR"
echo " 3. Include 'BREAKING CHANGE:' in a commit message"
echo ""
exit 1 # Fail the check
fi
fi
test-integration-main:
name: Run Integration Tests with main Config
runs-on: ubuntu-latest
steps:
- name: Checkout PR branch
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
fetch-depth: 0
- name: Extract ci-tests run.yaml from main
run: |
git fetch origin main
git show origin/main:src/llama_stack/distributions/ci-tests/run.yaml > /tmp/main-ci-tests-run.yaml
echo "Extracted ci-tests run.yaml from main branch"
- name: Setup test environment
uses: ./.github/actions/setup-test-environment
with:
python-version: '3.12'
client-version: 'latest'
setup: 'ollama'
suite: 'base'
inference-mode: 'replay'
- name: Run integration tests with main config
id: test_integration
continue-on-error: true
uses: ./.github/actions/run-and-record-tests
with:
stack-config: /tmp/main-ci-tests-run.yaml
setup: 'ollama'
inference-mode: 'replay'
suite: 'base'
- name: Check for breaking change acknowledgment
id: check_ack
if: steps.test_integration.outcome == 'failure'
run: |
echo "Integration tests failed. Checking for acknowledgment..."
# Check PR title for '!:' marker (conventional commits)
PR_TITLE="${{ github.event.pull_request.title }}"
if [[ "$PR_TITLE" =~ ^[a-z]+\!: ]]; then
echo "✓ Breaking change acknowledged in PR title"
echo "acknowledged=true" >> $GITHUB_OUTPUT
exit 0
fi
# Check commit messages for BREAKING CHANGE:
if git log origin/main..HEAD --format=%B | grep -q "BREAKING CHANGE:"; then
echo "✓ Breaking change acknowledged in commit message"
echo "acknowledged=true" >> $GITHUB_OUTPUT
exit 0
fi
echo "✗ Breaking change NOT acknowledged"
echo "acknowledged=false" >> $GITHUB_OUTPUT
env:
GH_TOKEN: ${{ github.token }}
- name: Evaluate integration test results
if: always()
run: |
TEST_FAILED="${{ steps.test_integration.outcome == 'failure' }}"
ACKNOWLEDGED="${{ steps.check_ack.outputs.acknowledged }}"
if [[ "$TEST_FAILED" == "true" ]]; then
if [[ "$ACKNOWLEDGED" == "true" ]]; then
echo ""
echo "⚠️ WARNING: Integration tests failed with main config but acknowledged"
echo ""
exit 0 # Pass the check
else
echo ""
echo "❌ ERROR: Integration tests failed with main config without acknowledgment"
echo ""
echo "To acknowledge this breaking change, do ONE of:"
echo " 1. Add '!:' to your PR title (e.g., 'feat!: change xyz')"
echo " 2. Include 'BREAKING CHANGE:' in a commit message"
echo ""
exit 1 # Fail the check
fi
fi
test-integration-release:
name: Run Integration Tests with Latest Release (Informational)
runs-on: ubuntu-latest
steps:
- name: Checkout PR branch
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
fetch-depth: 0
- name: Get latest release
id: get_release
run: |
# Get the latest release from GitHub
LATEST_TAG=$(gh release list --limit 1 --json tagName --jq '.[0].tagName' 2>/dev/null || echo "")
if [ -z "$LATEST_TAG" ]; then
echo "No releases found, skipping release compatibility check"
echo "has_release=false" >> $GITHUB_OUTPUT
exit 0
fi
echo "Latest release: $LATEST_TAG"
echo "has_release=true" >> $GITHUB_OUTPUT
echo "tag=$LATEST_TAG" >> $GITHUB_OUTPUT
env:
GH_TOKEN: ${{ github.token }}
- name: Extract ci-tests run.yaml from release
if: steps.get_release.outputs.has_release == 'true'
id: extract_config
run: |
RELEASE_TAG="${{ steps.get_release.outputs.tag }}"
# Try with src/ prefix first (newer releases), then without (older releases)
if git show "$RELEASE_TAG:src/llama_stack/distributions/ci-tests/run.yaml" > /tmp/release-ci-tests-run.yaml 2>/dev/null; then
echo "Extracted ci-tests run.yaml from release $RELEASE_TAG (src/ path)"
echo "has_config=true" >> $GITHUB_OUTPUT
elif git show "$RELEASE_TAG:llama_stack/distributions/ci-tests/run.yaml" > /tmp/release-ci-tests-run.yaml 2>/dev/null; then
echo "Extracted ci-tests run.yaml from release $RELEASE_TAG (old path)"
echo "has_config=true" >> $GITHUB_OUTPUT
else
echo "::warning::ci-tests/run.yaml not found in release $RELEASE_TAG"
echo "has_config=false" >> $GITHUB_OUTPUT
fi
- name: Setup test environment
if: steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
uses: ./.github/actions/setup-test-environment
with:
python-version: '3.12'
client-version: 'latest'
setup: 'ollama'
suite: 'base'
inference-mode: 'replay'
- name: Run integration tests with release config (PR branch)
id: test_release_pr
if: steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
continue-on-error: true
uses: ./.github/actions/run-and-record-tests
with:
stack-config: /tmp/release-ci-tests-run.yaml
setup: 'ollama'
inference-mode: 'replay'
suite: 'base'
- name: Checkout main branch to test baseline
if: steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
run: |
git checkout origin/main
- name: Setup test environment for main
if: steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
uses: ./.github/actions/setup-test-environment
with:
python-version: '3.12'
client-version: 'latest'
setup: 'ollama'
suite: 'base'
inference-mode: 'replay'
- name: Run integration tests with release config (main branch)
id: test_release_main
if: steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
continue-on-error: true
uses: ./.github/actions/run-and-record-tests
with:
stack-config: /tmp/release-ci-tests-run.yaml
setup: 'ollama'
inference-mode: 'replay'
suite: 'base'
- name: Report results and post PR comment
if: always() && steps.get_release.outputs.has_release == 'true' && steps.extract_config.outputs.has_config == 'true'
run: |
RELEASE_TAG="${{ steps.get_release.outputs.tag }}"
PR_OUTCOME="${{ steps.test_release_pr.outcome }}"
MAIN_OUTCOME="${{ steps.test_release_main.outcome }}"
if [[ "$PR_OUTCOME" == "failure" && "$MAIN_OUTCOME" == "success" ]]; then
# NEW breaking change - PR fails but main passes
echo "::error::🚨 This PR introduces a NEW breaking change!"
# Check if we already posted a comment (to avoid spam on every push)
EXISTING_COMMENT=$(gh pr view ${{ github.event.pull_request.number }} --json comments --jq '.comments[] | select(.body | contains("🚨 New Breaking Change Detected") and contains("Integration tests")) | .id' | head -1)
if [[ -z "$EXISTING_COMMENT" ]]; then
gh pr comment ${{ github.event.pull_request.number }} --body "## 🚨 New Breaking Change Detected
**Integration tests against release \`$RELEASE_TAG\` are now failing**
⚠️ This PR introduces a breaking change that affects compatibility with the latest release.
- Users on release \`$RELEASE_TAG\` may not be able to upgrade
- Existing configurations may break
The tests pass on \`main\` but fail with this PR's changes.
> **Note:** This is informational only and does not block merge.
> Consider whether this breaking change is acceptable for users."
else
echo "Comment already exists, skipping to avoid spam"
fi
cat >> $GITHUB_STEP_SUMMARY <<EOF
## 🚨 NEW Breaking Change Detected
**Integration tests against release \`$RELEASE_TAG\` FAILED**
⚠️ **This PR introduces a NEW breaking change**
- Tests **PASS** on main branch ✅
- Tests **FAIL** on PR branch ❌
- Users on release \`$RELEASE_TAG\` may not be able to upgrade
- Existing configurations may break
> **Note:** This is informational only and does not block merge.
> Consider whether this breaking change is acceptable for users.
EOF
elif [[ "$PR_OUTCOME" == "failure" ]]; then
# Existing breaking change - both PR and main fail
echo "::warning::Breaking change already exists in main branch"
cat >> $GITHUB_STEP_SUMMARY <<EOF
## ⚠️ Release Compatibility Test Failed (Existing Issue)
**Integration tests against release \`$RELEASE_TAG\` FAILED**
- Tests **FAIL** on main branch ❌
- Tests **FAIL** on PR branch ❌
- This breaking change already exists in main (not introduced by this PR)
> **Note:** This is informational only.
EOF
else
# Success - tests pass
cat >> $GITHUB_STEP_SUMMARY <<EOF
## ✅ Release Compatibility Test Passed
Integration tests against release \`$RELEASE_TAG\` passed successfully.
This PR maintains compatibility with the latest release.
EOF
fi
env:
GH_TOKEN: ${{ github.token }}
check-schema-release-compatibility:
name: Check Schema Compatibility with Latest Release (Informational)
runs-on: ubuntu-latest
steps:
- name: Checkout PR branch
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: '3.12'
- name: Install uv
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with:
enable-cache: true
- name: Install dependencies
run: |
uv sync --group dev
- name: Get latest release
id: get_release
run: |
# Get the latest release from GitHub
LATEST_TAG=$(gh release list --limit 1 --json tagName --jq '.[0].tagName' 2>/dev/null || echo "")
if [ -z "$LATEST_TAG" ]; then
echo "No releases found, skipping release compatibility check"
echo "has_release=false" >> $GITHUB_OUTPUT
exit 0
fi
echo "Latest release: $LATEST_TAG"
echo "has_release=true" >> $GITHUB_OUTPUT
echo "tag=$LATEST_TAG" >> $GITHUB_OUTPUT
env:
GH_TOKEN: ${{ github.token }}
- name: Extract configs from release
if: steps.get_release.outputs.has_release == 'true'
id: extract_release_configs
run: |
RELEASE_TAG="${{ steps.get_release.outputs.tag }}"
# Get run.yaml files from the release (try both src/ and old path)
CONFIG_PATHS=$(git ls-tree -r --name-only "$RELEASE_TAG" | grep "llama_stack/distributions/.*/run.yaml$" || true)
if [ -z "$CONFIG_PATHS" ]; then
echo "::warning::No run.yaml files found in release $RELEASE_TAG"
echo "has_configs=false" >> $GITHUB_OUTPUT
exit 0
fi
# Extract all configs to a temp directory
mkdir -p /tmp/release_configs
echo "Extracting configs from release $RELEASE_TAG:"
while IFS= read -r config_path; do
if [ -z "$config_path" ]; then
continue
fi
filename=$(basename $(dirname "$config_path"))
echo " - $filename (from $config_path)"
git show "$RELEASE_TAG:$config_path" > "/tmp/release_configs/${filename}.yaml" 2>/dev/null || true
done <<< "$CONFIG_PATHS"
echo ""
echo "Extracted $(ls /tmp/release_configs/*.yaml 2>/dev/null | wc -l) config files"
echo "has_configs=true" >> $GITHUB_OUTPUT
- name: Test against release configs (PR branch)
id: test_schema_pr
if: steps.get_release.outputs.has_release == 'true' && steps.extract_release_configs.outputs.has_configs == 'true'
continue-on-error: true
run: |
RELEASE_TAG="${{ steps.get_release.outputs.tag }}"
COMPAT_TEST_CONFIGS_DIR=/tmp/release_configs uv run pytest tests/backward_compat/test_run_config.py -v --tb=short
- name: Checkout main branch to test baseline
if: steps.get_release.outputs.has_release == 'true' && steps.extract_release_configs.outputs.has_configs == 'true'
run: |
git checkout origin/main
- name: Install dependencies for main
if: steps.get_release.outputs.has_release == 'true' && steps.extract_release_configs.outputs.has_configs == 'true'
run: |
uv sync --group dev
- name: Test against release configs (main branch)
id: test_schema_main
if: steps.get_release.outputs.has_release == 'true' && steps.extract_release_configs.outputs.has_configs == 'true'
continue-on-error: true
run: |
RELEASE_TAG="${{ steps.get_release.outputs.tag }}"
COMPAT_TEST_CONFIGS_DIR=/tmp/release_configs uv run pytest tests/backward_compat/test_run_config.py -v --tb=short
- name: Report results and post PR comment
if: always() && steps.get_release.outputs.has_release == 'true' && steps.extract_release_configs.outputs.has_configs == 'true'
run: |
RELEASE_TAG="${{ steps.get_release.outputs.tag }}"
PR_OUTCOME="${{ steps.test_schema_pr.outcome }}"
MAIN_OUTCOME="${{ steps.test_schema_main.outcome }}"
if [[ "$PR_OUTCOME" == "failure" && "$MAIN_OUTCOME" == "success" ]]; then
# NEW breaking change - PR fails but main passes
echo "::error::🚨 This PR introduces a NEW schema breaking change!"
# Check if we already posted a comment (to avoid spam on every push)
EXISTING_COMMENT=$(gh pr view ${{ github.event.pull_request.number }} --json comments --jq '.comments[] | select(.body | contains("🚨 New Schema Breaking Change Detected")) | .id' | head -1)
if [[ -z "$EXISTING_COMMENT" ]]; then
gh pr comment ${{ github.event.pull_request.number }} --body "## 🚨 New Schema Breaking Change Detected
**Schema validation against release \`$RELEASE_TAG\` is now failing**
⚠️ This PR introduces a schema breaking change that affects compatibility with the latest release.
- Users on release \`$RELEASE_TAG\` will not be able to upgrade
- Existing run.yaml configurations will fail validation
The tests pass on \`main\` but fail with this PR's changes.
> **Note:** This is informational only and does not block merge.
> Consider whether this breaking change is acceptable for users."
else
echo "Comment already exists, skipping to avoid spam"
fi
cat >> $GITHUB_STEP_SUMMARY <<EOF
## 🚨 NEW Schema Breaking Change Detected
**Schema validation against release \`$RELEASE_TAG\` FAILED**
⚠️ **This PR introduces a NEW schema breaking change**
- Tests **PASS** on main branch ✅
- Tests **FAIL** on PR branch ❌
- Users on release \`$RELEASE_TAG\` will not be able to upgrade
- Existing run.yaml configurations will fail validation
> **Note:** This is informational only and does not block merge.
> Consider whether this breaking change is acceptable for users.
EOF
elif [[ "$PR_OUTCOME" == "failure" ]]; then
# Existing breaking change - both PR and main fail
echo "::warning::Schema breaking change already exists in main branch"
cat >> $GITHUB_STEP_SUMMARY <<EOF
## ⚠️ Release Schema Compatibility Failed (Existing Issue)
**Schema validation against release \`$RELEASE_TAG\` FAILED**
- Tests **FAIL** on main branch ❌
- Tests **FAIL** on PR branch ❌
- This schema breaking change already exists in main (not introduced by this PR)
> **Note:** This is informational only.
EOF
else
# Success - tests pass
cat >> $GITHUB_STEP_SUMMARY <<EOF
## ✅ Release Schema Compatibility Passed
All run.yaml configs from release \`$RELEASE_TAG\` are compatible.
This PR maintains backward compatibility with the latest release.
EOF
fi
env:
GH_TOKEN: ${{ github.token }}


@@ -4,13 +4,17 @@ run-name: Run the integration test suite with Kubernetes authentication
  on:
  push:
- branches: [ main ]
+ branches:
+ - main
+ - 'release-[0-9]+.[0-9]+.x'
  pull_request:
- branches: [ main ]
+ branches:
+ - main
+ - 'release-[0-9]+.[0-9]+.x'
  paths:
  - 'distributions/**'
- - 'llama_stack/**'
+ - 'src/llama_stack/**'
- - '!llama_stack/ui/**'
+ - '!src/llama_stack/ui/**'
  - 'tests/integration/**'
  - 'uv.lock'
  - 'pyproject.toml'
@@ -91,6 +95,9 @@ jobs:
  conversations:
  table_name: openai_conversations
  backend: sql_default
+ prompts:
+ namespace: prompts
+ backend: kv_default
  server:
  port: 8321
  EOF


@@ -4,11 +4,15 @@ run-name: Run the integration test suite with SqlStore
  on:
  push:
- branches: [ main ]
+ branches:
+ - main
+ - 'release-[0-9]+.[0-9]+.x'
  pull_request:
- branches: [ main ]
+ branches:
+ - main
+ - 'release-[0-9]+.[0-9]+.x'
  paths:
- - 'llama_stack/providers/utils/sqlstore/**'
+ - 'src/llama_stack/providers/utils/sqlstore/**'
  - 'tests/integration/sqlstore/**'
  - 'uv.lock'
  - 'pyproject.toml'
@@ -64,7 +68,7 @@ jobs:
  - name: Upload test logs
  if: ${{ always() }}
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
  with:
  name: postgres-test-logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.python-version }}
  path: |


@@ -4,13 +4,17 @@ run-name: Run the integration test suites from tests/integration in replay mode
  on:
  push:
- branches: [ main ]
+ branches:
+ - main
+ - 'release-[0-9]+.[0-9]+.x'
  pull_request:
- branches: [ main ]
+ branches:
+ - main
+ - 'release-[0-9]+.[0-9]+.x'
  types: [opened, synchronize, reopened]
  paths:
- - 'llama_stack/**'
+ - 'src/llama_stack/**'
- - '!llama_stack/ui/**'
+ - '!src/llama_stack/ui/**'
  - 'tests/**'
  - 'uv.lock'
  - 'pyproject.toml'
@@ -47,7 +51,7 @@ jobs:
  strategy:
  fail-fast: false
  matrix:
- client-type: [library, docker]
+ client-type: [library, docker, server]
  # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
  python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
  client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}


@@ -4,12 +4,16 @@ run-name: Run the integration test suite with various VectorIO providers
  on:
  push:
- branches: [ main ]
+ branches:
+ - main
+ - 'release-[0-9]+.[0-9]+.x'
  pull_request:
- branches: [ main ]
+ branches:
+ - main
+ - 'release-[0-9]+.[0-9]+.x'
  paths:
- - 'llama_stack/**'
+ - 'src/llama_stack/**'
- - '!llama_stack/ui/**'
+ - '!src/llama_stack/ui/**'
  - 'tests/integration/vector_io/**'
  - 'uv.lock'
  - 'pyproject.toml'
@@ -209,7 +213,7 @@ jobs:
  - name: Upload all logs to artifacts
  if: ${{ always() }}
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
  with:
  name: vector-io-logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ env.SANITIZED_PROVIDER }}-${{ matrix.python-version }}
  path: |


@@ -5,7 +5,9 @@ run-name: Run pre-commit checks
  on:
  pull_request:
  push:
- branches: [main]
+ branches:
+ - main
+ - 'release-[0-9]+.[0-9]+.x'
  concurrency:
  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
@@ -41,25 +43,43 @@ jobs:
  with:
  node-version: '20'
  cache: 'npm'
- cache-dependency-path: 'llama_stack/ui/'
+ cache-dependency-path: 'src/llama_stack/ui/'
+ - name: Set up uv
+ uses: astral-sh/setup-uv@2ddd2b9cb38ad8efd50337e8ab201519a34c9f24 # v7.1.1
  - name: Install npm dependencies
  run: npm ci
- working-directory: llama_stack/ui
+ working-directory: src/llama_stack/ui
+ - name: Install pre-commit
+ run: python -m pip install pre-commit
+ - name: Cache pre-commit
+ uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4
+ with:
+ path: ~/.cache/pre-commit
+ key: pre-commit-3|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }}
  - name: Run pre-commit
  id: precommit
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
- continue-on-error: true
+ run: |
+ set +e
+ pre-commit run --show-diff-on-failure --color=always --all-files 2>&1 | tee /tmp/precommit.log
+ status=${PIPESTATUS[0]}
+ echo "status=$status" >> $GITHUB_OUTPUT
+ exit 0
  env:
- SKIP: no-commit-to-branch
+ SKIP: no-commit-to-branch,mypy
  RUFF_OUTPUT_FORMAT: github
  - name: Check pre-commit results
- if: steps.precommit.outcome == 'failure'
+ if: steps.precommit.outputs.status != '0'
  run: |
  echo "::error::Pre-commit hooks failed. Please run 'pre-commit run --all-files' locally and commit the fixes."
- echo "::warning::Some pre-commit hooks failed. Check the output above for details."
+ echo ""
+ echo "Failed hooks output:"
+ cat /tmp/precommit.log
  exit 1
  - name: Debug
@@ -109,3 +129,30 @@ jobs:
  echo "$unstaged_files"
  exit 1
  fi
+ - name: Configure client installation
+ id: client-config
+ uses: ./.github/actions/install-llama-stack-client
+ - name: Sync dev + type_checking dependencies
+ env:
+ UV_INDEX_URL: ${{ steps.client-config.outputs.uv-index-url }}
+ UV_EXTRA_INDEX_URL: ${{ steps.client-config.outputs.uv-extra-index-url }}
+ run: |
+ uv sync --group dev --group type_checking
+ # Install specific client version after sync if needed
+ if [ "${{ steps.client-config.outputs.install-after-sync }}" = "true" ]; then
+ echo "Installing llama-stack-client from: ${{ steps.client-config.outputs.install-source }}"
+ uv pip install ${{ steps.client-config.outputs.install-source }}
+ fi
+ - name: Run mypy (full type_checking)
+ run: |
+ set +e
+ uv run --group dev --group type_checking mypy
+ status=$?
+ if [ $status -ne 0 ]; then
+ echo "::error::Full mypy failed. Reproduce locally with 'uv run pre-commit run mypy-full --hook-stage manual --all-files'."
+ fi
+ exit $status


@@ -145,12 +145,12 @@ jobs:
  with:
  node-version: '20'
  cache: 'npm'
- cache-dependency-path: 'llama_stack/ui/'
+ cache-dependency-path: 'src/llama_stack/ui/'
  - name: Install npm dependencies
  if: steps.check_author.outputs.authorized == 'true'
  run: npm ci
- working-directory: llama_stack/ui
+ working-directory: src/llama_stack/ui
  - name: Run pre-commit
  if: steps.check_author.outputs.authorized == 'true'


@@ -7,24 +7,24 @@ on:
  branches:
  - main
  paths:
- - 'llama_stack/cli/stack/build.py'
+ - 'src/llama_stack/cli/stack/build.py'
- - 'llama_stack/cli/stack/_build.py'
+ - 'src/llama_stack/cli/stack/_build.py'
- - 'llama_stack/core/build.*'
+ - 'src/llama_stack/core/build.*'
- - 'llama_stack/core/*.sh'
+ - 'src/llama_stack/core/*.sh'
  - '.github/workflows/providers-build.yml'
- - 'llama_stack/distributions/**'
+ - 'src/llama_stack/distributions/**'
  - 'pyproject.toml'
  - 'containers/Containerfile'
  - '.dockerignore'
  pull_request:
  paths:
- - 'llama_stack/cli/stack/build.py'
+ - 'src/llama_stack/cli/stack/build.py'
- - 'llama_stack/cli/stack/_build.py'
+ - 'src/llama_stack/cli/stack/_build.py'
- - 'llama_stack/core/build.*'
+ - 'src/llama_stack/core/build.*'
- - 'llama_stack/core/*.sh'
+ - 'src/llama_stack/core/*.sh'
  - '.github/workflows/providers-build.yml'
- - 'llama_stack/distributions/**'
+ - 'src/llama_stack/distributions/**'
  - 'pyproject.toml'
  - 'containers/Containerfile'
  - '.dockerignore'
@@ -45,7 +45,7 @@ jobs:
  - name: Generate Distribution List
  id: set-matrix
  run: |
- distros=$(ls llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
+ distros=$(ls src/llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
  echo "distros=$distros" >> "$GITHUB_OUTPUT"
  build:
@@ -107,13 +107,13 @@ jobs:
  - name: Build container image
  run: |
- BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "python:3.12-slim"' llama_stack/distributions/ci-tests/build.yaml)
+ BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "python:3.12-slim"' src/llama_stack/distributions/ci-tests/build.yaml)
  docker build . \
  -f containers/Containerfile \
  --build-arg INSTALL_MODE=editable \
  --build-arg DISTRO_NAME=ci-tests \
  --build-arg BASE_IMAGE="$BASE_IMAGE" \
- --build-arg RUN_CONFIG_PATH=/workspace/llama_stack/distributions/ci-tests/run.yaml \
+ --build-arg RUN_CONFIG_PATH=/workspace/src/llama_stack/distributions/ci-tests/run.yaml \
  -t llama-stack:ci-tests
  - name: Inspect the container image entrypoint
@@ -143,17 +143,17 @@ jobs:
  run: |
  yq -i '
  .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
- ' llama_stack/distributions/ci-tests/build.yaml
+ ' src/llama_stack/distributions/ci-tests/build.yaml
  - name: Build UBI9 container image
  run: |
- BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "registry.access.redhat.com/ubi9:latest"' llama_stack/distributions/ci-tests/build.yaml)
+ BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "registry.access.redhat.com/ubi9:latest"' src/llama_stack/distributions/ci-tests/build.yaml)
  docker build . \
  -f containers/Containerfile \
  --build-arg INSTALL_MODE=editable \
  --build-arg DISTRO_NAME=ci-tests \
  --build-arg BASE_IMAGE="$BASE_IMAGE" \
- --build-arg RUN_CONFIG_PATH=/workspace/llama_stack/distributions/ci-tests/run.yaml \
+ --build-arg RUN_CONFIG_PATH=/workspace/src/llama_stack/distributions/ci-tests/run.yaml \
  -t llama-stack:ci-tests-ubi9
  - name: Inspect UBI9 image


@@ -7,22 +7,22 @@ on:
  branches:
  - main
  paths:
- - 'llama_stack/cli/stack/list_deps.py'
+ - 'src/llama_stack/cli/stack/list_deps.py'
- - 'llama_stack/cli/stack/_list_deps.py'
+ - 'src/llama_stack/cli/stack/_list_deps.py'
- - 'llama_stack/core/build.*'
+ - 'src/llama_stack/core/build.*'
- - 'llama_stack/core/*.sh'
+ - 'src/llama_stack/core/*.sh'
  - '.github/workflows/providers-list-deps.yml'
- - 'llama_stack/templates/**'
+ - 'src/llama_stack/templates/**'
  - 'pyproject.toml'
  pull_request:
  paths:
- - 'llama_stack/cli/stack/list_deps.py'
+ - 'src/llama_stack/cli/stack/list_deps.py'
- - 'llama_stack/cli/stack/_list_deps.py'
+ - 'src/llama_stack/cli/stack/_list_deps.py'
- - 'llama_stack/core/build.*'
+ - 'src/llama_stack/core/build.*'
- - 'llama_stack/core/*.sh'
+ - 'src/llama_stack/core/*.sh'
  - '.github/workflows/providers-list-deps.yml'
- - 'llama_stack/templates/**'
+ - 'src/llama_stack/templates/**'
  - 'pyproject.toml'
  concurrency:
@@ -41,7 +41,7 @@ jobs:
  - name: Generate Distribution List
  id: set-matrix
  run: |
- distros=$(ls llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
+ distros=$(ls src/llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
  echo "distros=$distros" >> "$GITHUB_OUTPUT"
  list-deps:
@@ -102,4 +102,4 @@ jobs:
  USE_COPY_NOT_MOUNT: "true"
  LLAMA_STACK_DIR: "."
  run: |
- uv run llama stack list-deps llama_stack/distributions/ci-tests/build.yaml
+ uv run llama stack list-deps src/llama_stack/distributions/ci-tests/build.yaml


@@ -10,7 +10,7 @@ on:
  branches:
  - main
  paths-ignore:
- - 'llama_stack/ui/**'
+ - 'src/llama_stack/ui/**'
  jobs:
  build:
@@ -24,7 +24,7 @@ jobs:
  uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
  - name: Install uv
- uses: astral-sh/setup-uv@3259c6206f993105e3a61b142c2d97bf4b9ef83d # v7.1.0
+ uses: astral-sh/setup-uv@2ddd2b9cb38ad8efd50337e8ab201519a34c9f24 # v7.1.1
  with:
  python-version: ${{ matrix.python-version }}
  activate-environment: true

View file

@@ -8,7 +8,7 @@ on:
  pull_request:
  branches: [ main ]
  paths:
- - 'llama_stack/**'
+ - 'src/llama_stack/**'
  - 'tests/integration/**'
  - 'uv.lock'
  - 'pyproject.toml'
@@ -78,7 +78,7 @@ jobs:
  - name: Upload all logs to artifacts
  if: ${{ always() }}
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
  with:
  name: logs-${{ github.run_id }}-${{ github.run_attempt }}-external-provider-module-test
  path: |

View file

@@ -8,8 +8,8 @@ on:
  pull_request:
  branches: [ main ]
  paths:
- - 'llama_stack/**'
+ - 'src/llama_stack/**'
- - '!llama_stack/ui/**'
+ - '!src/llama_stack/ui/**'
  - 'tests/integration/**'
  - 'uv.lock'
  - 'pyproject.toml'
@@ -84,7 +84,7 @@ jobs:
  - name: Upload all logs to artifacts
  if: ${{ always() }}
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
  with:
  name: logs-${{ github.run_id }}-${{ github.run_attempt }}-external-test
  path: |


@@ -8,7 +8,7 @@ on:
  pull_request:
  branches: [ main ]
  paths:
- - 'llama_stack/ui/**'
+ - 'src/llama_stack/ui/**'
  - '.github/workflows/ui-unit-tests.yml' # This workflow
  workflow_dispatch:
@@ -33,22 +33,22 @@ jobs:
  with:
  node-version: ${{ matrix.node-version }}
  cache: 'npm'
- cache-dependency-path: 'llama_stack/ui/package-lock.json'
+ cache-dependency-path: 'src/llama_stack/ui/package-lock.json'
  - name: Install dependencies
- working-directory: llama_stack/ui
+ working-directory: src/llama_stack/ui
  run: npm ci
  - name: Run linting
- working-directory: llama_stack/ui
+ working-directory: src/llama_stack/ui
  run: npm run lint
  - name: Run format check
- working-directory: llama_stack/ui
+ working-directory: src/llama_stack/ui
  run: npm run format:check
  - name: Run unit tests
- working-directory: llama_stack/ui
+ working-directory: src/llama_stack/ui
  env:
  CI: true


@@ -4,12 +4,16 @@ run-name: Run the unit test suite
  on:
  push:
- branches: [ main ]
+ branches:
+ - main
+ - 'release-[0-9]+.[0-9]+.x'
  pull_request:
- branches: [ main ]
+ branches:
+ - main
+ - 'release-[0-9]+.[0-9]+.x'
  paths:
- - 'llama_stack/**'
+ - 'src/llama_stack/**'
- - '!llama_stack/ui/**'
+ - '!src/llama_stack/ui/**'
  - 'tests/unit/**'
  - 'uv.lock'
  - 'pyproject.toml'
@@ -45,7 +49,7 @@ jobs:
  - name: Upload test results
  if: always()
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+ uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
  with:
  name: test-results-${{ matrix.python }}
  path: |

.gitignore

@@ -32,3 +32,6 @@ CLAUDE.md
  docs/.docusaurus/
  docs/node_modules/
  docs/static/imported-files/
+ docs/docs/api-deprecated/
+ docs/docs/api-experimental/
+ docs/docs/api/


@@ -42,7 +42,7 @@ repos:
  hooks:
  - id: ruff
  args: [ --fix ]
- exclude: ^llama_stack/strong_typing/.*$
+ exclude: ^src/llama_stack/strong_typing/.*$
  - id: ruff-format
  - repo: https://github.com/adamchainz/blacken-docs
@@ -58,18 +58,27 @@ repos:
  - id: uv-lock
  - repo: https://github.com/pre-commit/mirrors-mypy
- rev: v1.16.1
+ rev: v1.18.2
  hooks:
  - id: mypy
  additional_dependencies:
  - uv==0.6.2
+ - mypy
  - pytest
  - rich
  - types-requests
  - pydantic
+ - httpx
  pass_filenames: false
+ - repo: local
+ hooks:
+ - id: mypy-full
+ name: mypy (full type_checking)
+ entry: uv run --group dev --group type_checking mypy
+ language: system
+ pass_filenames: false
+ stages: [manual]
  # - repo: https://github.com/tcort/markdown-link-check
  # rev: v3.11.2
  # hooks:
@@ -86,7 +95,7 @@ repos:
  language: python
  pass_filenames: false
  require_serial: true
- files: ^llama_stack/distributions/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
+ files: ^src/llama_stack/distributions/.*$|^src/llama_stack/providers/.*/inference/.*/models\.py$
  - id: provider-codegen
  name: Provider Codegen
  additional_dependencies:
@@ -95,7 +104,7 @@ repos:
  language: python
  pass_filenames: false
  require_serial: true
- files: ^llama_stack/providers/.*$
+ files: ^src/llama_stack/providers/.*$
  - id: openapi-codegen
  name: API Spec Codegen
  additional_dependencies:
@@ -104,7 +113,7 @@ repos:
  language: python
  pass_filenames: false
  require_serial: true
- files: ^llama_stack/apis/|^docs/openapi_generator/
+ files: ^src/llama_stack/apis/|^docs/openapi_generator/
  - id: check-workflows-use-hashes
  name: Check GitHub Actions use SHA-pinned actions
  entry: ./scripts/check-workflows-use-hashes.sh
@@ -120,7 +129,7 @@ repos:
  pass_filenames: false
  require_serial: true
  always_run: true
- files: ^llama_stack/.*$
+ files: ^src/llama_stack/.*$
  - id: forbid-pytest-asyncio
  name: Block @pytest.mark.asyncio and @pytest_asyncio.fixture
  entry: bash
@@ -150,10 +159,9 @@ repos:
  name: Format & Lint UI
  entry: bash ./scripts/run-ui-linter.sh
  language: system
- files: ^llama_stack/ui/.*\.(ts|tsx)$
+ files: ^src/llama_stack/ui/.*\.(ts|tsx)$
  pass_filenames: false
  require_serial: true
  - id: check-log-usage
  name: Ensure 'llama_stack.log' usage for logging
  entry: bash
@@ -172,7 +180,23 @@ repos:
  exit 1
  fi
  exit 0
+ - id: fips-compliance
+ name: Ensure llama-stack remains FIPS compliant
+ entry: bash
+ language: system
+ types: [python]
+ pass_filenames: true
+ exclude: '^tests/.*$' # Exclude test dir as some safety tests used MD5
+ args:
+ - -c
+ - |
+ grep -EnH '^[^#]*\b(md5|sha1|uuid3|uuid5)\b' "$@" && {
+ echo;
+ echo "❌ Do not use any of the following functions: hashlib.md5, hashlib.sha1, uuid.uuid3, uuid.uuid5"
+ echo " These functions are not FIPS-compliant"
+ echo;
+ exit 1;
+ } || true
  ci:
  autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
  autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate


@@ -61,6 +61,18 @@ uv run pre-commit run --all-files -v
  The `-v` (verbose) parameter is optional but often helpful for getting more information about any issues with that the pre-commit checks identify.
+ To run the expanded mypy configuration that CI enforces, use:
+ ```bash
+ uv run pre-commit run mypy-full --hook-stage manual --all-files
+ ```
+ or invoke mypy directly with all optional dependencies:
+ ```bash
+ uv run --group dev --group type_checking mypy
+ ```
  ```{caution}
  Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
  ```


@@ -1,11 +1,11 @@
  include pyproject.toml
- include llama_stack/models/llama/llama3/tokenizer.model
+ include src/llama_stack/models/llama/llama3/tokenizer.model
- include llama_stack/models/llama/llama4/tokenizer.model
+ include src/llama_stack/models/llama/llama4/tokenizer.model
- include llama_stack/core/*.sh
+ include src/llama_stack/core/*.sh
- include llama_stack/cli/scripts/*.sh
+ include src/llama_stack/cli/scripts/*.sh
- include llama_stack/distributions/*/*.yaml
+ include src/llama_stack/distributions/*/*.yaml
- exclude llama_stack/distributions/ci-tests
+ exclude src/llama_stack/distributions/ci-tests
  include tests/integration/test_cases/inference/*.json
- include llama_stack/models/llama/*/*.md
+ include src/llama_stack/models/llama/*/*.md
- include llama_stack/tests/integration/*.jpg
+ include src/llama_stack/tests/integration/*.jpg
- prune llama_stack/distributions/ci-tests
+ prune src/llama_stack/distributions/ci-tests


@@ -44,14 +44,6 @@ data:
  db: ${env.POSTGRES_DB:=llamastack}
  user: ${env.POSTGRES_USER:=llamastack}
  password: ${env.POSTGRES_PASSWORD:=llamastack}
- files:
- - provider_id: meta-reference-files
- provider_type: inline::localfs
- config:
- storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
- metadata_store:
- type: sqlite
- db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
  safety:
  - provider_id: llama-guard
  provider_type: inline::llama-guard
@@ -115,13 +107,21 @@ data:
  db: ${env.POSTGRES_DB:=llamastack}
  user: ${env.POSTGRES_USER:=llamastack}
  password: ${env.POSTGRES_PASSWORD:=llamastack}
- references:
+ stores:
  metadata:
  backend: kv_default
  namespace: registry
  inference:
  backend: sql_default
  table_name: inference_store
+ max_write_queue_size: 10000
+ num_writers: 4
+ conversations:
+ backend: sql_default
+ table_name: openai_conversations
+ prompts:
+ backend: kv_default
+ namespace: prompts
  models:
  - metadata:
  embedding_dimension: 768


@@ -36,14 +36,6 @@ providers:
  persistence:
  namespace: vector_io::chroma_remote
  backend: kv_default
- files:
- - provider_id: meta-reference-files
- provider_type: inline::localfs
- config:
- storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
- metadata_store:
- table_name: files_metadata
- backend: sql_default
  safety:
  - provider_id: llama-guard
  provider_type: inline::llama-guard
@@ -108,6 +100,9 @@ storage:
  conversations:
  table_name: openai_conversations
  backend: sql_default
+ prompts:
+ namespace: prompts
+ backend: kv_default
  registered_resources:
  models:
  - metadata:


@@ -1,610 +0,0 @@
# yaml-language-server: $schema=https://app.stainlessapi.com/config-internal.schema.json
organization:
# Name of your organization or company, used to determine the name of the client
# and headings.
name: llama-stack-client
docs: https://llama-stack.readthedocs.io/en/latest/
contact: llamastack@meta.com
security:
- {}
- BearerAuth: []
security_schemes:
BearerAuth:
type: http
scheme: bearer
# `targets` define the output targets and their customization options, such as
# whether to emit the Node SDK and what it's package name should be.
targets:
node:
package_name: llama-stack-client
production_repo: llamastack/llama-stack-client-typescript
publish:
npm: false
python:
package_name: llama_stack_client
production_repo: llamastack/llama-stack-client-python
options:
use_uv: true
publish:
pypi: true
project_name: llama_stack_client
kotlin:
reverse_domain: com.llama_stack_client.api
production_repo: null
publish:
maven: false
go:
package_name: llama-stack-client
production_repo: llamastack/llama-stack-client-go
options:
enable_v2: true
back_compat_use_shared_package: false
# `client_settings` define settings for the API client, such as extra constructor
# arguments (used for authentication), retry behavior, idempotency, etc.
client_settings:
default_env_prefix: LLAMA_STACK_CLIENT
opts:
api_key:
type: string
read_env: LLAMA_STACK_CLIENT_API_KEY
auth: { security_scheme: BearerAuth }
nullable: true
# `environments` are a map of the name of the environment (e.g. "sandbox",
# "production") to the corresponding url to use.
environments:
production: http://any-hosted-llama-stack.com
# `pagination` defines [pagination schemes] which provides a template to match
# endpoints and generate next-page and auto-pagination helpers in the SDKs.
pagination:
- name: datasets_iterrows
type: offset
request:
dataset_id:
type: string
start_index:
type: integer
x-stainless-pagination-property:
purpose: offset_count_param
limit:
type: integer
response:
data:
type: array
items:
type: object
next_index:
type: integer
x-stainless-pagination-property:
purpose: offset_count_start_field
- name: openai_cursor_page
type: cursor
request:
limit:
type: integer
after:
type: string
x-stainless-pagination-property:
purpose: next_cursor_param
response:
data:
type: array
items: {}
has_more:
type: boolean
last_id:
type: string
x-stainless-pagination-property:
purpose: next_cursor_field
# `resources` define the structure and organziation for your API, such as how
# methods and models are grouped together and accessed. See the [configuration
# guide] for more information.
#
# [configuration guide]:
# https://app.stainlessapi.com/docs/guides/configure#resources
resources:
$shared:
models:
agent_config: AgentConfig
interleaved_content_item: InterleavedContentItem
interleaved_content: InterleavedContent
param_type: ParamType
safety_violation: SafetyViolation
sampling_params: SamplingParams
scoring_result: ScoringResult
message: Message
user_message: UserMessage
completion_message: CompletionMessage
tool_response_message: ToolResponseMessage
system_message: SystemMessage
tool_call: ToolCall
query_result: RAGQueryResult
document: RAGDocument
query_config: RAGQueryConfig
response_format: ResponseFormat
toolgroups:
models:
tool_group: ToolGroup
list_tool_groups_response: ListToolGroupsResponse
methods:
register: post /v1/toolgroups
get: get /v1/toolgroups/{toolgroup_id}
list: get /v1/toolgroups
unregister: delete /v1/toolgroups/{toolgroup_id}
tools:
methods:
get: get /v1/tools/{tool_name}
list:
endpoint: get /v1/tools
paginated: false
tool_runtime:
models:
tool_def: ToolDef
tool_invocation_result: ToolInvocationResult
methods:
list_tools:
endpoint: get /v1/tool-runtime/list-tools
paginated: false
invoke_tool: post /v1/tool-runtime/invoke
subresources:
rag_tool:
methods:
insert: post /v1/tool-runtime/rag-tool/insert
query: post /v1/tool-runtime/rag-tool/query
responses:
models:
response_object_stream: OpenAIResponseObjectStream
response_object: OpenAIResponseObject
methods:
create:
type: http
endpoint: post /v1/responses
streaming:
stream_event_model: responses.response_object_stream
param_discriminator: stream
retrieve: get /v1/responses/{response_id}
list:
type: http
endpoint: get /v1/responses
delete:
type: http
endpoint: delete /v1/responses/{response_id}
subresources:
input_items:
methods:
list:
type: http
endpoint: get /v1/responses/{response_id}/input_items
conversations:
models:
conversation_object: Conversation
methods:
create:
type: http
endpoint: post /v1/conversations
retrieve: get /v1/conversations/{conversation_id}
update:
type: http
endpoint: post /v1/conversations/{conversation_id}
delete:
type: http
endpoint: delete /v1/conversations/{conversation_id}
subresources:
items:
methods:
get:
type: http
endpoint: get /v1/conversations/{conversation_id}/items/{item_id}
list:
type: http
endpoint: get /v1/conversations/{conversation_id}/items
create:
type: http
endpoint: post /v1/conversations/{conversation_id}/items
inspect:
models:
healthInfo: HealthInfo
providerInfo: ProviderInfo
routeInfo: RouteInfo
versionInfo: VersionInfo
methods:
health: get /v1/health
version: get /v1/version
embeddings:
models:
create_embeddings_response: OpenAIEmbeddingsResponse
methods:
create: post /v1/embeddings
chat:
models:
chat_completion_chunk: OpenAIChatCompletionChunk
subresources:
completions:
methods:
create:
type: http
endpoint: post /v1/chat/completions
streaming:
stream_event_model: chat.chat_completion_chunk
param_discriminator: stream
list:
type: http
endpoint: get /v1/chat/completions
retrieve:
type: http
endpoint: get /v1/chat/completions/{completion_id}
completions:
methods:
create:
type: http
endpoint: post /v1/completions
streaming:
param_discriminator: stream
vector_io:
models:
queryChunksResponse: QueryChunksResponse
methods:
insert: post /v1/vector-io/insert
query: post /v1/vector-io/query
vector_stores:
models:
vector_store: VectorStoreObject
list_vector_stores_response: VectorStoreListResponse
vector_store_delete_response: VectorStoreDeleteResponse
vector_store_search_response: VectorStoreSearchResponsePage
methods:
create: post /v1/vector_stores
list:
endpoint: get /v1/vector_stores
retrieve: get /v1/vector_stores/{vector_store_id}
update: post /v1/vector_stores/{vector_store_id}
delete: delete /v1/vector_stores/{vector_store_id}
search: post /v1/vector_stores/{vector_store_id}/search
subresources:
files:
models:
vector_store_file: VectorStoreFileObject
methods:
list: get /v1/vector_stores/{vector_store_id}/files
retrieve: get /v1/vector_stores/{vector_store_id}/files/{file_id}
update: post /v1/vector_stores/{vector_store_id}/files/{file_id}
delete: delete /v1/vector_stores/{vector_store_id}/files/{file_id}
create: post /v1/vector_stores/{vector_store_id}/files
content: get /v1/vector_stores/{vector_store_id}/files/{file_id}/content
file_batches:
models:
vector_store_file_batches: VectorStoreFileBatchObject
list_vector_store_files_in_batch_response: VectorStoreFilesListInBatchResponse
methods:
create: post /v1/vector_stores/{vector_store_id}/file_batches
retrieve: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}
list_files: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files
cancel: post /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel
models:
models:
model: Model
list_models_response: ListModelsResponse
methods:
retrieve: get /v1/models/{model_id}
list:
endpoint: get /v1/models
paginated: false
register: post /v1/models
unregister: delete /v1/models/{model_id}
subresources:
openai:
methods:
list:
endpoint: get /v1/models
paginated: false
providers:
models:
list_providers_response: ListProvidersResponse
methods:
list:
endpoint: get /v1/providers
paginated: false
retrieve: get /v1/providers/{provider_id}
routes:
models:
list_routes_response: ListRoutesResponse
methods:
list:
endpoint: get /v1/inspect/routes
paginated: false
moderations:
models:
create_response: ModerationObject
methods:
create: post /v1/moderations
safety:
models:
run_shield_response: RunShieldResponse
methods:
run_shield: post /v1/safety/run-shield
shields:
models:
shield: Shield
list_shields_response: ListShieldsResponse
methods:
retrieve: get /v1/shields/{identifier}
list:
endpoint: get /v1/shields
paginated: false
register: post /v1/shields
delete: delete /v1/shields/{identifier}
synthetic_data_generation:
models:
syntheticDataGenerationResponse: SyntheticDataGenerationResponse
methods:
generate: post /v1/synthetic-data-generation/generate
telemetry:
models:
span_with_status: SpanWithStatus
trace: Trace
query_spans_response: QuerySpansResponse
event: Event
query_condition: QueryCondition
methods:
query_traces:
endpoint: post /v1alpha/telemetry/traces
skip_test_reason: 'unsupported query params in java / kotlin'
get_span_tree: post /v1alpha/telemetry/spans/{span_id}/tree
query_spans:
endpoint: post /v1alpha/telemetry/spans
skip_test_reason: 'unsupported query params in java / kotlin'
query_metrics:
endpoint: post /v1alpha/telemetry/metrics/{metric_name}
skip_test_reason: 'unsupported query params in java / kotlin'
# log_event: post /v1alpha/telemetry/events
save_spans_to_dataset: post /v1alpha/telemetry/spans/export
get_span: get /v1alpha/telemetry/traces/{trace_id}/spans/{span_id}
get_trace: get /v1alpha/telemetry/traces/{trace_id}
scoring:
methods:
score: post /v1/scoring/score
score_batch: post /v1/scoring/score-batch
scoring_functions:
methods:
retrieve: get /v1/scoring-functions/{scoring_fn_id}
list:
endpoint: get /v1/scoring-functions
paginated: false
register: post /v1/scoring-functions
models:
scoring_fn: ScoringFn
scoring_fn_params: ScoringFnParams
list_scoring_functions_response: ListScoringFunctionsResponse
benchmarks:
methods:
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}
list:
endpoint: get /v1alpha/eval/benchmarks
paginated: false
register: post /v1alpha/eval/benchmarks
models:
benchmark: Benchmark
list_benchmarks_response: ListBenchmarksResponse
files:
methods:
create: post /v1/files
list: get /v1/files
retrieve: get /v1/files/{file_id}
delete: delete /v1/files/{file_id}
content: get /v1/files/{file_id}/content
models:
file: OpenAIFileObject
list_files_response: ListOpenAIFileResponse
delete_file_response: OpenAIFileDeleteResponse
alpha:
subresources:
inference:
methods:
rerank: post /v1alpha/inference/rerank
post_training:
models:
algorithm_config: AlgorithmConfig
post_training_job: PostTrainingJob
list_post_training_jobs_response: ListPostTrainingJobsResponse
methods:
preference_optimize: post /v1alpha/post-training/preference-optimize
supervised_fine_tune: post /v1alpha/post-training/supervised-fine-tune
subresources:
job:
methods:
artifacts: get /v1alpha/post-training/job/artifacts
cancel: post /v1alpha/post-training/job/cancel
status: get /v1alpha/post-training/job/status
list:
endpoint: get /v1alpha/post-training/jobs
paginated: false
eval:
methods:
evaluate_rows: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
run_eval: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
evaluate_rows_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
run_eval_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
subresources:
jobs:
methods:
cancel: delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
status: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result
models:
evaluate_response: EvaluateResponse
benchmark_config: BenchmarkConfig
job: Job
agents:
methods:
create: post /v1alpha/agents
list: get /v1alpha/agents
retrieve: get /v1alpha/agents/{agent_id}
delete: delete /v1alpha/agents/{agent_id}
models:
inference_step: InferenceStep
tool_execution_step: ToolExecutionStep
tool_response: ToolResponse
shield_call_step: ShieldCallStep
memory_retrieval_step: MemoryRetrievalStep
subresources:
session:
models:
session: Session
methods:
list: get /v1alpha/agents/{agent_id}/sessions
create: post /v1alpha/agents/{agent_id}/session
delete: delete /v1alpha/agents/{agent_id}/session/{session_id}
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}
steps:
methods:
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}
turn:
models:
turn: Turn
turn_response_event: AgentTurnResponseEvent
agent_turn_response_stream_chunk: AgentTurnResponseStreamChunk
methods:
create:
type: http
endpoint: post /v1alpha/agents/{agent_id}/session/{session_id}/turn
streaming:
stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
param_discriminator: stream
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}
resume:
type: http
endpoint: post /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume
streaming:
stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
param_discriminator: stream
beta:
subresources:
datasets:
models:
list_datasets_response: ListDatasetsResponse
methods:
register: post /v1beta/datasets
retrieve: get /v1beta/datasets/{dataset_id}
list:
endpoint: get /v1beta/datasets
paginated: false
unregister: delete /v1beta/datasets/{dataset_id}
iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
appendrows: post /v1beta/datasetio/append-rows/{dataset_id}
settings:
license: MIT
unwrap_response_fields: [ data ]
openapi:
transformations:
- command: renameValue
reason: pydantic reserved name
args:
filter:
only:
- '$.components.schemas.InferenceStep.properties.model_response'
rename:
python:
property_name: 'inference_model_response'
# - command: renameValue
# reason: pydantic reserved name
# args:
# filter:
# only:
# - '$.components.schemas.Model.properties.model_type'
# rename:
# python:
# property_name: 'type'
- command: mergeObject
reason: Better return_type using enum
args:
target:
- '$.components.schemas'
object:
ReturnType:
additionalProperties: false
properties:
type:
enum:
- string
- number
- boolean
- array
- object
- json
- union
- chat_completion_input
- completion_input
- agent_turn_input
required:
- type
type: object
- command: replaceProperties
reason: Replace return type properties with better model (see above)
args:
filter:
only:
- '$.components.schemas.ScoringFn.properties.return_type'
- '$.components.schemas.RegisterScoringFunctionRequest.properties.return_type'
value:
$ref: '#/components/schemas/ReturnType'
- command: oneOfToAnyOf
reason: Prism (mock server) doesn't like one of our requests as it technically matches multiple variants
- reason: For better names
command: extractToRefs
args:
ref:
target: '$.components.schemas.ToolCallDelta.properties.tool_call'
name: '#/components/schemas/ToolCallOrString'
# `readme` is used to configure the code snippets that will be rendered in the
# README.md of various SDKs. In particular, you can change the `headline`
# snippet's endpoint and the arguments to call it with.
readme:
example_requests:
default:
type: request
endpoint: post /v1/chat/completions
params: &ref_0 {}
headline:
type: request
endpoint: post /v1/models
params: *ref_0
pagination:
type: request
endpoint: post /v1/chat/completions
params: {}

View file

@ -15,6 +15,141 @@ info:
servers: servers:
- url: http://any-hosted-llama-stack.com - url: http://any-hosted-llama-stack.com
paths: paths:
/v1/batches:
get:
responses:
'200':
description: A list of batch objects.
content:
application/json:
schema:
$ref: '#/components/schemas/ListBatchesResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Batches
summary: List all batches for the current user.
description: List all batches for the current user.
parameters:
- name: after
in: query
description: >-
A cursor for pagination; returns batches after this batch ID.
required: false
schema:
type: string
- name: limit
in: query
description: >-
Number of batches to return (default 20, max 100).
required: true
schema:
type: integer
deprecated: false
post:
responses:
'200':
description: The created batch object.
content:
application/json:
schema:
$ref: '#/components/schemas/Batch'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Batches
summary: >-
Create a new batch for processing multiple API requests.
description: >-
Create a new batch for processing multiple API requests.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/CreateBatchRequest'
required: true
deprecated: false
/v1/batches/{batch_id}:
get:
responses:
'200':
description: The batch object.
content:
application/json:
schema:
$ref: '#/components/schemas/Batch'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Batches
summary: >-
Retrieve information about a specific batch.
description: >-
Retrieve information about a specific batch.
parameters:
- name: batch_id
in: path
description: The ID of the batch to retrieve.
required: true
schema:
type: string
deprecated: false
/v1/batches/{batch_id}/cancel:
post:
responses:
'200':
description: The updated batch object.
content:
application/json:
schema:
$ref: '#/components/schemas/Batch'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Batches
summary: Cancel a batch that is in progress.
description: Cancel a batch that is in progress.
parameters:
- name: batch_id
in: path
description: The ID of the batch to cancel.
required: true
schema:
type: string
deprecated: false
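  # Illustrative only (not part of the generated spec): these endpoints mirror the
  # OpenAI Batches API, so a typical call against a Llama Stack server is expected
  # to look roughly like
  #   batch = client.batches.create(
  #       input_file_id="file-abc123",
  #       endpoint="/v1/chat/completions",
  #       completion_window="24h",
  #   )
  # followed by polling client.batches.retrieve(batch.id) until the status is
  # "completed". Method names and identifiers here are assumptions based on
  # OpenAI-compatible clients, not taken from this spec.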
/v1/chat/completions: /v1/chat/completions:
get: get:
responses: responses:
@ -4212,6 +4347,331 @@ components:
title: Error title: Error
description: >- description: >-
Error response from the API. Roughly follows RFC 7807. Error response from the API. Roughly follows RFC 7807.
ListBatchesResponse:
type: object
properties:
object:
type: string
const: list
default: list
data:
type: array
items:
type: object
properties:
id:
type: string
completion_window:
type: string
created_at:
type: integer
endpoint:
type: string
input_file_id:
type: string
object:
type: string
const: batch
status:
type: string
enum:
- validating
- failed
- in_progress
- finalizing
- completed
- expired
- cancelling
- cancelled
cancelled_at:
type: integer
cancelling_at:
type: integer
completed_at:
type: integer
error_file_id:
type: string
errors:
type: object
properties:
data:
type: array
items:
type: object
properties:
code:
type: string
line:
type: integer
message:
type: string
param:
type: string
additionalProperties: false
title: BatchError
object:
type: string
additionalProperties: false
title: Errors
expired_at:
type: integer
expires_at:
type: integer
failed_at:
type: integer
finalizing_at:
type: integer
in_progress_at:
type: integer
metadata:
type: object
additionalProperties:
type: string
model:
type: string
output_file_id:
type: string
request_counts:
type: object
properties:
completed:
type: integer
failed:
type: integer
total:
type: integer
additionalProperties: false
required:
- completed
- failed
- total
title: BatchRequestCounts
usage:
type: object
properties:
input_tokens:
type: integer
input_tokens_details:
type: object
properties:
cached_tokens:
type: integer
additionalProperties: false
required:
- cached_tokens
title: InputTokensDetails
output_tokens:
type: integer
output_tokens_details:
type: object
properties:
reasoning_tokens:
type: integer
additionalProperties: false
required:
- reasoning_tokens
title: OutputTokensDetails
total_tokens:
type: integer
additionalProperties: false
required:
- input_tokens
- input_tokens_details
- output_tokens
- output_tokens_details
- total_tokens
title: BatchUsage
additionalProperties: false
required:
- id
- completion_window
- created_at
- endpoint
- input_file_id
- object
- status
title: Batch
first_id:
type: string
last_id:
type: string
has_more:
type: boolean
default: false
additionalProperties: false
required:
- object
- data
- has_more
title: ListBatchesResponse
description: >-
Response containing a list of batch objects.
CreateBatchRequest:
type: object
properties:
input_file_id:
type: string
description: >-
The ID of an uploaded file containing requests for the batch.
endpoint:
type: string
description: >-
The endpoint to be used for all requests in the batch.
completion_window:
type: string
const: 24h
description: >-
The time window within which the batch should be processed.
metadata:
type: object
additionalProperties:
type: string
description: Optional metadata for the batch.
idempotency_key:
type: string
description: >-
Optional idempotency key. When provided, enables idempotent behavior.
additionalProperties: false
required:
- input_file_id
- endpoint
- completion_window
title: CreateBatchRequest
Batch:
type: object
properties:
id:
type: string
completion_window:
type: string
created_at:
type: integer
endpoint:
type: string
input_file_id:
type: string
object:
type: string
const: batch
status:
type: string
enum:
- validating
- failed
- in_progress
- finalizing
- completed
- expired
- cancelling
- cancelled
cancelled_at:
type: integer
cancelling_at:
type: integer
completed_at:
type: integer
error_file_id:
type: string
errors:
type: object
properties:
data:
type: array
items:
type: object
properties:
code:
type: string
line:
type: integer
message:
type: string
param:
type: string
additionalProperties: false
title: BatchError
object:
type: string
additionalProperties: false
title: Errors
expired_at:
type: integer
expires_at:
type: integer
failed_at:
type: integer
finalizing_at:
type: integer
in_progress_at:
type: integer
metadata:
type: object
additionalProperties:
type: string
model:
type: string
output_file_id:
type: string
request_counts:
type: object
properties:
completed:
type: integer
failed:
type: integer
total:
type: integer
additionalProperties: false
required:
- completed
- failed
- total
title: BatchRequestCounts
usage:
type: object
properties:
input_tokens:
type: integer
input_tokens_details:
type: object
properties:
cached_tokens:
type: integer
additionalProperties: false
required:
- cached_tokens
title: InputTokensDetails
output_tokens:
type: integer
output_tokens_details:
type: object
properties:
reasoning_tokens:
type: integer
additionalProperties: false
required:
- reasoning_tokens
title: OutputTokensDetails
total_tokens:
type: integer
additionalProperties: false
required:
- input_tokens
- input_tokens_details
- output_tokens
- output_tokens_details
- total_tokens
title: BatchUsage
additionalProperties: false
required:
- id
- completion_window
- created_at
- endpoint
- input_file_id
- object
- status
title: Batch
Order: Order:
type: string type: string
enum: enum:
@ -5474,11 +5934,44 @@ components:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseInputMessageContentText' - $ref: '#/components/schemas/OpenAIResponseInputMessageContentText'
- $ref: '#/components/schemas/OpenAIResponseInputMessageContentImage' - $ref: '#/components/schemas/OpenAIResponseInputMessageContentImage'
- $ref: '#/components/schemas/OpenAIResponseInputMessageContentFile'
discriminator: discriminator:
propertyName: type propertyName: type
mapping: mapping:
input_text: '#/components/schemas/OpenAIResponseInputMessageContentText' input_text: '#/components/schemas/OpenAIResponseInputMessageContentText'
input_image: '#/components/schemas/OpenAIResponseInputMessageContentImage' input_image: '#/components/schemas/OpenAIResponseInputMessageContentImage'
input_file: '#/components/schemas/OpenAIResponseInputMessageContentFile'
OpenAIResponseInputMessageContentFile:
type: object
properties:
type:
type: string
const: input_file
default: input_file
description: >-
The type of the input item. Always `input_file`.
file_data:
type: string
description: >-
The data of the file to be sent to the model.
file_id:
type: string
description: >-
(Optional) The ID of the file to be sent to the model.
file_url:
type: string
description: >-
The URL of the file to be sent to the model.
filename:
type: string
description: >-
The name of the file to be sent to the model.
additionalProperties: false
required:
- type
title: OpenAIResponseInputMessageContentFile
description: >-
File content for input messages in OpenAI response format.
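    # Illustrative only: in a request body, an `input_file` content item might look
    # like {"type": "input_file", "file_id": "file-abc123", "filename": "report.pdf"}.
    # The identifier and filename are placeholders; only the field names come from
    # the schema above.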
OpenAIResponseInputMessageContentImage: OpenAIResponseInputMessageContentImage:
type: object type: object
properties: properties:
@ -5499,6 +5992,10 @@ components:
default: input_image default: input_image
description: >- description: >-
Content type identifier, always "input_image" Content type identifier, always "input_image"
file_id:
type: string
description: >-
(Optional) The ID of the file to be sent to the model.
image_url: image_url:
type: string type: string
description: (Optional) URL of the image content description: (Optional) URL of the image content
@ -6735,14 +7232,9 @@ components:
Error details for failed OpenAI response requests. Error details for failed OpenAI response requests.
OpenAIResponseInput: OpenAIResponseInput:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' - $ref: '#/components/schemas/OpenAIResponseOutput'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
- $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput' - $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse' - $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
- $ref: '#/components/schemas/OpenAIResponseMessage' - $ref: '#/components/schemas/OpenAIResponseMessage'
OpenAIResponseInputToolFileSearch: OpenAIResponseInputToolFileSearch:
type: object type: object
@ -6898,6 +7390,10 @@ components:
type: string type: string
description: >- description: >-
(Optional) ID of the previous response in a conversation (Optional) ID of the previous response in a conversation
prompt:
$ref: '#/components/schemas/OpenAIResponsePrompt'
description: >-
(Optional) Reference to a prompt template and its variables.
status: status:
type: string type: string
description: >- description: >-
@ -6971,6 +7467,30 @@ components:
mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall' mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools' mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
mcp_approval_request: '#/components/schemas/OpenAIResponseMCPApprovalRequest' mcp_approval_request: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
OpenAIResponsePrompt:
type: object
properties:
id:
type: string
description: Unique identifier of the prompt template
variables:
type: object
additionalProperties:
$ref: '#/components/schemas/OpenAIResponseInputMessageContent'
description: >-
Dictionary of variable names to OpenAIResponseInputMessageContent structure
for template substitution. The substitution values can either be strings,
or other Response input types like images or files.
version:
type: string
description: >-
Version number of the prompt to use (defaults to latest if not specified)
additionalProperties: false
required:
- id
title: OpenAIResponsePrompt
description: >-
OpenAI compatible Prompt object that is used in OpenAI responses.
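    # Illustrative only: a create-response request referencing a stored prompt might
    # include
    #   prompt: {"id": "prompt_123", "version": "2",
    #            "variables": {"customer_name": {"type": "input_text", "text": "Ada"}}}
    # The prompt ID and variable values are placeholders; the shape follows the
    # schema above.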
OpenAIResponseText: OpenAIResponseText:
type: object type: object
properties: properties:
@ -7228,6 +7748,10 @@ components:
model: model:
type: string type: string
description: The underlying LLM used for completions. description: The underlying LLM used for completions.
prompt:
$ref: '#/components/schemas/OpenAIResponsePrompt'
description: >-
(Optional) Prompt object with ID, version, and variables.
instructions: instructions:
type: string type: string
previous_response_id: previous_response_id:
@ -7305,6 +7829,10 @@ components:
type: string type: string
description: >- description: >-
(Optional) ID of the previous response in a conversation (Optional) ID of the previous response in a conversation
prompt:
$ref: '#/components/schemas/OpenAIResponsePrompt'
description: >-
(Optional) Reference to a prompt template and its variables.
status: status:
type: string type: string
description: >- description: >-
@ -9867,7 +10395,7 @@ components:
$ref: '#/components/schemas/RAGDocument' $ref: '#/components/schemas/RAGDocument'
description: >- description: >-
List of documents to index in the RAG system List of documents to index in the RAG system
vector_db_id: vector_store_id:
type: string type: string
description: >- description: >-
ID of the vector database to store the document embeddings ID of the vector database to store the document embeddings
@ -9878,7 +10406,7 @@ components:
additionalProperties: false additionalProperties: false
required: required:
- documents - documents
- vector_db_id - vector_store_id
- chunk_size_in_tokens - chunk_size_in_tokens
title: InsertRequest title: InsertRequest
DefaultRAGQueryGeneratorConfig: DefaultRAGQueryGeneratorConfig:
@ -10049,7 +10577,7 @@ components:
$ref: '#/components/schemas/InterleavedContent' $ref: '#/components/schemas/InterleavedContent'
description: >- description: >-
The query content to search for in the indexed documents The query content to search for in the indexed documents
vector_db_ids: vector_store_ids:
type: array type: array
items: items:
type: string type: string
@ -10062,7 +10590,7 @@ components:
additionalProperties: false additionalProperties: false
required: required:
- content - content
- vector_db_ids - vector_store_ids
title: QueryRequest title: QueryRequest
RAGQueryResult: RAGQueryResult:
type: object type: object
@ -10190,6 +10718,10 @@ components:
description: >- description: >-
The content of the chunk, which can be interleaved text, images, or other The content of the chunk, which can be interleaved text, images, or other
types. types.
chunk_id:
type: string
description: >-
Unique identifier for the chunk. Must be provided explicitly.
metadata: metadata:
type: object type: object
additionalProperties: additionalProperties:
@ -10210,10 +10742,6 @@ components:
description: >- description: >-
Optional embedding for the chunk. If not provided, it will be computed Optional embedding for the chunk. If not provided, it will be computed
later. later.
stored_chunk_id:
type: string
description: >-
The chunk ID that is stored in the vector database. Used for backend functionality.
chunk_metadata: chunk_metadata:
$ref: '#/components/schemas/ChunkMetadata' $ref: '#/components/schemas/ChunkMetadata'
description: >- description: >-
@ -10222,6 +10750,7 @@ components:
additionalProperties: false additionalProperties: false
required: required:
- content - content
- chunk_id
- metadata - metadata
title: Chunk title: Chunk
description: >- description: >-
@ -10286,7 +10815,7 @@ components:
InsertChunksRequest: InsertChunksRequest:
type: object type: object
properties: properties:
vector_db_id: vector_store_id:
type: string type: string
description: >- description: >-
The identifier of the vector database to insert the chunks into. The identifier of the vector database to insert the chunks into.
@ -10305,13 +10834,13 @@ components:
description: The time to live of the chunks. description: The time to live of the chunks.
additionalProperties: false additionalProperties: false
required: required:
- vector_db_id - vector_store_id
- chunks - chunks
title: InsertChunksRequest title: InsertChunksRequest
QueryChunksRequest: QueryChunksRequest:
type: object type: object
properties: properties:
vector_db_id: vector_store_id:
type: string type: string
description: >- description: >-
The identifier of the vector database to query. The identifier of the vector database to query.
@ -10331,7 +10860,7 @@ components:
description: The parameters of the query. description: The parameters of the query.
additionalProperties: false additionalProperties: false
required: required:
- vector_db_id - vector_store_id
- query - query
title: QueryChunksRequest title: QueryChunksRequest
QueryChunksResponse: QueryChunksResponse:
@ -11600,7 +12129,6 @@ components:
description: The sampling strategy. description: The sampling strategy.
max_tokens: max_tokens:
type: integer type: integer
default: 0
description: >- description: >-
The maximum number of tokens that can be generated in the completion. The maximum number of tokens that can be generated in the completion.
The token count of your prompt plus max_tokens cannot exceed the model's The token count of your prompt plus max_tokens cannot exceed the model's
@ -11850,7 +12378,7 @@ components:
description: Type of the step in an agent turn. description: Type of the step in an agent turn.
const: memory_retrieval const: memory_retrieval
default: memory_retrieval default: memory_retrieval
vector_db_ids: vector_store_ids:
type: string type: string
description: >- description: >-
The IDs of the vector databases to retrieve context from. The IDs of the vector databases to retrieve context from.
@ -11863,7 +12391,7 @@ components:
- turn_id - turn_id
- step_id - step_id
- step_type - step_type
- vector_db_ids - vector_store_ids
- inserted_context - inserted_context
title: MemoryRetrievalStep title: MemoryRetrievalStep
description: >- description: >-
@ -13460,6 +13988,19 @@ tags:
description: >- description: >-
APIs for creating and interacting with agentic systems. APIs for creating and interacting with agentic systems.
x-displayName: Agents x-displayName: Agents
- name: Batches
description: >-
The API is designed to allow use of openai client libraries for seamless integration.
This API provides the following extensions:
- idempotent batch creation
Note: This API is currently under active development and may undergo changes.
x-displayName: >-
The Batches API enables efficient processing of multiple requests in a single
operation, particularly useful for processing large datasets, batch evaluation
workflows, and cost-effective inference at scale.
- name: Benchmarks - name: Benchmarks
description: '' description: ''
- name: Conversations - name: Conversations
@ -13534,6 +14075,7 @@ x-tagGroups:
- name: Operations - name: Operations
tags: tags:
- Agents - Agents
- Batches
- Benchmarks - Benchmarks
- Conversations - Conversations
- DatasetIO - DatasetIO

View file

@ -58,13 +58,21 @@ storage:
sql_default: sql_default:
type: sql_sqlite type: sql_sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/sqlstore.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/sqlstore.db
references: stores:
metadata: metadata:
backend: kv_default backend: kv_default
namespace: registry namespace: registry
inference: inference:
backend: sql_default backend: sql_default
table_name: inference_store table_name: inference_store
max_write_queue_size: 10000
num_writers: 4
conversations:
backend: sql_default
table_name: openai_conversations
prompts:
backend: kv_default
namespace: prompts
models: models:
- metadata: {} - metadata: {}
model_id: ${env.INFERENCE_MODEL} model_id: ${env.INFERENCE_MODEL}

View file

@ -113,13 +113,21 @@ data:
db: ${env.POSTGRES_DB:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack} user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack} password: ${env.POSTGRES_PASSWORD:=llamastack}
references: stores:
metadata: metadata:
backend: kv_default backend: kv_default
namespace: registry namespace: registry
inference: inference:
backend: sql_default backend: sql_default
table_name: inference_store table_name: inference_store
max_write_queue_size: 10000
num_writers: 4
conversations:
backend: sql_default
table_name: openai_conversations
prompts:
backend: kv_default
namespace: prompts
models: models:
- metadata: - metadata:
embedding_dimension: 768 embedding_dimension: 768

View file

@ -106,6 +106,9 @@ storage:
conversations: conversations:
table_name: openai_conversations table_name: openai_conversations
backend: sql_default backend: sql_default
prompts:
namespace: prompts
backend: kv_default
registered_resources: registered_resources:
models: models:
- metadata: - metadata:

View file

@ -79,6 +79,33 @@ docker run \
--port $LLAMA_STACK_PORT --port $LLAMA_STACK_PORT
``` ```
### Via Docker with Custom Run Configuration
You can also run the Docker container with a custom run configuration file by mounting it into the container:
```bash
# Set the path to your custom run.yaml file
CUSTOM_RUN_CONFIG=/path/to/your/custom-run.yaml
LLAMA_STACK_PORT=8321
docker run \
-it \
--pull always \
--gpus all \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-v $CUSTOM_RUN_CONFIG:/app/custom-run.yaml \
-e RUN_CONFIG_PATH=/app/custom-run.yaml \
llamastack/distribution-meta-reference-gpu \
--port $LLAMA_STACK_PORT
```
**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
Available run configurations for this distribution:
- `run.yaml`
- `run-with-safety.yaml`
### Via venv ### Via venv
Make sure you have the Llama Stack CLI available. Make sure you have the Llama Stack CLI available.

View file

@ -127,13 +127,39 @@ docker run \
-it \ -it \
--pull always \ --pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \ -v ~/.llama:/root/.llama \
-e NVIDIA_API_KEY=$NVIDIA_API_KEY \ -e NVIDIA_API_KEY=$NVIDIA_API_KEY \
llamastack/distribution-nvidia \ llamastack/distribution-nvidia \
--config /root/my-run.yaml \
--port $LLAMA_STACK_PORT --port $LLAMA_STACK_PORT
``` ```
### Via Docker with Custom Run Configuration
You can also run the Docker container with a custom run configuration file by mounting it into the container:
```bash
# Set the path to your custom run.yaml file
CUSTOM_RUN_CONFIG=/path/to/your/custom-run.yaml
LLAMA_STACK_PORT=8321
docker run \
-it \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-v $CUSTOM_RUN_CONFIG:/app/custom-run.yaml \
-e RUN_CONFIG_PATH=/app/custom-run.yaml \
-e NVIDIA_API_KEY=$NVIDIA_API_KEY \
llamastack/distribution-nvidia \
--port $LLAMA_STACK_PORT
```
**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
Available run configurations for this distribution:
- `run.yaml`
- `run-with-safety.yaml`
### Via venv ### Via venv
If you've set up your local development environment, you can also install the distribution dependencies using your local virtual environment. If you've set up your local development environment, you can also install the distribution dependencies using your local virtual environment.

View file

@ -0,0 +1,27 @@
---
description: "OpenAI Files API provider for managing files through OpenAI's native file storage service."
sidebar_label: Remote - Openai
title: remote::openai
---
# remote::openai
## Description
OpenAI Files API provider for managing files through OpenAI's native file storage service.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `api_key` | `<class 'str'>` | No | | OpenAI API key for authentication |
| `metadata_store` | `<class 'llama_stack.core.storage.datatypes.SqlStoreReference'>` | No | | SQL store configuration for file metadata |
## Sample Configuration
```yaml
api_key: ${env.OPENAI_API_KEY}
metadata_store:
table_name: openai_files_metadata
backend: sql_default
```
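
## Example Usage

A minimal sketch, assuming a Llama Stack server configured with this provider is reachable at `http://localhost:8321` and exposes the OpenAI-compatible `/v1/files` endpoint. Client usage follows the standard `openai` Python package; the file name, purpose, and base URL are placeholders, not values taken from this page:

```python
# Hypothetical usage of the Files API through a Llama Stack deployment that
# routes file storage to OpenAI via this provider.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

# Upload a local file; the provider persists it through OpenAI's file service.
with open("report.pdf", "rb") as f:
    uploaded = client.files.create(file=f, purpose="batch")

print(uploaded.id, uploaded.filename)
```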

View file

@ -20,6 +20,7 @@ NVIDIA inference provider for accessing NVIDIA NIM models and AI services.
| `url` | `<class 'str'>` | No | https://integrate.api.nvidia.com | A base url for accessing the NVIDIA NIM | | `url` | `<class 'str'>` | No | https://integrate.api.nvidia.com | A base url for accessing the NVIDIA NIM |
| `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests | | `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |
| `append_api_version` | `<class 'bool'>` | No | True | When set to false, the API version will not be appended to the base_url. By default, it is true. | | `append_api_version` | `<class 'bool'>` | No | True | When set to false, the API version will not be appended to the base_url. By default, it is true. |
| `rerank_model_to_url` | `dict[str, str]` | No | `{'nv-rerank-qa-mistral-4b:1': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking', 'nvidia/nv-rerankqa-mistral-4b-v3': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/nv-rerankqa-mistral-4b-v3/reranking', 'nvidia/llama-3.2-nv-rerankqa-1b-v2': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking'}` | Mapping of rerank model identifiers to their API endpoints. |
## Sample Configuration ## Sample Configuration

View file

@ -72,14 +72,14 @@ description: |
Example with hybrid search: Example with hybrid search:
```python ```python
response = await vector_io.query_chunks( response = await vector_io.query_chunks(
vector_db_id="my_db", vector_store_id="my_db",
query="your query here", query="your query here",
params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7}, params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
) )
# Using RRF ranker # Using RRF ranker
response = await vector_io.query_chunks( response = await vector_io.query_chunks(
vector_db_id="my_db", vector_store_id="my_db",
query="your query here", query="your query here",
params={ params={
"mode": "hybrid", "mode": "hybrid",
@ -91,7 +91,7 @@ description: |
# Using weighted ranker # Using weighted ranker
response = await vector_io.query_chunks( response = await vector_io.query_chunks(
vector_db_id="my_db", vector_store_id="my_db",
query="your query here", query="your query here",
params={ params={
"mode": "hybrid", "mode": "hybrid",
@ -105,7 +105,7 @@ description: |
Example with explicit vector search: Example with explicit vector search:
```python ```python
response = await vector_io.query_chunks( response = await vector_io.query_chunks(
vector_db_id="my_db", vector_store_id="my_db",
query="your query here", query="your query here",
params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7}, params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
) )
@ -114,7 +114,7 @@ description: |
Example with keyword search: Example with keyword search:
```python ```python
response = await vector_io.query_chunks( response = await vector_io.query_chunks(
vector_db_id="my_db", vector_store_id="my_db",
query="your query here", query="your query here",
params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7}, params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
) )
@ -277,14 +277,14 @@ The SQLite-vec provider supports three search modes:
Example with hybrid search: Example with hybrid search:
```python ```python
response = await vector_io.query_chunks( response = await vector_io.query_chunks(
vector_db_id="my_db", vector_store_id="my_db",
query="your query here", query="your query here",
params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7}, params={"mode": "hybrid", "max_chunks": 3, "score_threshold": 0.7},
) )
# Using RRF ranker # Using RRF ranker
response = await vector_io.query_chunks( response = await vector_io.query_chunks(
vector_db_id="my_db", vector_store_id="my_db",
query="your query here", query="your query here",
params={ params={
"mode": "hybrid", "mode": "hybrid",
@ -296,7 +296,7 @@ response = await vector_io.query_chunks(
# Using weighted ranker # Using weighted ranker
response = await vector_io.query_chunks( response = await vector_io.query_chunks(
vector_db_id="my_db", vector_store_id="my_db",
query="your query here", query="your query here",
params={ params={
"mode": "hybrid", "mode": "hybrid",
@ -310,7 +310,7 @@ response = await vector_io.query_chunks(
Example with explicit vector search: Example with explicit vector search:
```python ```python
response = await vector_io.query_chunks( response = await vector_io.query_chunks(
vector_db_id="my_db", vector_store_id="my_db",
query="your query here", query="your query here",
params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7}, params={"mode": "vector", "max_chunks": 3, "score_threshold": 0.7},
) )
@ -319,7 +319,7 @@ response = await vector_io.query_chunks(
Example with keyword search: Example with keyword search:
```python ```python
response = await vector_io.query_chunks( response = await vector_io.query_chunks(
vector_db_id="my_db", vector_store_id="my_db",
query="your query here", query="your query here",
params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7}, params={"mode": "keyword", "max_chunks": 3, "score_threshold": 0.7},
) )

File diff suppressed because it is too large Load diff

View file

@ -242,15 +242,6 @@ const sidebars: SidebarsConfig = {
'providers/eval/remote_nvidia' 'providers/eval/remote_nvidia'
], ],
}, },
{
type: 'category',
label: 'Telemetry',
collapsed: true,
items: [
'providers/telemetry/index',
'providers/telemetry/inline_meta-reference'
],
},
{ {
type: 'category', type: 'category',
label: 'Batches', label: 'Batches',

View file

@ -1414,6 +1414,193 @@
"deprecated": true "deprecated": true
} }
}, },
"/v1/openai/v1/batches": {
"get": {
"responses": {
"200": {
"description": "A list of batch objects.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ListBatchesResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Batches"
],
"summary": "List all batches for the current user.",
"description": "List all batches for the current user.",
"parameters": [
{
"name": "after",
"in": "query",
"description": "A cursor for pagination; returns batches after this batch ID.",
"required": false,
"schema": {
"type": "string"
}
},
{
"name": "limit",
"in": "query",
"description": "Number of batches to return (default 20, max 100).",
"required": true,
"schema": {
"type": "integer"
}
}
],
"deprecated": true
},
"post": {
"responses": {
"200": {
"description": "The created batch object.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Batch"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Batches"
],
"summary": "Create a new batch for processing multiple API requests.",
"description": "Create a new batch for processing multiple API requests.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CreateBatchRequest"
}
}
},
"required": true
},
"deprecated": true
}
},
"/v1/openai/v1/batches/{batch_id}": {
"get": {
"responses": {
"200": {
"description": "The batch object.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Batch"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Batches"
],
"summary": "Retrieve information about a specific batch.",
"description": "Retrieve information about a specific batch.",
"parameters": [
{
"name": "batch_id",
"in": "path",
"description": "The ID of the batch to retrieve.",
"required": true,
"schema": {
"type": "string"
}
}
],
"deprecated": true
}
},
"/v1/openai/v1/batches/{batch_id}/cancel": {
"post": {
"responses": {
"200": {
"description": "The updated batch object.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Batch"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Batches"
],
"summary": "Cancel a batch that is in progress.",
"description": "Cancel a batch that is in progress.",
"parameters": [
{
"name": "batch_id",
"in": "path",
"description": "The ID of the batch to cancel.",
"required": true,
"schema": {
"type": "string"
}
}
],
"deprecated": true
}
},
"/v1/openai/v1/chat/completions": { "/v1/openai/v1/chat/completions": {
"get": { "get": {
"responses": { "responses": {
@ -3901,7 +4088,6 @@
}, },
"max_tokens": { "max_tokens": {
"type": "integer", "type": "integer",
"default": 0,
"description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length." "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
}, },
"repetition_penalty": { "repetition_penalty": {
@ -4391,7 +4577,7 @@
"const": "memory_retrieval", "const": "memory_retrieval",
"default": "memory_retrieval" "default": "memory_retrieval"
}, },
"vector_db_ids": { "vector_store_ids": {
"type": "string", "type": "string",
"description": "The IDs of the vector databases to retrieve context from." "description": "The IDs of the vector databases to retrieve context from."
}, },
@ -4405,7 +4591,7 @@
"turn_id", "turn_id",
"step_id", "step_id",
"step_type", "step_type",
"vector_db_ids", "vector_store_ids",
"inserted_context" "inserted_context"
], ],
"title": "MemoryRetrievalStep", "title": "MemoryRetrievalStep",
@ -6402,6 +6588,451 @@
"title": "Job", "title": "Job",
"description": "A job execution instance with status tracking." "description": "A job execution instance with status tracking."
}, },
"ListBatchesResponse": {
"type": "object",
"properties": {
"object": {
"type": "string",
"const": "list",
"default": "list"
},
"data": {
"type": "array",
"items": {
"type": "object",
"properties": {
"id": {
"type": "string"
},
"completion_window": {
"type": "string"
},
"created_at": {
"type": "integer"
},
"endpoint": {
"type": "string"
},
"input_file_id": {
"type": "string"
},
"object": {
"type": "string",
"const": "batch"
},
"status": {
"type": "string",
"enum": [
"validating",
"failed",
"in_progress",
"finalizing",
"completed",
"expired",
"cancelling",
"cancelled"
]
},
"cancelled_at": {
"type": "integer"
},
"cancelling_at": {
"type": "integer"
},
"completed_at": {
"type": "integer"
},
"error_file_id": {
"type": "string"
},
"errors": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"type": "object",
"properties": {
"code": {
"type": "string"
},
"line": {
"type": "integer"
},
"message": {
"type": "string"
},
"param": {
"type": "string"
}
},
"additionalProperties": false,
"title": "BatchError"
}
},
"object": {
"type": "string"
}
},
"additionalProperties": false,
"title": "Errors"
},
"expired_at": {
"type": "integer"
},
"expires_at": {
"type": "integer"
},
"failed_at": {
"type": "integer"
},
"finalizing_at": {
"type": "integer"
},
"in_progress_at": {
"type": "integer"
},
"metadata": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"model": {
"type": "string"
},
"output_file_id": {
"type": "string"
},
"request_counts": {
"type": "object",
"properties": {
"completed": {
"type": "integer"
},
"failed": {
"type": "integer"
},
"total": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"completed",
"failed",
"total"
],
"title": "BatchRequestCounts"
},
"usage": {
"type": "object",
"properties": {
"input_tokens": {
"type": "integer"
},
"input_tokens_details": {
"type": "object",
"properties": {
"cached_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"cached_tokens"
],
"title": "InputTokensDetails"
},
"output_tokens": {
"type": "integer"
},
"output_tokens_details": {
"type": "object",
"properties": {
"reasoning_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"reasoning_tokens"
],
"title": "OutputTokensDetails"
},
"total_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"input_tokens",
"input_tokens_details",
"output_tokens",
"output_tokens_details",
"total_tokens"
],
"title": "BatchUsage"
}
},
"additionalProperties": false,
"required": [
"id",
"completion_window",
"created_at",
"endpoint",
"input_file_id",
"object",
"status"
],
"title": "Batch"
}
},
"first_id": {
"type": "string"
},
"last_id": {
"type": "string"
},
"has_more": {
"type": "boolean",
"default": false
}
},
"additionalProperties": false,
"required": [
"object",
"data",
"has_more"
],
"title": "ListBatchesResponse",
"description": "Response containing a list of batch objects."
},
"CreateBatchRequest": {
"type": "object",
"properties": {
"input_file_id": {
"type": "string",
"description": "The ID of an uploaded file containing requests for the batch."
},
"endpoint": {
"type": "string",
"description": "The endpoint to be used for all requests in the batch."
},
"completion_window": {
"type": "string",
"const": "24h",
"description": "The time window within which the batch should be processed."
},
"metadata": {
"type": "object",
"additionalProperties": {
"type": "string"
},
"description": "Optional metadata for the batch."
},
"idempotency_key": {
"type": "string",
"description": "Optional idempotency key. When provided, enables idempotent behavior."
}
},
"additionalProperties": false,
"required": [
"input_file_id",
"endpoint",
"completion_window"
],
"title": "CreateBatchRequest"
},
"Batch": {
"type": "object",
"properties": {
"id": {
"type": "string"
},
"completion_window": {
"type": "string"
},
"created_at": {
"type": "integer"
},
"endpoint": {
"type": "string"
},
"input_file_id": {
"type": "string"
},
"object": {
"type": "string",
"const": "batch"
},
"status": {
"type": "string",
"enum": [
"validating",
"failed",
"in_progress",
"finalizing",
"completed",
"expired",
"cancelling",
"cancelled"
]
},
"cancelled_at": {
"type": "integer"
},
"cancelling_at": {
"type": "integer"
},
"completed_at": {
"type": "integer"
},
"error_file_id": {
"type": "string"
},
"errors": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"type": "object",
"properties": {
"code": {
"type": "string"
},
"line": {
"type": "integer"
},
"message": {
"type": "string"
},
"param": {
"type": "string"
}
},
"additionalProperties": false,
"title": "BatchError"
}
},
"object": {
"type": "string"
}
},
"additionalProperties": false,
"title": "Errors"
},
"expired_at": {
"type": "integer"
},
"expires_at": {
"type": "integer"
},
"failed_at": {
"type": "integer"
},
"finalizing_at": {
"type": "integer"
},
"in_progress_at": {
"type": "integer"
},
"metadata": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"model": {
"type": "string"
},
"output_file_id": {
"type": "string"
},
"request_counts": {
"type": "object",
"properties": {
"completed": {
"type": "integer"
},
"failed": {
"type": "integer"
},
"total": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"completed",
"failed",
"total"
],
"title": "BatchRequestCounts"
},
"usage": {
"type": "object",
"properties": {
"input_tokens": {
"type": "integer"
},
"input_tokens_details": {
"type": "object",
"properties": {
"cached_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"cached_tokens"
],
"title": "InputTokensDetails"
},
"output_tokens": {
"type": "integer"
},
"output_tokens_details": {
"type": "object",
"properties": {
"reasoning_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"reasoning_tokens"
],
"title": "OutputTokensDetails"
},
"total_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"input_tokens",
"input_tokens_details",
"output_tokens",
"output_tokens_details",
"total_tokens"
],
"title": "BatchUsage"
}
},
"additionalProperties": false,
"required": [
"id",
"completion_window",
"created_at",
"endpoint",
"input_file_id",
"object",
"status"
],
"title": "Batch"
},
"Order": { "Order": {
"type": "string", "type": "string",
"enum": [ "enum": [
@ -8527,29 +9158,14 @@
"OpenAIResponseInput": { "OpenAIResponseInput": {
"oneOf": [ "oneOf": [
{ {
"$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" "$ref": "#/components/schemas/OpenAIResponseOutput"
},
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall"
},
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
}, },
{ {
"$ref": "#/components/schemas/OpenAIResponseInputFunctionToolCallOutput" "$ref": "#/components/schemas/OpenAIResponseInputFunctionToolCallOutput"
}, },
{
"$ref": "#/components/schemas/OpenAIResponseMCPApprovalRequest"
},
{ {
"$ref": "#/components/schemas/OpenAIResponseMCPApprovalResponse" "$ref": "#/components/schemas/OpenAIResponseMCPApprovalResponse"
}, },
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPCall"
},
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools"
},
{ {
"$ref": "#/components/schemas/OpenAIResponseMessage" "$ref": "#/components/schemas/OpenAIResponseMessage"
} }
@ -8592,16 +9208,53 @@
}, },
{ {
"$ref": "#/components/schemas/OpenAIResponseInputMessageContentImage" "$ref": "#/components/schemas/OpenAIResponseInputMessageContentImage"
},
{
"$ref": "#/components/schemas/OpenAIResponseInputMessageContentFile"
} }
], ],
"discriminator": { "discriminator": {
"propertyName": "type", "propertyName": "type",
"mapping": { "mapping": {
"input_text": "#/components/schemas/OpenAIResponseInputMessageContentText", "input_text": "#/components/schemas/OpenAIResponseInputMessageContentText",
"input_image": "#/components/schemas/OpenAIResponseInputMessageContentImage" "input_image": "#/components/schemas/OpenAIResponseInputMessageContentImage",
"input_file": "#/components/schemas/OpenAIResponseInputMessageContentFile"
} }
} }
}, },
"OpenAIResponseInputMessageContentFile": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "input_file",
"default": "input_file",
"description": "The type of the input item. Always `input_file`."
},
"file_data": {
"type": "string",
"description": "The data of the file to be sent to the model."
},
"file_id": {
"type": "string",
"description": "(Optional) The ID of the file to be sent to the model."
},
"file_url": {
"type": "string",
"description": "The URL of the file to be sent to the model."
},
"filename": {
"type": "string",
"description": "The name of the file to be sent to the model."
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "OpenAIResponseInputMessageContentFile",
"description": "File content for input messages in OpenAI response format."
},
"OpenAIResponseInputMessageContentImage": { "OpenAIResponseInputMessageContentImage": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -8629,6 +9282,10 @@
"default": "input_image", "default": "input_image",
"description": "Content type identifier, always \"input_image\"" "description": "Content type identifier, always \"input_image\""
}, },
"file_id": {
"type": "string",
"description": "(Optional) The ID of the file to be sent to the model."
},
"image_url": { "image_url": {
"type": "string", "type": "string",
"description": "(Optional) URL of the image content" "description": "(Optional) URL of the image content"
@ -8992,6 +9649,10 @@
"type": "string", "type": "string",
"description": "(Optional) ID of the previous response in a conversation" "description": "(Optional) ID of the previous response in a conversation"
}, },
"prompt": {
"$ref": "#/components/schemas/OpenAIResponsePrompt",
"description": "(Optional) Reference to a prompt template and its variables."
},
"status": { "status": {
"type": "string", "type": "string",
"description": "Current status of the response generation" "description": "Current status of the response generation"
@ -9416,6 +10077,32 @@
"title": "OpenAIResponseOutputMessageWebSearchToolCall", "title": "OpenAIResponseOutputMessageWebSearchToolCall",
"description": "Web search tool call output message for OpenAI responses." "description": "Web search tool call output message for OpenAI responses."
}, },
"OpenAIResponsePrompt": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "Unique identifier of the prompt template"
},
"variables": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/OpenAIResponseInputMessageContent"
},
"description": "Dictionary of variable names to OpenAIResponseInputMessageContent structure for template substitution. The substitution values can either be strings, or other Response input types like images or files."
},
"version": {
"type": "string",
"description": "Version number of the prompt to use (defaults to latest if not specified)"
}
},
"additionalProperties": false,
"required": [
"id"
],
"title": "OpenAIResponsePrompt",
"description": "OpenAI compatible Prompt object that is used in OpenAI responses."
},
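A sketch of referencing a stored prompt template from a Responses call. The prompt id, version, and variable names are invented placeholders, and the client must be recent enough to expose the prompt parameter.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")  # placeholders

response = client.responses.create(
    model="llama3.2:3b",  # placeholder model id
    prompt={
        "id": "pmpt_customer_support",  # unique identifier of the prompt template (placeholder)
        "version": "2",                  # defaults to the latest version when omitted
        "variables": {
            # values may be plain strings or other input content parts (images, files)
            "customer_name": "Ada",
            "policy_doc": {"type": "input_file", "file_id": "file-abc123"},
        },
    },
)
print(response.output_text)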
"OpenAIResponseText": { "OpenAIResponseText": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -9786,6 +10473,10 @@
"type": "string", "type": "string",
"description": "The underlying LLM used for completions." "description": "The underlying LLM used for completions."
}, },
"prompt": {
"$ref": "#/components/schemas/OpenAIResponsePrompt",
"description": "(Optional) Prompt object with ID, version, and variables."
},
"instructions": { "instructions": {
"type": "string" "type": "string"
}, },
@ -9874,6 +10565,10 @@
"type": "string", "type": "string",
"description": "(Optional) ID of the previous response in a conversation" "description": "(Optional) ID of the previous response in a conversation"
}, },
"prompt": {
"$ref": "#/components/schemas/OpenAIResponsePrompt",
"description": "(Optional) Reference to a prompt template and its variables."
},
"status": { "status": {
"type": "string", "type": "string",
"description": "Current status of the response generation" "description": "Current status of the response generation"
@ -13442,6 +14137,11 @@
"description": "APIs for creating and interacting with agentic systems.\n\n## Deprecated APIs\n\n> **⚠️ DEPRECATED**: These APIs are provided for migration reference and will be removed in future versions. Not recommended for new projects.\n\n### Migration Guidance\n\nIf you are using deprecated versions of the Agents or Responses APIs, please migrate to:\n\n- **Responses API**: Use the stable v1 Responses API endpoints\n", "description": "APIs for creating and interacting with agentic systems.\n\n## Deprecated APIs\n\n> **⚠️ DEPRECATED**: These APIs are provided for migration reference and will be removed in future versions. Not recommended for new projects.\n\n### Migration Guidance\n\nIf you are using deprecated versions of the Agents or Responses APIs, please migrate to:\n\n- **Responses API**: Use the stable v1 Responses API endpoints\n",
"x-displayName": "Agents" "x-displayName": "Agents"
}, },
{
"name": "Batches",
"description": "The API is designed to allow use of openai client libraries for seamless integration.\n\nThis API provides the following extensions:\n - idempotent batch creation\n\nNote: This API is currently under active development and may undergo changes.",
"x-displayName": "The Batches API enables efficient processing of multiple requests in a single operation, particularly useful for processing large datasets, batch evaluation workflows, and cost-effective inference at scale."
},
    {
      "name": "Benchmarks",
      "description": ""
@ -13492,6 +14192,7 @@
      "name": "Operations",
      "tags": [
        "Agents",
        "Batches",
        "Benchmarks",
        "DatasetIO",
        "Datasets",


@ -1012,6 +1012,141 @@ paths:
          schema:
            type: string
      deprecated: true
/v1/openai/v1/batches:
get:
responses:
'200':
description: A list of batch objects.
content:
application/json:
schema:
$ref: '#/components/schemas/ListBatchesResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Batches
summary: List all batches for the current user.
description: List all batches for the current user.
parameters:
- name: after
in: query
description: >-
A cursor for pagination; returns batches after this batch ID.
required: false
schema:
type: string
- name: limit
in: query
description: >-
Number of batches to return (default 20, max 100).
required: true
schema:
type: integer
deprecated: true
post:
responses:
'200':
description: The created batch object.
content:
application/json:
schema:
$ref: '#/components/schemas/Batch'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Batches
summary: >-
Create a new batch for processing multiple API requests.
description: >-
Create a new batch for processing multiple API requests.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/CreateBatchRequest'
required: true
deprecated: true
/v1/openai/v1/batches/{batch_id}:
get:
responses:
'200':
description: The batch object.
content:
application/json:
schema:
$ref: '#/components/schemas/Batch'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Batches
summary: >-
Retrieve information about a specific batch.
description: >-
Retrieve information about a specific batch.
parameters:
- name: batch_id
in: path
description: The ID of the batch to retrieve.
required: true
schema:
type: string
deprecated: true
/v1/openai/v1/batches/{batch_id}/cancel:
post:
responses:
'200':
description: The updated batch object.
content:
application/json:
schema:
$ref: '#/components/schemas/Batch'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Batches
summary: Cancel a batch that is in progress.
description: Cancel a batch that is in progress.
parameters:
- name: batch_id
in: path
description: The ID of the batch to cancel.
required: true
schema:
type: string
deprecated: true
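These four operations map directly onto the stock openai Python client, which the Batches description above explicitly targets. A rough sketch, assuming a Llama Stack server reachable at the placeholder base URL:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")  # placeholders

batch = client.batches.create(
    input_file_id="file-abc123",        # an uploaded JSONL file of requests (placeholder id)
    endpoint="/v1/chat/completions",    # endpoint used for every request in the batch
    completion_window="24h",            # only "24h" is accepted per the schema below
)
print(client.batches.retrieve(batch.id).status)      # e.g. "validating" or "in_progress"
print([b.id for b in client.batches.list(limit=20)]) # most recent batches first
client.batches.cancel(batch.id)                      # moves the batch toward "cancelling"/"cancelled"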
  /v1/openai/v1/chat/completions:
    get:
      responses:
@ -2862,7 +2997,6 @@ components:
description: The sampling strategy. description: The sampling strategy.
max_tokens: max_tokens:
type: integer type: integer
default: 0
description: >- description: >-
The maximum number of tokens that can be generated in the completion. The maximum number of tokens that can be generated in the completion.
The token count of your prompt plus max_tokens cannot exceed the model's The token count of your prompt plus max_tokens cannot exceed the model's
@ -3253,7 +3387,7 @@ components:
description: Type of the step in an agent turn. description: Type of the step in an agent turn.
const: memory_retrieval const: memory_retrieval
default: memory_retrieval default: memory_retrieval
vector_db_ids: vector_store_ids:
type: string type: string
description: >- description: >-
The IDs of the vector databases to retrieve context from. The IDs of the vector databases to retrieve context from.
@ -3266,7 +3400,7 @@ components:
- turn_id - turn_id
- step_id - step_id
- step_type - step_type
- vector_db_ids - vector_store_ids
- inserted_context - inserted_context
title: MemoryRetrievalStep title: MemoryRetrievalStep
description: >- description: >-
@ -4737,6 +4871,331 @@ components:
title: Job title: Job
description: >- description: >-
A job execution instance with status tracking. A job execution instance with status tracking.
ListBatchesResponse:
type: object
properties:
object:
type: string
const: list
default: list
data:
type: array
items:
type: object
properties:
id:
type: string
completion_window:
type: string
created_at:
type: integer
endpoint:
type: string
input_file_id:
type: string
object:
type: string
const: batch
status:
type: string
enum:
- validating
- failed
- in_progress
- finalizing
- completed
- expired
- cancelling
- cancelled
cancelled_at:
type: integer
cancelling_at:
type: integer
completed_at:
type: integer
error_file_id:
type: string
errors:
type: object
properties:
data:
type: array
items:
type: object
properties:
code:
type: string
line:
type: integer
message:
type: string
param:
type: string
additionalProperties: false
title: BatchError
object:
type: string
additionalProperties: false
title: Errors
expired_at:
type: integer
expires_at:
type: integer
failed_at:
type: integer
finalizing_at:
type: integer
in_progress_at:
type: integer
metadata:
type: object
additionalProperties:
type: string
model:
type: string
output_file_id:
type: string
request_counts:
type: object
properties:
completed:
type: integer
failed:
type: integer
total:
type: integer
additionalProperties: false
required:
- completed
- failed
- total
title: BatchRequestCounts
usage:
type: object
properties:
input_tokens:
type: integer
input_tokens_details:
type: object
properties:
cached_tokens:
type: integer
additionalProperties: false
required:
- cached_tokens
title: InputTokensDetails
output_tokens:
type: integer
output_tokens_details:
type: object
properties:
reasoning_tokens:
type: integer
additionalProperties: false
required:
- reasoning_tokens
title: OutputTokensDetails
total_tokens:
type: integer
additionalProperties: false
required:
- input_tokens
- input_tokens_details
- output_tokens
- output_tokens_details
- total_tokens
title: BatchUsage
additionalProperties: false
required:
- id
- completion_window
- created_at
- endpoint
- input_file_id
- object
- status
title: Batch
first_id:
type: string
last_id:
type: string
has_more:
type: boolean
default: false
additionalProperties: false
required:
- object
- data
- has_more
title: ListBatchesResponse
description: >-
Response containing a list of batch objects.
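first_id, last_id, and has_more support cursor pagination together with the after query parameter; a sketch of walking every page (placeholder base URL and key):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")  # placeholders

after = None
while True:
    page = client.batches.list(limit=100, **({"after": after} if after else {}))
    for batch in page.data:
        print(batch.id, batch.status)
    if not page.has_more:
        break
    after = page.data[-1].id  # the cursor is the last batch id on the page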
CreateBatchRequest:
type: object
properties:
input_file_id:
type: string
description: >-
The ID of an uploaded file containing requests for the batch.
endpoint:
type: string
description: >-
The endpoint to be used for all requests in the batch.
completion_window:
type: string
const: 24h
description: >-
The time window within which the batch should be processed.
metadata:
type: object
additionalProperties:
type: string
description: Optional metadata for the batch.
idempotency_key:
type: string
description: >-
Optional idempotency key. When provided, enables idempotent behavior.
additionalProperties: false
required:
- input_file_id
- endpoint
- completion_window
title: CreateBatchRequest
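A sketch of preparing the uploaded input file and submitting a CreateBatchRequest through the openai client. The JSONL line format (custom_id/method/url/body) follows the OpenAI batch convention and is an assumption here; the model id is a placeholder, and idempotency_key is passed via extra_body because it is a Llama Stack extension rather than a standard client argument.

import json
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")  # placeholders

# One request per line; the line layout follows the OpenAI batch convention (assumption).
with open("batch_input.jsonl", "w") as f:
    f.write(json.dumps({
        "custom_id": "req-1",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "llama3.2:3b", "messages": [{"role": "user", "content": "Hello"}]},
    }) + "\n")

input_file = client.files.create(file=open("batch_input.jsonl", "rb"), purpose="batch")

batch = client.batches.create(
    input_file_id=input_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"project": "nightly-eval"},                        # optional, string values only
    extra_body={"idempotency_key": "nightly-eval-2025-10-31"},   # Llama Stack extension
)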
Batch:
type: object
properties:
id:
type: string
completion_window:
type: string
created_at:
type: integer
endpoint:
type: string
input_file_id:
type: string
object:
type: string
const: batch
status:
type: string
enum:
- validating
- failed
- in_progress
- finalizing
- completed
- expired
- cancelling
- cancelled
cancelled_at:
type: integer
cancelling_at:
type: integer
completed_at:
type: integer
error_file_id:
type: string
errors:
type: object
properties:
data:
type: array
items:
type: object
properties:
code:
type: string
line:
type: integer
message:
type: string
param:
type: string
additionalProperties: false
title: BatchError
object:
type: string
additionalProperties: false
title: Errors
expired_at:
type: integer
expires_at:
type: integer
failed_at:
type: integer
finalizing_at:
type: integer
in_progress_at:
type: integer
metadata:
type: object
additionalProperties:
type: string
model:
type: string
output_file_id:
type: string
request_counts:
type: object
properties:
completed:
type: integer
failed:
type: integer
total:
type: integer
additionalProperties: false
required:
- completed
- failed
- total
title: BatchRequestCounts
usage:
type: object
properties:
input_tokens:
type: integer
input_tokens_details:
type: object
properties:
cached_tokens:
type: integer
additionalProperties: false
required:
- cached_tokens
title: InputTokensDetails
output_tokens:
type: integer
output_tokens_details:
type: object
properties:
reasoning_tokens:
type: integer
additionalProperties: false
required:
- reasoning_tokens
title: OutputTokensDetails
total_tokens:
type: integer
additionalProperties: false
required:
- input_tokens
- input_tokens_details
- output_tokens
- output_tokens_details
- total_tokens
title: BatchUsage
additionalProperties: false
required:
- id
- completion_window
- created_at
- endpoint
- input_file_id
- object
- status
title: Batch
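The status enum above defines the batch lifecycle; a sketch of polling until a terminal state and then reading the output and error files (placeholder id and base URL):

import time
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")  # placeholders

terminal = {"completed", "failed", "expired", "cancelled"}
batch = client.batches.retrieve("batch_123")  # placeholder id
while batch.status not in terminal:
    time.sleep(30)
    batch = client.batches.retrieve(batch.id)

if batch.output_file_id:
    print(client.files.content(batch.output_file_id).text)  # one JSON result per line
if batch.error_file_id:
    print(client.files.content(batch.error_file_id).text)   # per-request errors
print(batch.request_counts)                                  # completed / failed / total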
    Order:
      type: string
      enum:
@ -6370,14 +6829,9 @@ components:
Error details for failed OpenAI response requests. Error details for failed OpenAI response requests.
OpenAIResponseInput: OpenAIResponseInput:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' - $ref: '#/components/schemas/OpenAIResponseOutput'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
- $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput' - $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse' - $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
- $ref: '#/components/schemas/OpenAIResponseMessage' - $ref: '#/components/schemas/OpenAIResponseMessage'
"OpenAIResponseInputFunctionToolCallOutput": "OpenAIResponseInputFunctionToolCallOutput":
type: object type: object
@ -6408,11 +6862,44 @@ components:
      oneOf:
        - $ref: '#/components/schemas/OpenAIResponseInputMessageContentText'
        - $ref: '#/components/schemas/OpenAIResponseInputMessageContentImage'
        - $ref: '#/components/schemas/OpenAIResponseInputMessageContentFile'
      discriminator:
        propertyName: type
        mapping:
          input_text: '#/components/schemas/OpenAIResponseInputMessageContentText'
          input_image: '#/components/schemas/OpenAIResponseInputMessageContentImage'
input_file: '#/components/schemas/OpenAIResponseInputMessageContentFile'
OpenAIResponseInputMessageContentFile:
type: object
properties:
type:
type: string
const: input_file
default: input_file
description: >-
The type of the input item. Always `input_file`.
file_data:
type: string
description: >-
The data of the file to be sent to the model.
file_id:
type: string
description: >-
(Optional) The ID of the file to be sent to the model.
file_url:
type: string
description: >-
The URL of the file to be sent to the model.
filename:
type: string
description: >-
The name of the file to be sent to the model.
additionalProperties: false
required:
- type
title: OpenAIResponseInputMessageContentFile
description: >-
File content for input messages in OpenAI response format.
    OpenAIResponseInputMessageContentImage:
      type: object
      properties:
@ -6433,6 +6920,10 @@
          default: input_image
          description: >-
            Content type identifier, always "input_image"
        file_id:
          type: string
          description: >-
            (Optional) The ID of the file to be sent to the model.
        image_url:
          type: string
          description: (Optional) URL of the image content
@ -6703,6 +7194,10 @@
          type: string
          description: >-
            (Optional) ID of the previous response in a conversation
        prompt:
          $ref: '#/components/schemas/OpenAIResponsePrompt'
          description: >-
            (Optional) Reference to a prompt template and its variables.
        status:
          type: string
          description: >-
@ -7042,6 +7537,30 @@
        OpenAIResponseOutputMessageWebSearchToolCall
      description: >-
        Web search tool call output message for OpenAI responses.
OpenAIResponsePrompt:
type: object
properties:
id:
type: string
description: Unique identifier of the prompt template
variables:
type: object
additionalProperties:
$ref: '#/components/schemas/OpenAIResponseInputMessageContent'
description: >-
Dictionary of variable names to OpenAIResponseInputMessageContent structure
for template substitution. The substitution values can either be strings,
or other Response input types like images or files.
version:
type: string
description: >-
Version number of the prompt to use (defaults to latest if not specified)
additionalProperties: false
required:
- id
title: OpenAIResponsePrompt
description: >-
OpenAI compatible Prompt object that is used in OpenAI responses.
    OpenAIResponseText:
      type: object
      properties:
@ -7299,6 +7818,10 @@
        model:
          type: string
          description: The underlying LLM used for completions.
prompt:
$ref: '#/components/schemas/OpenAIResponsePrompt'
description: >-
(Optional) Prompt object with ID, version, and variables.
        instructions:
          type: string
        previous_response_id:
@ -7376,6 +7899,10 @@
          type: string
          description: >-
            (Optional) ID of the previous response in a conversation
prompt:
$ref: '#/components/schemas/OpenAIResponsePrompt'
description: >-
(Optional) Reference to a prompt template and its variables.
        status:
          type: string
          description: >-
@ -10196,6 +10723,19 @@ tags:
      - **Responses API**: Use the stable v1 Responses API endpoints
    x-displayName: Agents
- name: Batches
description: >-
The API is designed to allow use of openai client libraries for seamless integration.
This API provides the following extensions:
- idempotent batch creation
Note: This API is currently under active development and may undergo changes.
x-displayName: >-
The Batches API enables efficient processing of multiple requests in a single
operation, particularly useful for processing large datasets, batch evaluation
workflows, and cost-effective inference at scale.
  - name: Benchmarks
    description: ''
  - name: DatasetIO
@ -10241,6 +10781,7 @@ x-tagGroups:
  - name: Operations
    tags:
      - Agents
- Batches
      - Benchmarks
      - DatasetIO
      - Datasets


@ -2376,7 +2376,6 @@
}, },
"max_tokens": { "max_tokens": {
"type": "integer", "type": "integer",
"default": 0,
"description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length." "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
}, },
"repetition_penalty": { "repetition_penalty": {
@ -2866,7 +2865,7 @@
"const": "memory_retrieval", "const": "memory_retrieval",
"default": "memory_retrieval" "default": "memory_retrieval"
}, },
"vector_db_ids": { "vector_store_ids": {
"type": "string", "type": "string",
"description": "The IDs of the vector databases to retrieve context from." "description": "The IDs of the vector databases to retrieve context from."
}, },
@ -2880,7 +2879,7 @@
"turn_id", "turn_id",
"step_id", "step_id",
"step_type", "step_type",
"vector_db_ids", "vector_store_ids",
"inserted_context" "inserted_context"
], ],
"title": "MemoryRetrievalStep", "title": "MemoryRetrievalStep",


@ -1695,7 +1695,6 @@ components:
description: The sampling strategy. description: The sampling strategy.
max_tokens: max_tokens:
type: integer type: integer
default: 0
description: >- description: >-
The maximum number of tokens that can be generated in the completion. The maximum number of tokens that can be generated in the completion.
The token count of your prompt plus max_tokens cannot exceed the model's The token count of your prompt plus max_tokens cannot exceed the model's
@ -2086,7 +2085,7 @@ components:
description: Type of the step in an agent turn. description: Type of the step in an agent turn.
const: memory_retrieval const: memory_retrieval
default: memory_retrieval default: memory_retrieval
vector_db_ids: vector_store_ids:
type: string type: string
description: >- description: >-
The IDs of the vector databases to retrieve context from. The IDs of the vector databases to retrieve context from.
@ -2099,7 +2098,7 @@ components:
- turn_id - turn_id
- step_id - step_id
- step_type - step_type
- vector_db_ids - vector_store_ids
- inserted_context - inserted_context
title: MemoryRetrievalStep title: MemoryRetrievalStep
description: >- description: >-


@ -40,6 +40,193 @@
    }
  ],
  "paths": {
"/v1/batches": {
"get": {
"responses": {
"200": {
"description": "A list of batch objects.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ListBatchesResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Batches"
],
"summary": "List all batches for the current user.",
"description": "List all batches for the current user.",
"parameters": [
{
"name": "after",
"in": "query",
"description": "A cursor for pagination; returns batches after this batch ID.",
"required": false,
"schema": {
"type": "string"
}
},
{
"name": "limit",
"in": "query",
"description": "Number of batches to return (default 20, max 100).",
"required": true,
"schema": {
"type": "integer"
}
}
],
"deprecated": false
},
"post": {
"responses": {
"200": {
"description": "The created batch object.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Batch"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Batches"
],
"summary": "Create a new batch for processing multiple API requests.",
"description": "Create a new batch for processing multiple API requests.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CreateBatchRequest"
}
}
},
"required": true
},
"deprecated": false
}
},
"/v1/batches/{batch_id}": {
"get": {
"responses": {
"200": {
"description": "The batch object.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Batch"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Batches"
],
"summary": "Retrieve information about a specific batch.",
"description": "Retrieve information about a specific batch.",
"parameters": [
{
"name": "batch_id",
"in": "path",
"description": "The ID of the batch to retrieve.",
"required": true,
"schema": {
"type": "string"
}
}
],
"deprecated": false
}
},
"/v1/batches/{batch_id}/cancel": {
"post": {
"responses": {
"200": {
"description": "The updated batch object.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Batch"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Batches"
],
"summary": "Cancel a batch that is in progress.",
"description": "Cancel a batch that is in progress.",
"parameters": [
{
"name": "batch_id",
"in": "path",
"description": "The ID of the batch to cancel.",
"required": true,
"schema": {
"type": "string"
}
}
],
"deprecated": false
}
},
"/v1/chat/completions": { "/v1/chat/completions": {
"get": { "get": {
"responses": { "responses": {
@ -4005,6 +4192,451 @@
"title": "Error", "title": "Error",
"description": "Error response from the API. Roughly follows RFC 7807." "description": "Error response from the API. Roughly follows RFC 7807."
}, },
"ListBatchesResponse": {
"type": "object",
"properties": {
"object": {
"type": "string",
"const": "list",
"default": "list"
},
"data": {
"type": "array",
"items": {
"type": "object",
"properties": {
"id": {
"type": "string"
},
"completion_window": {
"type": "string"
},
"created_at": {
"type": "integer"
},
"endpoint": {
"type": "string"
},
"input_file_id": {
"type": "string"
},
"object": {
"type": "string",
"const": "batch"
},
"status": {
"type": "string",
"enum": [
"validating",
"failed",
"in_progress",
"finalizing",
"completed",
"expired",
"cancelling",
"cancelled"
]
},
"cancelled_at": {
"type": "integer"
},
"cancelling_at": {
"type": "integer"
},
"completed_at": {
"type": "integer"
},
"error_file_id": {
"type": "string"
},
"errors": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"type": "object",
"properties": {
"code": {
"type": "string"
},
"line": {
"type": "integer"
},
"message": {
"type": "string"
},
"param": {
"type": "string"
}
},
"additionalProperties": false,
"title": "BatchError"
}
},
"object": {
"type": "string"
}
},
"additionalProperties": false,
"title": "Errors"
},
"expired_at": {
"type": "integer"
},
"expires_at": {
"type": "integer"
},
"failed_at": {
"type": "integer"
},
"finalizing_at": {
"type": "integer"
},
"in_progress_at": {
"type": "integer"
},
"metadata": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"model": {
"type": "string"
},
"output_file_id": {
"type": "string"
},
"request_counts": {
"type": "object",
"properties": {
"completed": {
"type": "integer"
},
"failed": {
"type": "integer"
},
"total": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"completed",
"failed",
"total"
],
"title": "BatchRequestCounts"
},
"usage": {
"type": "object",
"properties": {
"input_tokens": {
"type": "integer"
},
"input_tokens_details": {
"type": "object",
"properties": {
"cached_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"cached_tokens"
],
"title": "InputTokensDetails"
},
"output_tokens": {
"type": "integer"
},
"output_tokens_details": {
"type": "object",
"properties": {
"reasoning_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"reasoning_tokens"
],
"title": "OutputTokensDetails"
},
"total_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"input_tokens",
"input_tokens_details",
"output_tokens",
"output_tokens_details",
"total_tokens"
],
"title": "BatchUsage"
}
},
"additionalProperties": false,
"required": [
"id",
"completion_window",
"created_at",
"endpoint",
"input_file_id",
"object",
"status"
],
"title": "Batch"
}
},
"first_id": {
"type": "string"
},
"last_id": {
"type": "string"
},
"has_more": {
"type": "boolean",
"default": false
}
},
"additionalProperties": false,
"required": [
"object",
"data",
"has_more"
],
"title": "ListBatchesResponse",
"description": "Response containing a list of batch objects."
},
"CreateBatchRequest": {
"type": "object",
"properties": {
"input_file_id": {
"type": "string",
"description": "The ID of an uploaded file containing requests for the batch."
},
"endpoint": {
"type": "string",
"description": "The endpoint to be used for all requests in the batch."
},
"completion_window": {
"type": "string",
"const": "24h",
"description": "The time window within which the batch should be processed."
},
"metadata": {
"type": "object",
"additionalProperties": {
"type": "string"
},
"description": "Optional metadata for the batch."
},
"idempotency_key": {
"type": "string",
"description": "Optional idempotency key. When provided, enables idempotent behavior."
}
},
"additionalProperties": false,
"required": [
"input_file_id",
"endpoint",
"completion_window"
],
"title": "CreateBatchRequest"
},
"Batch": {
"type": "object",
"properties": {
"id": {
"type": "string"
},
"completion_window": {
"type": "string"
},
"created_at": {
"type": "integer"
},
"endpoint": {
"type": "string"
},
"input_file_id": {
"type": "string"
},
"object": {
"type": "string",
"const": "batch"
},
"status": {
"type": "string",
"enum": [
"validating",
"failed",
"in_progress",
"finalizing",
"completed",
"expired",
"cancelling",
"cancelled"
]
},
"cancelled_at": {
"type": "integer"
},
"cancelling_at": {
"type": "integer"
},
"completed_at": {
"type": "integer"
},
"error_file_id": {
"type": "string"
},
"errors": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"type": "object",
"properties": {
"code": {
"type": "string"
},
"line": {
"type": "integer"
},
"message": {
"type": "string"
},
"param": {
"type": "string"
}
},
"additionalProperties": false,
"title": "BatchError"
}
},
"object": {
"type": "string"
}
},
"additionalProperties": false,
"title": "Errors"
},
"expired_at": {
"type": "integer"
},
"expires_at": {
"type": "integer"
},
"failed_at": {
"type": "integer"
},
"finalizing_at": {
"type": "integer"
},
"in_progress_at": {
"type": "integer"
},
"metadata": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"model": {
"type": "string"
},
"output_file_id": {
"type": "string"
},
"request_counts": {
"type": "object",
"properties": {
"completed": {
"type": "integer"
},
"failed": {
"type": "integer"
},
"total": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"completed",
"failed",
"total"
],
"title": "BatchRequestCounts"
},
"usage": {
"type": "object",
"properties": {
"input_tokens": {
"type": "integer"
},
"input_tokens_details": {
"type": "object",
"properties": {
"cached_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"cached_tokens"
],
"title": "InputTokensDetails"
},
"output_tokens": {
"type": "integer"
},
"output_tokens_details": {
"type": "object",
"properties": {
"reasoning_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"reasoning_tokens"
],
"title": "OutputTokensDetails"
},
"total_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"input_tokens",
"input_tokens_details",
"output_tokens",
"output_tokens_details",
"total_tokens"
],
"title": "BatchUsage"
}
},
"additionalProperties": false,
"required": [
"id",
"completion_window",
"created_at",
"endpoint",
"input_file_id",
"object",
"status"
],
"title": "Batch"
},
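Besides error_file_id, validation problems surface on the Batch object itself through errors.data (the BatchError entries above); a sketch of printing them after a failed batch, with placeholder id and base URL:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")  # placeholders

batch = client.batches.retrieve("batch_123")  # placeholder id
if batch.status == "failed" and batch.errors and batch.errors.data:
    for err in batch.errors.data:
        # err.line points at the offending line in the input JSONL file
        print(f"line {err.line}: [{err.code}] {err.message} (param={err.param})")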
"Order": { "Order": {
"type": "string", "type": "string",
"enum": [ "enum": [
@ -5696,16 +6328,53 @@
}, },
{ {
"$ref": "#/components/schemas/OpenAIResponseInputMessageContentImage" "$ref": "#/components/schemas/OpenAIResponseInputMessageContentImage"
},
{
"$ref": "#/components/schemas/OpenAIResponseInputMessageContentFile"
} }
], ],
"discriminator": { "discriminator": {
"propertyName": "type", "propertyName": "type",
"mapping": { "mapping": {
"input_text": "#/components/schemas/OpenAIResponseInputMessageContentText", "input_text": "#/components/schemas/OpenAIResponseInputMessageContentText",
"input_image": "#/components/schemas/OpenAIResponseInputMessageContentImage" "input_image": "#/components/schemas/OpenAIResponseInputMessageContentImage",
"input_file": "#/components/schemas/OpenAIResponseInputMessageContentFile"
} }
} }
}, },
"OpenAIResponseInputMessageContentFile": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "input_file",
"default": "input_file",
"description": "The type of the input item. Always `input_file`."
},
"file_data": {
"type": "string",
"description": "The data of the file to be sent to the model."
},
"file_id": {
"type": "string",
"description": "(Optional) The ID of the file to be sent to the model."
},
"file_url": {
"type": "string",
"description": "The URL of the file to be sent to the model."
},
"filename": {
"type": "string",
"description": "The name of the file to be sent to the model."
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "OpenAIResponseInputMessageContentFile",
"description": "File content for input messages in OpenAI response format."
},
"OpenAIResponseInputMessageContentImage": { "OpenAIResponseInputMessageContentImage": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -5733,6 +6402,10 @@
"default": "input_image", "default": "input_image",
"description": "Content type identifier, always \"input_image\"" "description": "Content type identifier, always \"input_image\""
}, },
"file_id": {
"type": "string",
"description": "(Optional) The ID of the file to be sent to the model."
},
"image_url": { "image_url": {
"type": "string", "type": "string",
"description": "(Optional) URL of the image content" "description": "(Optional) URL of the image content"
@ -7305,29 +7978,14 @@
"OpenAIResponseInput": { "OpenAIResponseInput": {
"oneOf": [ "oneOf": [
{ {
"$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" "$ref": "#/components/schemas/OpenAIResponseOutput"
},
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall"
},
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
}, },
{ {
"$ref": "#/components/schemas/OpenAIResponseInputFunctionToolCallOutput" "$ref": "#/components/schemas/OpenAIResponseInputFunctionToolCallOutput"
}, },
{
"$ref": "#/components/schemas/OpenAIResponseMCPApprovalRequest"
},
{ {
"$ref": "#/components/schemas/OpenAIResponseMCPApprovalResponse" "$ref": "#/components/schemas/OpenAIResponseMCPApprovalResponse"
}, },
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPCall"
},
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools"
},
{ {
"$ref": "#/components/schemas/OpenAIResponseMessage" "$ref": "#/components/schemas/OpenAIResponseMessage"
} }
@ -7536,6 +8194,10 @@
"type": "string", "type": "string",
"description": "(Optional) ID of the previous response in a conversation" "description": "(Optional) ID of the previous response in a conversation"
}, },
"prompt": {
"$ref": "#/components/schemas/OpenAIResponsePrompt",
"description": "(Optional) Reference to a prompt template and its variables."
},
"status": { "status": {
"type": "string", "type": "string",
"description": "Current status of the response generation" "description": "Current status of the response generation"
@ -7631,6 +8293,32 @@
} }
} }
}, },
"OpenAIResponsePrompt": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "Unique identifier of the prompt template"
},
"variables": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/OpenAIResponseInputMessageContent"
},
"description": "Dictionary of variable names to OpenAIResponseInputMessageContent structure for template substitution. The substitution values can either be strings, or other Response input types like images or files."
},
"version": {
"type": "string",
"description": "Version number of the prompt to use (defaults to latest if not specified)"
}
},
"additionalProperties": false,
"required": [
"id"
],
"title": "OpenAIResponsePrompt",
"description": "OpenAI compatible Prompt object that is used in OpenAI responses."
},
"OpenAIResponseText": { "OpenAIResponseText": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -8001,6 +8689,10 @@
"type": "string", "type": "string",
"description": "The underlying LLM used for completions." "description": "The underlying LLM used for completions."
}, },
"prompt": {
"$ref": "#/components/schemas/OpenAIResponsePrompt",
"description": "(Optional) Prompt object with ID, version, and variables."
},
"instructions": { "instructions": {
"type": "string" "type": "string"
}, },
@ -8089,6 +8781,10 @@
"type": "string", "type": "string",
"description": "(Optional) ID of the previous response in a conversation" "description": "(Optional) ID of the previous response in a conversation"
}, },
"prompt": {
"$ref": "#/components/schemas/OpenAIResponsePrompt",
"description": "(Optional) Reference to a prompt template and its variables."
},
"status": { "status": {
"type": "string", "type": "string",
"description": "Current status of the response generation" "description": "Current status of the response generation"
@ -11427,7 +12123,7 @@
}, },
"description": "List of documents to index in the RAG system" "description": "List of documents to index in the RAG system"
}, },
"vector_db_id": { "vector_store_id": {
"type": "string", "type": "string",
"description": "ID of the vector database to store the document embeddings" "description": "ID of the vector database to store the document embeddings"
}, },
@ -11439,7 +12135,7 @@
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"documents", "documents",
"vector_db_id", "vector_store_id",
"chunk_size_in_tokens" "chunk_size_in_tokens"
], ],
"title": "InsertRequest" "title": "InsertRequest"
@ -11630,7 +12326,7 @@
"$ref": "#/components/schemas/InterleavedContent", "$ref": "#/components/schemas/InterleavedContent",
"description": "The query content to search for in the indexed documents" "description": "The query content to search for in the indexed documents"
}, },
"vector_db_ids": { "vector_store_ids": {
"type": "array", "type": "array",
"items": { "items": {
"type": "string" "type": "string"
@ -11645,7 +12341,7 @@
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"content", "content",
"vector_db_ids" "vector_store_ids"
], ],
"title": "QueryRequest" "title": "QueryRequest"
}, },
@ -11833,6 +12529,10 @@
"$ref": "#/components/schemas/InterleavedContent", "$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the chunk, which can be interleaved text, images, or other types." "description": "The content of the chunk, which can be interleaved text, images, or other types."
}, },
"chunk_id": {
"type": "string",
"description": "Unique identifier for the chunk. Must be provided explicitly."
},
"metadata": { "metadata": {
"type": "object", "type": "object",
"additionalProperties": { "additionalProperties": {
@ -11866,10 +12566,6 @@
}, },
"description": "Optional embedding for the chunk. If not provided, it will be computed later." "description": "Optional embedding for the chunk. If not provided, it will be computed later."
}, },
"stored_chunk_id": {
"type": "string",
"description": "The chunk ID that is stored in the vector database. Used for backend functionality."
},
"chunk_metadata": { "chunk_metadata": {
"$ref": "#/components/schemas/ChunkMetadata", "$ref": "#/components/schemas/ChunkMetadata",
"description": "Metadata for the chunk that will NOT be used in the context during inference. The `chunk_metadata` is required backend functionality." "description": "Metadata for the chunk that will NOT be used in the context during inference. The `chunk_metadata` is required backend functionality."
@ -11878,6 +12574,7 @@
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"content", "content",
"chunk_id",
"metadata" "metadata"
], ],
"title": "Chunk", "title": "Chunk",
@ -11938,7 +12635,7 @@
"InsertChunksRequest": { "InsertChunksRequest": {
"type": "object", "type": "object",
"properties": { "properties": {
"vector_db_id": { "vector_store_id": {
"type": "string", "type": "string",
"description": "The identifier of the vector database to insert the chunks into." "description": "The identifier of the vector database to insert the chunks into."
}, },
@ -11956,7 +12653,7 @@
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"vector_db_id", "vector_store_id",
"chunks" "chunks"
], ],
"title": "InsertChunksRequest" "title": "InsertChunksRequest"
@ -11964,7 +12661,7 @@
"QueryChunksRequest": { "QueryChunksRequest": {
"type": "object", "type": "object",
"properties": { "properties": {
"vector_db_id": { "vector_store_id": {
"type": "string", "type": "string",
"description": "The identifier of the vector database to query." "description": "The identifier of the vector database to query."
}, },
@ -12001,7 +12698,7 @@
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"vector_db_id", "vector_store_id",
"query" "query"
], ],
"title": "QueryChunksRequest" "title": "QueryChunksRequest"
@ -13224,6 +13921,11 @@
"description": "APIs for creating and interacting with agentic systems.\n\n## Responses API\n\nThe Responses API provides OpenAI-compatible functionality with enhanced capabilities for dynamic, stateful interactions.\n\n> **✅ STABLE**: This API is production-ready with backward compatibility guarantees. Recommended for production applications.\n\n### ✅ Supported Tools\n\nThe Responses API supports the following tool types:\n\n- **`web_search`**: Search the web for current information and real-time data\n- **`file_search`**: Search through uploaded files and vector stores\n - Supports dynamic `vector_store_ids` per call\n - Compatible with OpenAI file search patterns\n- **`function`**: Call custom functions with JSON schema validation\n- **`mcp_tool`**: Model Context Protocol integration\n\n### ✅ Supported Fields & Features\n\n**Core Capabilities:**\n- **Dynamic Configuration**: Switch models, vector stores, and tools per request without pre-configuration\n- **Conversation Branching**: Use `previous_response_id` to branch conversations and explore different paths\n- **Rich Annotations**: Automatic file citations, URL citations, and container file citations\n- **Status Tracking**: Monitor tool call execution status and handle failures gracefully\n\n### 🚧 Work in Progress\n\n- Full real-time response streaming support\n- `tool_choice` parameter\n- `max_tool_calls` parameter\n- Built-in tools (code interpreter, containers API)\n- Safety & guardrails\n- `reasoning` capabilities\n- `service_tier`\n- `logprobs`\n- `max_output_tokens`\n- `metadata` handling\n- `instructions`\n- `incomplete_details`\n- `background`", "description": "APIs for creating and interacting with agentic systems.\n\n## Responses API\n\nThe Responses API provides OpenAI-compatible functionality with enhanced capabilities for dynamic, stateful interactions.\n\n> **✅ STABLE**: This API is production-ready with backward compatibility guarantees. Recommended for production applications.\n\n### ✅ Supported Tools\n\nThe Responses API supports the following tool types:\n\n- **`web_search`**: Search the web for current information and real-time data\n- **`file_search`**: Search through uploaded files and vector stores\n - Supports dynamic `vector_store_ids` per call\n - Compatible with OpenAI file search patterns\n- **`function`**: Call custom functions with JSON schema validation\n- **`mcp_tool`**: Model Context Protocol integration\n\n### ✅ Supported Fields & Features\n\n**Core Capabilities:**\n- **Dynamic Configuration**: Switch models, vector stores, and tools per request without pre-configuration\n- **Conversation Branching**: Use `previous_response_id` to branch conversations and explore different paths\n- **Rich Annotations**: Automatic file citations, URL citations, and container file citations\n- **Status Tracking**: Monitor tool call execution status and handle failures gracefully\n\n### 🚧 Work in Progress\n\n- Full real-time response streaming support\n- `tool_choice` parameter\n- `max_tool_calls` parameter\n- Built-in tools (code interpreter, containers API)\n- Safety & guardrails\n- `reasoning` capabilities\n- `service_tier`\n- `logprobs`\n- `max_output_tokens`\n- `metadata` handling\n- `instructions`\n- `incomplete_details`\n- `background`",
"x-displayName": "Agents" "x-displayName": "Agents"
}, },
{
"name": "Batches",
"description": "The API is designed to allow use of openai client libraries for seamless integration.\n\nThis API provides the following extensions:\n - idempotent batch creation\n\nNote: This API is currently under active development and may undergo changes.",
"x-displayName": "The Batches API enables efficient processing of multiple requests in a single operation, particularly useful for processing large datasets, batch evaluation workflows, and cost-effective inference at scale."
},
    {
      "name": "Conversations",
      "description": "Protocol for conversation management operations.",
@ -13297,6 +13999,7 @@
      "name": "Operations",
      "tags": [
        "Agents",
"Batches",
"Conversations", "Conversations",
"Files", "Files",
"Inference", "Inference",


@ -12,6 +12,141 @@ info:
servers:
  - url: http://any-hosted-llama-stack.com
paths:
/v1/batches:
get:
responses:
'200':
description: A list of batch objects.
content:
application/json:
schema:
$ref: '#/components/schemas/ListBatchesResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Batches
summary: List all batches for the current user.
description: List all batches for the current user.
parameters:
- name: after
in: query
description: >-
A cursor for pagination; returns batches after this batch ID.
required: false
schema:
type: string
- name: limit
in: query
description: >-
Number of batches to return (default 20, max 100).
required: true
schema:
type: integer
deprecated: false
post:
responses:
'200':
description: The created batch object.
content:
application/json:
schema:
$ref: '#/components/schemas/Batch'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Batches
summary: >-
Create a new batch for processing multiple API requests.
description: >-
Create a new batch for processing multiple API requests.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/CreateBatchRequest'
required: true
deprecated: false
/v1/batches/{batch_id}:
get:
responses:
'200':
description: The batch object.
content:
application/json:
schema:
$ref: '#/components/schemas/Batch'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Batches
summary: >-
Retrieve information about a specific batch.
description: >-
Retrieve information about a specific batch.
parameters:
- name: batch_id
in: path
description: The ID of the batch to retrieve.
required: true
schema:
type: string
deprecated: false
/v1/batches/{batch_id}/cancel:
post:
responses:
'200':
description: The updated batch object.
content:
application/json:
schema:
$ref: '#/components/schemas/Batch'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Batches
summary: Cancel a batch that is in progress.
description: Cancel a batch that is in progress.
parameters:
- name: batch_id
in: path
description: The ID of the batch to cancel.
required: true
schema:
type: string
deprecated: false
  /v1/chat/completions:
    get:
      responses:
@ -2999,6 +3134,331 @@ components:
      title: Error
      description: >-
        Error response from the API. Roughly follows RFC 7807.
ListBatchesResponse:
type: object
properties:
object:
type: string
const: list
default: list
data:
type: array
items:
type: object
properties:
id:
type: string
completion_window:
type: string
created_at:
type: integer
endpoint:
type: string
input_file_id:
type: string
object:
type: string
const: batch
status:
type: string
enum:
- validating
- failed
- in_progress
- finalizing
- completed
- expired
- cancelling
- cancelled
cancelled_at:
type: integer
cancelling_at:
type: integer
completed_at:
type: integer
error_file_id:
type: string
errors:
type: object
properties:
data:
type: array
items:
type: object
properties:
code:
type: string
line:
type: integer
message:
type: string
param:
type: string
additionalProperties: false
title: BatchError
object:
type: string
additionalProperties: false
title: Errors
expired_at:
type: integer
expires_at:
type: integer
failed_at:
type: integer
finalizing_at:
type: integer
in_progress_at:
type: integer
metadata:
type: object
additionalProperties:
type: string
model:
type: string
output_file_id:
type: string
request_counts:
type: object
properties:
completed:
type: integer
failed:
type: integer
total:
type: integer
additionalProperties: false
required:
- completed
- failed
- total
title: BatchRequestCounts
usage:
type: object
properties:
input_tokens:
type: integer
input_tokens_details:
type: object
properties:
cached_tokens:
type: integer
additionalProperties: false
required:
- cached_tokens
title: InputTokensDetails
output_tokens:
type: integer
output_tokens_details:
type: object
properties:
reasoning_tokens:
type: integer
additionalProperties: false
required:
- reasoning_tokens
title: OutputTokensDetails
total_tokens:
type: integer
additionalProperties: false
required:
- input_tokens
- input_tokens_details
- output_tokens
- output_tokens_details
- total_tokens
title: BatchUsage
additionalProperties: false
required:
- id
- completion_window
- created_at
- endpoint
- input_file_id
- object
- status
title: Batch
first_id:
type: string
last_id:
type: string
has_more:
type: boolean
default: false
additionalProperties: false
required:
- object
- data
- has_more
title: ListBatchesResponse
description: >-
Response containing a list of batch objects.
CreateBatchRequest:
type: object
properties:
input_file_id:
type: string
description: >-
The ID of an uploaded file containing requests for the batch.
endpoint:
type: string
description: >-
The endpoint to be used for all requests in the batch.
completion_window:
type: string
const: 24h
description: >-
The time window within which the batch should be processed.
metadata:
type: object
additionalProperties:
type: string
description: Optional metadata for the batch.
idempotency_key:
type: string
description: >-
Optional idempotency key. When provided, enables idempotent behavior.
additionalProperties: false
required:
- input_file_id
- endpoint
- completion_window
title: CreateBatchRequest
Batch:
type: object
properties:
id:
type: string
completion_window:
type: string
created_at:
type: integer
endpoint:
type: string
input_file_id:
type: string
object:
type: string
const: batch
status:
type: string
enum:
- validating
- failed
- in_progress
- finalizing
- completed
- expired
- cancelling
- cancelled
cancelled_at:
type: integer
cancelling_at:
type: integer
completed_at:
type: integer
error_file_id:
type: string
errors:
type: object
properties:
data:
type: array
items:
type: object
properties:
code:
type: string
line:
type: integer
message:
type: string
param:
type: string
additionalProperties: false
title: BatchError
object:
type: string
additionalProperties: false
title: Errors
expired_at:
type: integer
expires_at:
type: integer
failed_at:
type: integer
finalizing_at:
type: integer
in_progress_at:
type: integer
metadata:
type: object
additionalProperties:
type: string
model:
type: string
output_file_id:
type: string
request_counts:
type: object
properties:
completed:
type: integer
failed:
type: integer
total:
type: integer
additionalProperties: false
required:
- completed
- failed
- total
title: BatchRequestCounts
usage:
type: object
properties:
input_tokens:
type: integer
input_tokens_details:
type: object
properties:
cached_tokens:
type: integer
additionalProperties: false
required:
- cached_tokens
title: InputTokensDetails
output_tokens:
type: integer
output_tokens_details:
type: object
properties:
reasoning_tokens:
type: integer
additionalProperties: false
required:
- reasoning_tokens
title: OutputTokensDetails
total_tokens:
type: integer
additionalProperties: false
required:
- input_tokens
- input_tokens_details
- output_tokens
- output_tokens_details
- total_tokens
title: BatchUsage
additionalProperties: false
required:
- id
- completion_window
- created_at
- endpoint
- input_file_id
- object
- status
title: Batch
    Order:
      type: string
      enum:
@ -4261,11 +4721,44 @@ components:
      oneOf:
        - $ref: '#/components/schemas/OpenAIResponseInputMessageContentText'
        - $ref: '#/components/schemas/OpenAIResponseInputMessageContentImage'
        - $ref: '#/components/schemas/OpenAIResponseInputMessageContentFile'
      discriminator:
        propertyName: type
        mapping:
          input_text: '#/components/schemas/OpenAIResponseInputMessageContentText'
          input_image: '#/components/schemas/OpenAIResponseInputMessageContentImage'
input_file: '#/components/schemas/OpenAIResponseInputMessageContentFile'
OpenAIResponseInputMessageContentFile:
type: object
properties:
type:
type: string
const: input_file
default: input_file
description: >-
The type of the input item. Always `input_file`.
file_data:
type: string
description: >-
The data of the file to be sent to the model.
file_id:
type: string
description: >-
(Optional) The ID of the file to be sent to the model.
file_url:
type: string
description: >-
The URL of the file to be sent to the model.
filename:
type: string
description: >-
The name of the file to be sent to the model.
additionalProperties: false
required:
- type
title: OpenAIResponseInputMessageContentFile
description: >-
File content for input messages in OpenAI response format.
    OpenAIResponseInputMessageContentImage:
      type: object
      properties:
@ -4286,6 +4779,10 @@
          default: input_image
          description: >-
            Content type identifier, always "input_image"
        file_id:
          type: string
          description: >-
            (Optional) The ID of the file to be sent to the model.
        image_url:
          type: string
          description: (Optional) URL of the image content
@ -5522,14 +6019,9 @@ components:
Error details for failed OpenAI response requests. Error details for failed OpenAI response requests.
OpenAIResponseInput: OpenAIResponseInput:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' - $ref: '#/components/schemas/OpenAIResponseOutput'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
- $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput' - $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse' - $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
        - $ref: '#/components/schemas/OpenAIResponseMessage'
    OpenAIResponseInputToolFileSearch:
      type: object
@ -5685,6 +6177,10 @@ components:
          type: string
          description: >-
            (Optional) ID of the previous response in a conversation
        prompt:
          $ref: '#/components/schemas/OpenAIResponsePrompt'
          description: >-
            (Optional) Reference to a prompt template and its variables.
        status:
          type: string
          description: >-
@ -5758,6 +6254,30 @@
          mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
          mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
          mcp_approval_request: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
OpenAIResponsePrompt:
type: object
properties:
id:
type: string
description: Unique identifier of the prompt template
variables:
type: object
additionalProperties:
$ref: '#/components/schemas/OpenAIResponseInputMessageContent'
description: >-
Dictionary of variable names to OpenAIResponseInputMessageContent structure
for template substitution. The substitution values can either be strings,
or other Response input types like images or files.
version:
type: string
description: >-
Version number of the prompt to use (defaults to latest if not specified)
additionalProperties: false
required:
- id
title: OpenAIResponsePrompt
description: >-
OpenAI compatible Prompt object that is used in OpenAI responses.
OpenAIResponseText: OpenAIResponseText:
type: object type: object
properties: properties:
@ -6015,6 +6535,10 @@ components:
model: model:
type: string type: string
description: The underlying LLM used for completions. description: The underlying LLM used for completions.
prompt:
$ref: '#/components/schemas/OpenAIResponsePrompt'
description: >-
(Optional) Prompt object with ID, version, and variables.
instructions: instructions:
type: string type: string
previous_response_id: previous_response_id:
@ -6092,6 +6616,10 @@ components:
type: string type: string
description: >- description: >-
(Optional) ID of the previous response in a conversation (Optional) ID of the previous response in a conversation
prompt:
$ref: '#/components/schemas/OpenAIResponsePrompt'
description: >-
(Optional) Reference to a prompt template and its variables.
status: status:
type: string type: string
description: >- description: >-
@ -8654,7 +9182,7 @@ components:
$ref: '#/components/schemas/RAGDocument' $ref: '#/components/schemas/RAGDocument'
description: >- description: >-
List of documents to index in the RAG system List of documents to index in the RAG system
vector_db_id: vector_store_id:
type: string type: string
description: >- description: >-
ID of the vector database to store the document embeddings ID of the vector database to store the document embeddings
@ -8665,7 +9193,7 @@ components:
additionalProperties: false additionalProperties: false
required: required:
- documents - documents
- vector_db_id - vector_store_id
- chunk_size_in_tokens - chunk_size_in_tokens
title: InsertRequest title: InsertRequest
DefaultRAGQueryGeneratorConfig: DefaultRAGQueryGeneratorConfig:
@ -8836,7 +9364,7 @@ components:
$ref: '#/components/schemas/InterleavedContent' $ref: '#/components/schemas/InterleavedContent'
description: >- description: >-
The query content to search for in the indexed documents The query content to search for in the indexed documents
vector_db_ids: vector_store_ids:
type: array type: array
items: items:
type: string type: string
@ -8849,7 +9377,7 @@ components:
additionalProperties: false additionalProperties: false
required: required:
- content - content
- vector_db_ids - vector_store_ids
title: QueryRequest title: QueryRequest
RAGQueryResult: RAGQueryResult:
type: object type: object
@ -8977,6 +9505,10 @@ components:
description: >- description: >-
The content of the chunk, which can be interleaved text, images, or other The content of the chunk, which can be interleaved text, images, or other
types. types.
chunk_id:
type: string
description: >-
Unique identifier for the chunk. Must be provided explicitly.
metadata: metadata:
type: object type: object
additionalProperties: additionalProperties:
@ -8997,10 +9529,6 @@ components:
description: >- description: >-
Optional embedding for the chunk. If not provided, it will be computed Optional embedding for the chunk. If not provided, it will be computed
later. later.
stored_chunk_id:
type: string
description: >-
The chunk ID that is stored in the vector database. Used for backend functionality.
chunk_metadata: chunk_metadata:
$ref: '#/components/schemas/ChunkMetadata' $ref: '#/components/schemas/ChunkMetadata'
description: >- description: >-
@ -9009,6 +9537,7 @@ components:
additionalProperties: false additionalProperties: false
required: required:
- content - content
- chunk_id
- metadata - metadata
title: Chunk title: Chunk
description: >- description: >-
@ -9073,7 +9602,7 @@ components:
InsertChunksRequest: InsertChunksRequest:
type: object type: object
properties: properties:
vector_db_id: vector_store_id:
type: string type: string
description: >- description: >-
The identifier of the vector database to insert the chunks into. The identifier of the vector database to insert the chunks into.
@ -9092,13 +9621,13 @@ components:
description: The time to live of the chunks. description: The time to live of the chunks.
additionalProperties: false additionalProperties: false
required: required:
- vector_db_id - vector_store_id
- chunks - chunks
title: InsertChunksRequest title: InsertChunksRequest
QueryChunksRequest: QueryChunksRequest:
type: object type: object
properties: properties:
vector_db_id: vector_store_id:
type: string type: string
description: >- description: >-
The identifier of the vector database to query. The identifier of the vector database to query.
@ -9118,7 +9647,7 @@ components:
description: The parameters of the query. description: The parameters of the query.
additionalProperties: false additionalProperties: false
required: required:
- vector_db_id - vector_store_id
- query - query
title: QueryChunksRequest title: QueryChunksRequest
QueryChunksResponse: QueryChunksResponse:
@ -10075,6 +10604,19 @@ tags:
- `background` - `background`
x-displayName: Agents x-displayName: Agents
- name: Batches
description: >-
The API is designed to allow use of openai client libraries for seamless integration.
This API provides the following extensions:
- idempotent batch creation
Note: This API is currently under active development and may undergo changes.
x-displayName: >-
The Batches API enables efficient processing of multiple requests in a single
operation, particularly useful for processing large datasets, batch evaluation
workflows, and cost-effective inference at scale.
- name: Conversations - name: Conversations
description: >- description: >-
Protocol for conversation management operations. Protocol for conversation management operations.
@ -10137,6 +10679,7 @@ x-tagGroups:
- name: Operations - name: Operations
tags: tags:
- Agents - Agents
- Batches
- Conversations - Conversations
- Files - Files
- Inference - Inference
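The spec changes in this file add an `input_file` content part, a `file_id` field on image parts, and an optional `prompt` template reference on the Responses request and response objects. A minimal sketch of a request exercising these fields through the OpenAI Python client pointed at a Llama Stack server follows; the base URL, model name, prompt ID, and file ID are placeholder assumptions, not values taken from this diff.

from openai import OpenAI

# Placeholders: server URL, model, prompt ID, and file ID are illustrative only.
client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    # New: reference a stored prompt template by ID/version and bind variables.
    # Variable values may be plain text or other input content parts such as files.
    prompt={
        "id": "pmpt_contract_review",
        "version": "2",
        "variables": {"contract": {"type": "input_file", "file_id": "file-abc123"}},
    },
    input=[
        {
            "role": "user",
            "content": [
                {"type": "input_text", "text": "Summarize the attached document."},
                # New content part type added by this change.
                {"type": "input_file", "file_id": "file-abc123"},
            ],
        }
    ],
)
print(response.output_text)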
View file
@ -40,6 +40,193 @@
} }
], ],
"paths": { "paths": {
"/v1/batches": {
"get": {
"responses": {
"200": {
"description": "A list of batch objects.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ListBatchesResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Batches"
],
"summary": "List all batches for the current user.",
"description": "List all batches for the current user.",
"parameters": [
{
"name": "after",
"in": "query",
"description": "A cursor for pagination; returns batches after this batch ID.",
"required": false,
"schema": {
"type": "string"
}
},
{
"name": "limit",
"in": "query",
"description": "Number of batches to return (default 20, max 100).",
"required": true,
"schema": {
"type": "integer"
}
}
],
"deprecated": false
},
"post": {
"responses": {
"200": {
"description": "The created batch object.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Batch"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Batches"
],
"summary": "Create a new batch for processing multiple API requests.",
"description": "Create a new batch for processing multiple API requests.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CreateBatchRequest"
}
}
},
"required": true
},
"deprecated": false
}
},
"/v1/batches/{batch_id}": {
"get": {
"responses": {
"200": {
"description": "The batch object.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Batch"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Batches"
],
"summary": "Retrieve information about a specific batch.",
"description": "Retrieve information about a specific batch.",
"parameters": [
{
"name": "batch_id",
"in": "path",
"description": "The ID of the batch to retrieve.",
"required": true,
"schema": {
"type": "string"
}
}
],
"deprecated": false
}
},
"/v1/batches/{batch_id}/cancel": {
"post": {
"responses": {
"200": {
"description": "The updated batch object.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Batch"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Batches"
],
"summary": "Cancel a batch that is in progress.",
"description": "Cancel a batch that is in progress.",
"parameters": [
{
"name": "batch_id",
"in": "path",
"description": "The ID of the batch to cancel.",
"required": true,
"schema": {
"type": "string"
}
}
],
"deprecated": false
}
},
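Since the Batches API is intended to be driven through OpenAI client libraries, here is a minimal sketch of the three endpoints defined above using the openai Python client against a Llama Stack server. The base URL, input file ID, and metadata are placeholders, and passing the Llama Stack-specific idempotency_key through extra_body is an assumption about how that extension reaches the server.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

# POST /v1/batches: create a batch from a previously uploaded JSONL file of requests.
batch = client.batches.create(
    input_file_id="file-abc123",
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"run": "nightly-eval"},
    # Llama Stack extension (see CreateBatchRequest below); extra_body is an
    # assumption about how the additional field is conveyed.
    extra_body={"idempotency_key": "nightly-eval-2025-10-31"},
)

# GET /v1/batches/{batch_id}: poll status and request counts.
batch = client.batches.retrieve(batch.id)
print(batch.status, batch.request_counts)

# GET /v1/batches: list batches; POST /v1/batches/{batch_id}/cancel: cancel one in progress.
print([b.id for b in client.batches.list(limit=20)])
client.batches.cancel(batch.id)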
"/v1/chat/completions": { "/v1/chat/completions": {
"get": { "get": {
"responses": { "responses": {
@ -5677,6 +5864,451 @@
"title": "Error", "title": "Error",
"description": "Error response from the API. Roughly follows RFC 7807." "description": "Error response from the API. Roughly follows RFC 7807."
}, },
"ListBatchesResponse": {
"type": "object",
"properties": {
"object": {
"type": "string",
"const": "list",
"default": "list"
},
"data": {
"type": "array",
"items": {
"type": "object",
"properties": {
"id": {
"type": "string"
},
"completion_window": {
"type": "string"
},
"created_at": {
"type": "integer"
},
"endpoint": {
"type": "string"
},
"input_file_id": {
"type": "string"
},
"object": {
"type": "string",
"const": "batch"
},
"status": {
"type": "string",
"enum": [
"validating",
"failed",
"in_progress",
"finalizing",
"completed",
"expired",
"cancelling",
"cancelled"
]
},
"cancelled_at": {
"type": "integer"
},
"cancelling_at": {
"type": "integer"
},
"completed_at": {
"type": "integer"
},
"error_file_id": {
"type": "string"
},
"errors": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"type": "object",
"properties": {
"code": {
"type": "string"
},
"line": {
"type": "integer"
},
"message": {
"type": "string"
},
"param": {
"type": "string"
}
},
"additionalProperties": false,
"title": "BatchError"
}
},
"object": {
"type": "string"
}
},
"additionalProperties": false,
"title": "Errors"
},
"expired_at": {
"type": "integer"
},
"expires_at": {
"type": "integer"
},
"failed_at": {
"type": "integer"
},
"finalizing_at": {
"type": "integer"
},
"in_progress_at": {
"type": "integer"
},
"metadata": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"model": {
"type": "string"
},
"output_file_id": {
"type": "string"
},
"request_counts": {
"type": "object",
"properties": {
"completed": {
"type": "integer"
},
"failed": {
"type": "integer"
},
"total": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"completed",
"failed",
"total"
],
"title": "BatchRequestCounts"
},
"usage": {
"type": "object",
"properties": {
"input_tokens": {
"type": "integer"
},
"input_tokens_details": {
"type": "object",
"properties": {
"cached_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"cached_tokens"
],
"title": "InputTokensDetails"
},
"output_tokens": {
"type": "integer"
},
"output_tokens_details": {
"type": "object",
"properties": {
"reasoning_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"reasoning_tokens"
],
"title": "OutputTokensDetails"
},
"total_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"input_tokens",
"input_tokens_details",
"output_tokens",
"output_tokens_details",
"total_tokens"
],
"title": "BatchUsage"
}
},
"additionalProperties": false,
"required": [
"id",
"completion_window",
"created_at",
"endpoint",
"input_file_id",
"object",
"status"
],
"title": "Batch"
}
},
"first_id": {
"type": "string"
},
"last_id": {
"type": "string"
},
"has_more": {
"type": "boolean",
"default": false
}
},
"additionalProperties": false,
"required": [
"object",
"data",
"has_more"
],
"title": "ListBatchesResponse",
"description": "Response containing a list of batch objects."
},
"CreateBatchRequest": {
"type": "object",
"properties": {
"input_file_id": {
"type": "string",
"description": "The ID of an uploaded file containing requests for the batch."
},
"endpoint": {
"type": "string",
"description": "The endpoint to be used for all requests in the batch."
},
"completion_window": {
"type": "string",
"const": "24h",
"description": "The time window within which the batch should be processed."
},
"metadata": {
"type": "object",
"additionalProperties": {
"type": "string"
},
"description": "Optional metadata for the batch."
},
"idempotency_key": {
"type": "string",
"description": "Optional idempotency key. When provided, enables idempotent behavior."
}
},
"additionalProperties": false,
"required": [
"input_file_id",
"endpoint",
"completion_window"
],
"title": "CreateBatchRequest"
},
"Batch": {
"type": "object",
"properties": {
"id": {
"type": "string"
},
"completion_window": {
"type": "string"
},
"created_at": {
"type": "integer"
},
"endpoint": {
"type": "string"
},
"input_file_id": {
"type": "string"
},
"object": {
"type": "string",
"const": "batch"
},
"status": {
"type": "string",
"enum": [
"validating",
"failed",
"in_progress",
"finalizing",
"completed",
"expired",
"cancelling",
"cancelled"
]
},
"cancelled_at": {
"type": "integer"
},
"cancelling_at": {
"type": "integer"
},
"completed_at": {
"type": "integer"
},
"error_file_id": {
"type": "string"
},
"errors": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"type": "object",
"properties": {
"code": {
"type": "string"
},
"line": {
"type": "integer"
},
"message": {
"type": "string"
},
"param": {
"type": "string"
}
},
"additionalProperties": false,
"title": "BatchError"
}
},
"object": {
"type": "string"
}
},
"additionalProperties": false,
"title": "Errors"
},
"expired_at": {
"type": "integer"
},
"expires_at": {
"type": "integer"
},
"failed_at": {
"type": "integer"
},
"finalizing_at": {
"type": "integer"
},
"in_progress_at": {
"type": "integer"
},
"metadata": {
"type": "object",
"additionalProperties": {
"type": "string"
}
},
"model": {
"type": "string"
},
"output_file_id": {
"type": "string"
},
"request_counts": {
"type": "object",
"properties": {
"completed": {
"type": "integer"
},
"failed": {
"type": "integer"
},
"total": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"completed",
"failed",
"total"
],
"title": "BatchRequestCounts"
},
"usage": {
"type": "object",
"properties": {
"input_tokens": {
"type": "integer"
},
"input_tokens_details": {
"type": "object",
"properties": {
"cached_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"cached_tokens"
],
"title": "InputTokensDetails"
},
"output_tokens": {
"type": "integer"
},
"output_tokens_details": {
"type": "object",
"properties": {
"reasoning_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"reasoning_tokens"
],
"title": "OutputTokensDetails"
},
"total_tokens": {
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"input_tokens",
"input_tokens_details",
"output_tokens",
"output_tokens_details",
"total_tokens"
],
"title": "BatchUsage"
}
},
"additionalProperties": false,
"required": [
"id",
"completion_window",
"created_at",
"endpoint",
"input_file_id",
"object",
"status"
],
"title": "Batch"
},
"Order": { "Order": {
"type": "string", "type": "string",
"enum": [ "enum": [
@ -7368,16 +8000,53 @@
}, },
{ {
"$ref": "#/components/schemas/OpenAIResponseInputMessageContentImage" "$ref": "#/components/schemas/OpenAIResponseInputMessageContentImage"
},
{
"$ref": "#/components/schemas/OpenAIResponseInputMessageContentFile"
} }
], ],
"discriminator": { "discriminator": {
"propertyName": "type", "propertyName": "type",
"mapping": { "mapping": {
"input_text": "#/components/schemas/OpenAIResponseInputMessageContentText", "input_text": "#/components/schemas/OpenAIResponseInputMessageContentText",
"input_image": "#/components/schemas/OpenAIResponseInputMessageContentImage" "input_image": "#/components/schemas/OpenAIResponseInputMessageContentImage",
"input_file": "#/components/schemas/OpenAIResponseInputMessageContentFile"
} }
} }
}, },
"OpenAIResponseInputMessageContentFile": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "input_file",
"default": "input_file",
"description": "The type of the input item. Always `input_file`."
},
"file_data": {
"type": "string",
"description": "The data of the file to be sent to the model."
},
"file_id": {
"type": "string",
"description": "(Optional) The ID of the file to be sent to the model."
},
"file_url": {
"type": "string",
"description": "The URL of the file to be sent to the model."
},
"filename": {
"type": "string",
"description": "The name of the file to be sent to the model."
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "OpenAIResponseInputMessageContentFile",
"description": "File content for input messages in OpenAI response format."
},
"OpenAIResponseInputMessageContentImage": { "OpenAIResponseInputMessageContentImage": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -7405,6 +8074,10 @@
"default": "input_image", "default": "input_image",
"description": "Content type identifier, always \"input_image\"" "description": "Content type identifier, always \"input_image\""
}, },
"file_id": {
"type": "string",
"description": "(Optional) The ID of the file to be sent to the model."
},
"image_url": { "image_url": {
"type": "string", "type": "string",
"description": "(Optional) URL of the image content" "description": "(Optional) URL of the image content"
@ -8977,29 +9650,14 @@
"OpenAIResponseInput": { "OpenAIResponseInput": {
"oneOf": [ "oneOf": [
{ {
"$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" "$ref": "#/components/schemas/OpenAIResponseOutput"
},
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall"
},
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
}, },
{ {
"$ref": "#/components/schemas/OpenAIResponseInputFunctionToolCallOutput" "$ref": "#/components/schemas/OpenAIResponseInputFunctionToolCallOutput"
}, },
{
"$ref": "#/components/schemas/OpenAIResponseMCPApprovalRequest"
},
{ {
"$ref": "#/components/schemas/OpenAIResponseMCPApprovalResponse" "$ref": "#/components/schemas/OpenAIResponseMCPApprovalResponse"
}, },
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPCall"
},
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools"
},
{ {
"$ref": "#/components/schemas/OpenAIResponseMessage" "$ref": "#/components/schemas/OpenAIResponseMessage"
} }
@ -9208,6 +9866,10 @@
"type": "string", "type": "string",
"description": "(Optional) ID of the previous response in a conversation" "description": "(Optional) ID of the previous response in a conversation"
}, },
"prompt": {
"$ref": "#/components/schemas/OpenAIResponsePrompt",
"description": "(Optional) Reference to a prompt template and its variables."
},
"status": { "status": {
"type": "string", "type": "string",
"description": "Current status of the response generation" "description": "Current status of the response generation"
@ -9303,6 +9965,32 @@
} }
} }
}, },
"OpenAIResponsePrompt": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "Unique identifier of the prompt template"
},
"variables": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/OpenAIResponseInputMessageContent"
},
"description": "Dictionary of variable names to OpenAIResponseInputMessageContent structure for template substitution. The substitution values can either be strings, or other Response input types like images or files."
},
"version": {
"type": "string",
"description": "Version number of the prompt to use (defaults to latest if not specified)"
}
},
"additionalProperties": false,
"required": [
"id"
],
"title": "OpenAIResponsePrompt",
"description": "OpenAI compatible Prompt object that is used in OpenAI responses."
},
"OpenAIResponseText": { "OpenAIResponseText": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -9673,6 +10361,10 @@
"type": "string", "type": "string",
"description": "The underlying LLM used for completions." "description": "The underlying LLM used for completions."
}, },
"prompt": {
"$ref": "#/components/schemas/OpenAIResponsePrompt",
"description": "(Optional) Prompt object with ID, version, and variables."
},
"instructions": { "instructions": {
"type": "string" "type": "string"
}, },
@ -9761,6 +10453,10 @@
"type": "string", "type": "string",
"description": "(Optional) ID of the previous response in a conversation" "description": "(Optional) ID of the previous response in a conversation"
}, },
"prompt": {
"$ref": "#/components/schemas/OpenAIResponsePrompt",
"description": "(Optional) Reference to a prompt template and its variables."
},
"status": { "status": {
"type": "string", "type": "string",
"description": "Current status of the response generation" "description": "Current status of the response generation"
@ -13099,7 +13795,7 @@
}, },
"description": "List of documents to index in the RAG system" "description": "List of documents to index in the RAG system"
}, },
"vector_db_id": { "vector_store_id": {
"type": "string", "type": "string",
"description": "ID of the vector database to store the document embeddings" "description": "ID of the vector database to store the document embeddings"
}, },
@ -13111,7 +13807,7 @@
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"documents", "documents",
"vector_db_id", "vector_store_id",
"chunk_size_in_tokens" "chunk_size_in_tokens"
], ],
"title": "InsertRequest" "title": "InsertRequest"
@ -13302,7 +13998,7 @@
"$ref": "#/components/schemas/InterleavedContent", "$ref": "#/components/schemas/InterleavedContent",
"description": "The query content to search for in the indexed documents" "description": "The query content to search for in the indexed documents"
}, },
"vector_db_ids": { "vector_store_ids": {
"type": "array", "type": "array",
"items": { "items": {
"type": "string" "type": "string"
@ -13317,7 +14013,7 @@
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"content", "content",
"vector_db_ids" "vector_store_ids"
], ],
"title": "QueryRequest" "title": "QueryRequest"
}, },
@ -13505,6 +14201,10 @@
"$ref": "#/components/schemas/InterleavedContent", "$ref": "#/components/schemas/InterleavedContent",
"description": "The content of the chunk, which can be interleaved text, images, or other types." "description": "The content of the chunk, which can be interleaved text, images, or other types."
}, },
"chunk_id": {
"type": "string",
"description": "Unique identifier for the chunk. Must be provided explicitly."
},
"metadata": { "metadata": {
"type": "object", "type": "object",
"additionalProperties": { "additionalProperties": {
@ -13538,10 +14238,6 @@
}, },
"description": "Optional embedding for the chunk. If not provided, it will be computed later." "description": "Optional embedding for the chunk. If not provided, it will be computed later."
}, },
"stored_chunk_id": {
"type": "string",
"description": "The chunk ID that is stored in the vector database. Used for backend functionality."
},
"chunk_metadata": { "chunk_metadata": {
"$ref": "#/components/schemas/ChunkMetadata", "$ref": "#/components/schemas/ChunkMetadata",
"description": "Metadata for the chunk that will NOT be used in the context during inference. The `chunk_metadata` is required backend functionality." "description": "Metadata for the chunk that will NOT be used in the context during inference. The `chunk_metadata` is required backend functionality."
@ -13550,6 +14246,7 @@
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"content", "content",
"chunk_id",
"metadata" "metadata"
], ],
"title": "Chunk", "title": "Chunk",
@ -13610,7 +14307,7 @@
"InsertChunksRequest": { "InsertChunksRequest": {
"type": "object", "type": "object",
"properties": { "properties": {
"vector_db_id": { "vector_store_id": {
"type": "string", "type": "string",
"description": "The identifier of the vector database to insert the chunks into." "description": "The identifier of the vector database to insert the chunks into."
}, },
@ -13628,7 +14325,7 @@
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"vector_db_id", "vector_store_id",
"chunks" "chunks"
], ],
"title": "InsertChunksRequest" "title": "InsertChunksRequest"
@ -13636,7 +14333,7 @@
"QueryChunksRequest": { "QueryChunksRequest": {
"type": "object", "type": "object",
"properties": { "properties": {
"vector_db_id": { "vector_store_id": {
"type": "string", "type": "string",
"description": "The identifier of the vector database to query." "description": "The identifier of the vector database to query."
}, },
@ -13673,7 +14370,7 @@
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"vector_db_id", "vector_store_id",
"query" "query"
], ],
"title": "QueryChunksRequest" "title": "QueryChunksRequest"
@ -15452,7 +16149,6 @@
}, },
"max_tokens": { "max_tokens": {
"type": "integer", "type": "integer",
"default": 0,
"description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length." "description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
}, },
"repetition_penalty": { "repetition_penalty": {
@ -15735,7 +16431,7 @@
"const": "memory_retrieval", "const": "memory_retrieval",
"default": "memory_retrieval" "default": "memory_retrieval"
}, },
"vector_db_ids": { "vector_store_ids": {
"type": "string", "type": "string",
"description": "The IDs of the vector databases to retrieve context from." "description": "The IDs of the vector databases to retrieve context from."
}, },
@ -15749,7 +16445,7 @@
"turn_id", "turn_id",
"step_id", "step_id",
"step_type", "step_type",
"vector_db_ids", "vector_store_ids",
"inserted_context" "inserted_context"
], ],
"title": "MemoryRetrievalStep", "title": "MemoryRetrievalStep",
@ -17897,6 +18593,11 @@
"description": "APIs for creating and interacting with agentic systems.", "description": "APIs for creating and interacting with agentic systems.",
"x-displayName": "Agents" "x-displayName": "Agents"
}, },
{
"name": "Batches",
"description": "The API is designed to allow use of openai client libraries for seamless integration.\n\nThis API provides the following extensions:\n - idempotent batch creation\n\nNote: This API is currently under active development and may undergo changes.",
"x-displayName": "The Batches API enables efficient processing of multiple requests in a single operation, particularly useful for processing large datasets, batch evaluation workflows, and cost-effective inference at scale."
},
{ {
"name": "Benchmarks", "name": "Benchmarks",
"description": "" "description": ""
@ -17991,6 +18692,7 @@
"name": "Operations", "name": "Operations",
"tags": [ "tags": [
"Agents", "Agents",
"Batches",
"Benchmarks", "Benchmarks",
"Conversations", "Conversations",
View file

@ -15,6 +15,141 @@ info:
servers: servers:
- url: http://any-hosted-llama-stack.com - url: http://any-hosted-llama-stack.com
paths: paths:
/v1/batches:
get:
responses:
'200':
description: A list of batch objects.
content:
application/json:
schema:
$ref: '#/components/schemas/ListBatchesResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Batches
summary: List all batches for the current user.
description: List all batches for the current user.
parameters:
- name: after
in: query
description: >-
A cursor for pagination; returns batches after this batch ID.
required: false
schema:
type: string
- name: limit
in: query
description: >-
Number of batches to return (default 20, max 100).
required: true
schema:
type: integer
deprecated: false
post:
responses:
'200':
description: The created batch object.
content:
application/json:
schema:
$ref: '#/components/schemas/Batch'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Batches
summary: >-
Create a new batch for processing multiple API requests.
description: >-
Create a new batch for processing multiple API requests.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/CreateBatchRequest'
required: true
deprecated: false
/v1/batches/{batch_id}:
get:
responses:
'200':
description: The batch object.
content:
application/json:
schema:
$ref: '#/components/schemas/Batch'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Batches
summary: >-
Retrieve information about a specific batch.
description: >-
Retrieve information about a specific batch.
parameters:
- name: batch_id
in: path
description: The ID of the batch to retrieve.
required: true
schema:
type: string
deprecated: false
/v1/batches/{batch_id}/cancel:
post:
responses:
'200':
description: The updated batch object.
content:
application/json:
schema:
$ref: '#/components/schemas/Batch'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Batches
summary: Cancel a batch that is in progress.
description: Cancel a batch that is in progress.
parameters:
- name: batch_id
in: path
description: The ID of the batch to cancel.
required: true
schema:
type: string
deprecated: false
/v1/chat/completions: /v1/chat/completions:
get: get:
responses: responses:
@ -4212,6 +4347,331 @@ components:
title: Error title: Error
description: >- description: >-
Error response from the API. Roughly follows RFC 7807. Error response from the API. Roughly follows RFC 7807.
ListBatchesResponse:
type: object
properties:
object:
type: string
const: list
default: list
data:
type: array
items:
type: object
properties:
id:
type: string
completion_window:
type: string
created_at:
type: integer
endpoint:
type: string
input_file_id:
type: string
object:
type: string
const: batch
status:
type: string
enum:
- validating
- failed
- in_progress
- finalizing
- completed
- expired
- cancelling
- cancelled
cancelled_at:
type: integer
cancelling_at:
type: integer
completed_at:
type: integer
error_file_id:
type: string
errors:
type: object
properties:
data:
type: array
items:
type: object
properties:
code:
type: string
line:
type: integer
message:
type: string
param:
type: string
additionalProperties: false
title: BatchError
object:
type: string
additionalProperties: false
title: Errors
expired_at:
type: integer
expires_at:
type: integer
failed_at:
type: integer
finalizing_at:
type: integer
in_progress_at:
type: integer
metadata:
type: object
additionalProperties:
type: string
model:
type: string
output_file_id:
type: string
request_counts:
type: object
properties:
completed:
type: integer
failed:
type: integer
total:
type: integer
additionalProperties: false
required:
- completed
- failed
- total
title: BatchRequestCounts
usage:
type: object
properties:
input_tokens:
type: integer
input_tokens_details:
type: object
properties:
cached_tokens:
type: integer
additionalProperties: false
required:
- cached_tokens
title: InputTokensDetails
output_tokens:
type: integer
output_tokens_details:
type: object
properties:
reasoning_tokens:
type: integer
additionalProperties: false
required:
- reasoning_tokens
title: OutputTokensDetails
total_tokens:
type: integer
additionalProperties: false
required:
- input_tokens
- input_tokens_details
- output_tokens
- output_tokens_details
- total_tokens
title: BatchUsage
additionalProperties: false
required:
- id
- completion_window
- created_at
- endpoint
- input_file_id
- object
- status
title: Batch
first_id:
type: string
last_id:
type: string
has_more:
type: boolean
default: false
additionalProperties: false
required:
- object
- data
- has_more
title: ListBatchesResponse
description: >-
Response containing a list of batch objects.
CreateBatchRequest:
type: object
properties:
input_file_id:
type: string
description: >-
The ID of an uploaded file containing requests for the batch.
endpoint:
type: string
description: >-
The endpoint to be used for all requests in the batch.
completion_window:
type: string
const: 24h
description: >-
The time window within which the batch should be processed.
metadata:
type: object
additionalProperties:
type: string
description: Optional metadata for the batch.
idempotency_key:
type: string
description: >-
Optional idempotency key. When provided, enables idempotent behavior.
additionalProperties: false
required:
- input_file_id
- endpoint
- completion_window
title: CreateBatchRequest
Batch:
type: object
properties:
id:
type: string
completion_window:
type: string
created_at:
type: integer
endpoint:
type: string
input_file_id:
type: string
object:
type: string
const: batch
status:
type: string
enum:
- validating
- failed
- in_progress
- finalizing
- completed
- expired
- cancelling
- cancelled
cancelled_at:
type: integer
cancelling_at:
type: integer
completed_at:
type: integer
error_file_id:
type: string
errors:
type: object
properties:
data:
type: array
items:
type: object
properties:
code:
type: string
line:
type: integer
message:
type: string
param:
type: string
additionalProperties: false
title: BatchError
object:
type: string
additionalProperties: false
title: Errors
expired_at:
type: integer
expires_at:
type: integer
failed_at:
type: integer
finalizing_at:
type: integer
in_progress_at:
type: integer
metadata:
type: object
additionalProperties:
type: string
model:
type: string
output_file_id:
type: string
request_counts:
type: object
properties:
completed:
type: integer
failed:
type: integer
total:
type: integer
additionalProperties: false
required:
- completed
- failed
- total
title: BatchRequestCounts
usage:
type: object
properties:
input_tokens:
type: integer
input_tokens_details:
type: object
properties:
cached_tokens:
type: integer
additionalProperties: false
required:
- cached_tokens
title: InputTokensDetails
output_tokens:
type: integer
output_tokens_details:
type: object
properties:
reasoning_tokens:
type: integer
additionalProperties: false
required:
- reasoning_tokens
title: OutputTokensDetails
total_tokens:
type: integer
additionalProperties: false
required:
- input_tokens
- input_tokens_details
- output_tokens
- output_tokens_details
- total_tokens
title: BatchUsage
additionalProperties: false
required:
- id
- completion_window
- created_at
- endpoint
- input_file_id
- object
- status
title: Batch
Order: Order:
type: string type: string
enum: enum:
@ -5474,11 +5934,44 @@ components:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseInputMessageContentText' - $ref: '#/components/schemas/OpenAIResponseInputMessageContentText'
- $ref: '#/components/schemas/OpenAIResponseInputMessageContentImage' - $ref: '#/components/schemas/OpenAIResponseInputMessageContentImage'
- $ref: '#/components/schemas/OpenAIResponseInputMessageContentFile'
discriminator: discriminator:
propertyName: type propertyName: type
mapping: mapping:
input_text: '#/components/schemas/OpenAIResponseInputMessageContentText' input_text: '#/components/schemas/OpenAIResponseInputMessageContentText'
input_image: '#/components/schemas/OpenAIResponseInputMessageContentImage' input_image: '#/components/schemas/OpenAIResponseInputMessageContentImage'
input_file: '#/components/schemas/OpenAIResponseInputMessageContentFile'
OpenAIResponseInputMessageContentFile:
type: object
properties:
type:
type: string
const: input_file
default: input_file
description: >-
The type of the input item. Always `input_file`.
file_data:
type: string
description: >-
The data of the file to be sent to the model.
file_id:
type: string
description: >-
(Optional) The ID of the file to be sent to the model.
file_url:
type: string
description: >-
The URL of the file to be sent to the model.
filename:
type: string
description: >-
The name of the file to be sent to the model.
additionalProperties: false
required:
- type
title: OpenAIResponseInputMessageContentFile
description: >-
File content for input messages in OpenAI response format.
OpenAIResponseInputMessageContentImage: OpenAIResponseInputMessageContentImage:
type: object type: object
properties: properties:
@ -5499,6 +5992,10 @@ components:
default: input_image default: input_image
description: >- description: >-
Content type identifier, always "input_image" Content type identifier, always "input_image"
file_id:
type: string
description: >-
(Optional) The ID of the file to be sent to the model.
image_url: image_url:
type: string type: string
description: (Optional) URL of the image content description: (Optional) URL of the image content
@ -6735,14 +7232,9 @@ components:
Error details for failed OpenAI response requests. Error details for failed OpenAI response requests.
OpenAIResponseInput: OpenAIResponseInput:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' - $ref: '#/components/schemas/OpenAIResponseOutput'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
- $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput' - $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse' - $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
- $ref: '#/components/schemas/OpenAIResponseMessage' - $ref: '#/components/schemas/OpenAIResponseMessage'
OpenAIResponseInputToolFileSearch: OpenAIResponseInputToolFileSearch:
type: object type: object
@ -6898,6 +7390,10 @@ components:
type: string type: string
description: >- description: >-
(Optional) ID of the previous response in a conversation (Optional) ID of the previous response in a conversation
prompt:
$ref: '#/components/schemas/OpenAIResponsePrompt'
description: >-
(Optional) Reference to a prompt template and its variables.
status: status:
type: string type: string
description: >- description: >-
@ -6971,6 +7467,30 @@ components:
mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall' mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools' mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
mcp_approval_request: '#/components/schemas/OpenAIResponseMCPApprovalRequest' mcp_approval_request: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
OpenAIResponsePrompt:
type: object
properties:
id:
type: string
description: Unique identifier of the prompt template
variables:
type: object
additionalProperties:
$ref: '#/components/schemas/OpenAIResponseInputMessageContent'
description: >-
Dictionary of variable names to OpenAIResponseInputMessageContent structure
for template substitution. The substitution values can either be strings,
or other Response input types like images or files.
version:
type: string
description: >-
Version number of the prompt to use (defaults to latest if not specified)
additionalProperties: false
required:
- id
title: OpenAIResponsePrompt
description: >-
OpenAI compatible Prompt object that is used in OpenAI responses.
OpenAIResponseText: OpenAIResponseText:
type: object type: object
properties: properties:
@ -7228,6 +7748,10 @@ components:
model: model:
type: string type: string
description: The underlying LLM used for completions. description: The underlying LLM used for completions.
prompt:
$ref: '#/components/schemas/OpenAIResponsePrompt'
description: >-
(Optional) Prompt object with ID, version, and variables.
instructions: instructions:
type: string type: string
previous_response_id: previous_response_id:
@ -7305,6 +7829,10 @@ components:
type: string type: string
description: >- description: >-
(Optional) ID of the previous response in a conversation (Optional) ID of the previous response in a conversation
prompt:
$ref: '#/components/schemas/OpenAIResponsePrompt'
description: >-
(Optional) Reference to a prompt template and its variables.
status: status:
type: string type: string
description: >- description: >-
@ -9867,7 +10395,7 @@ components:
$ref: '#/components/schemas/RAGDocument' $ref: '#/components/schemas/RAGDocument'
description: >- description: >-
List of documents to index in the RAG system List of documents to index in the RAG system
vector_db_id: vector_store_id:
type: string type: string
description: >- description: >-
ID of the vector database to store the document embeddings ID of the vector database to store the document embeddings
@ -9878,7 +10406,7 @@ components:
additionalProperties: false additionalProperties: false
required: required:
- documents - documents
- vector_db_id - vector_store_id
- chunk_size_in_tokens - chunk_size_in_tokens
title: InsertRequest title: InsertRequest
DefaultRAGQueryGeneratorConfig: DefaultRAGQueryGeneratorConfig:
@ -10049,7 +10577,7 @@ components:
$ref: '#/components/schemas/InterleavedContent' $ref: '#/components/schemas/InterleavedContent'
description: >- description: >-
The query content to search for in the indexed documents The query content to search for in the indexed documents
vector_db_ids: vector_store_ids:
type: array type: array
items: items:
type: string type: string
@ -10062,7 +10590,7 @@ components:
additionalProperties: false additionalProperties: false
required: required:
- content - content
- vector_db_ids - vector_store_ids
title: QueryRequest title: QueryRequest
RAGQueryResult: RAGQueryResult:
type: object type: object
@ -10190,6 +10718,10 @@ components:
description: >- description: >-
The content of the chunk, which can be interleaved text, images, or other The content of the chunk, which can be interleaved text, images, or other
types. types.
chunk_id:
type: string
description: >-
Unique identifier for the chunk. Must be provided explicitly.
metadata: metadata:
type: object type: object
additionalProperties: additionalProperties:
@ -10210,10 +10742,6 @@ components:
description: >- description: >-
Optional embedding for the chunk. If not provided, it will be computed Optional embedding for the chunk. If not provided, it will be computed
later. later.
stored_chunk_id:
type: string
description: >-
The chunk ID that is stored in the vector database. Used for backend functionality.
chunk_metadata: chunk_metadata:
$ref: '#/components/schemas/ChunkMetadata' $ref: '#/components/schemas/ChunkMetadata'
description: >- description: >-
@ -10222,6 +10750,7 @@ components:
additionalProperties: false additionalProperties: false
required: required:
- content - content
- chunk_id
- metadata - metadata
title: Chunk title: Chunk
description: >- description: >-
@ -10286,7 +10815,7 @@ components:
InsertChunksRequest: InsertChunksRequest:
type: object type: object
properties: properties:
vector_db_id: vector_store_id:
type: string type: string
description: >- description: >-
The identifier of the vector database to insert the chunks into. The identifier of the vector database to insert the chunks into.
@ -10305,13 +10834,13 @@ components:
description: The time to live of the chunks. description: The time to live of the chunks.
additionalProperties: false additionalProperties: false
required: required:
- vector_db_id - vector_store_id
- chunks - chunks
title: InsertChunksRequest title: InsertChunksRequest
QueryChunksRequest: QueryChunksRequest:
type: object type: object
properties: properties:
vector_db_id: vector_store_id:
type: string type: string
description: >- description: >-
The identifier of the vector database to query. The identifier of the vector database to query.
@ -10331,7 +10860,7 @@ components:
description: The parameters of the query. description: The parameters of the query.
additionalProperties: false additionalProperties: false
required: required:
- vector_db_id - vector_store_id
- query - query
title: QueryChunksRequest title: QueryChunksRequest
QueryChunksResponse: QueryChunksResponse:
@ -11600,7 +12129,6 @@ components:
description: The sampling strategy. description: The sampling strategy.
max_tokens: max_tokens:
type: integer type: integer
default: 0
description: >- description: >-
The maximum number of tokens that can be generated in the completion. The maximum number of tokens that can be generated in the completion.
The token count of your prompt plus max_tokens cannot exceed the model's The token count of your prompt plus max_tokens cannot exceed the model's
@ -11850,7 +12378,7 @@ components:
description: Type of the step in an agent turn. description: Type of the step in an agent turn.
const: memory_retrieval const: memory_retrieval
default: memory_retrieval default: memory_retrieval
vector_db_ids: vector_store_ids:
type: string type: string
description: >- description: >-
The IDs of the vector databases to retrieve context from. The IDs of the vector databases to retrieve context from.
@ -11863,7 +12391,7 @@ components:
- turn_id - turn_id
- step_id - step_id
- step_type - step_type
- vector_db_ids - vector_store_ids
- inserted_context - inserted_context
title: MemoryRetrievalStep title: MemoryRetrievalStep
description: >- description: >-
@ -13460,6 +13988,19 @@ tags:
description: >- description: >-
APIs for creating and interacting with agentic systems. APIs for creating and interacting with agentic systems.
x-displayName: Agents x-displayName: Agents
- name: Batches
description: >-
The API is designed to allow use of openai client libraries for seamless integration.
This API provides the following extensions:
- idempotent batch creation
Note: This API is currently under active development and may undergo changes.
x-displayName: >-
The Batches API enables efficient processing of multiple requests in a single
operation, particularly useful for processing large datasets, batch evaluation
workflows, and cost-effective inference at scale.
- name: Benchmarks - name: Benchmarks
description: '' description: ''
- name: Conversations - name: Conversations
@ -13534,6 +14075,7 @@ x-tagGroups:
- name: Operations - name: Operations
tags: tags:
- Agents - Agents
- Batches
- Benchmarks - Benchmarks
- Conversations - Conversations
- DatasetIO - DatasetIO
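The RAG and vector-io schemas above rename vector_db_id/vector_db_ids to vector_store_id/vector_store_ids and make chunk_id a required, caller-supplied field in place of the old stored_chunk_id. A minimal sketch of request payloads shaped to the updated InsertChunksRequest and QueryChunksRequest schemas follows; the store ID, metadata values, and query params are placeholder assumptions.

import uuid

chunk = {
    "content": "Llama Stack exposes OpenAI-compatible APIs.",
    "chunk_id": str(uuid.uuid4()),  # now required; no longer derived server-side
    "metadata": {"document_id": "doc-001", "source": "handbook.md"},
}

insert_chunks_request = {
    "vector_store_id": "vs_demo",   # formerly vector_db_id
    "chunks": [chunk],
}

query_chunks_request = {
    "vector_store_id": "vs_demo",
    "query": "Which APIs does Llama Stack expose?",
    "params": {"max_chunks": 5},    # assumption: provider-defined query parameters
}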
View file
@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .telemetry import *
View file
@ -1,250 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
import threading
from typing import Any
from opentelemetry import metrics, trace
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
from llama_stack.apis.telemetry import (
Event,
MetricEvent,
SpanEndPayload,
SpanStartPayload,
SpanStatus,
StructuredLogEvent,
UnstructuredLogEvent,
)
from llama_stack.apis.telemetry import (
Telemetry as TelemetryBase,
)
from llama_stack.core.telemetry.tracing import ROOT_SPAN_MARKERS
from llama_stack.log import get_logger
_GLOBAL_STORAGE: dict[str, dict[str | int, Any]] = {
"active_spans": {},
"counters": {},
"gauges": {},
"up_down_counters": {},
}
_global_lock = threading.Lock()
_TRACER_PROVIDER = None
logger = get_logger(name=__name__, category="telemetry")
def is_tracing_enabled(tracer):
with tracer.start_as_current_span("check_tracing") as span:
return span.is_recording()
class Telemetry(TelemetryBase):
def __init__(self) -> None:
self.meter = None
global _TRACER_PROVIDER
# Initialize the correct span processor based on the provider state.
# This is needed since once the span processor is set, it cannot be unset.
# Recreating the telemetry adapter multiple times will result in duplicate span processors.
# Since the library client can be recreated multiple times in a notebook,
# the kernel will hold on to the span processor and cause duplicate spans to be written.
if os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT"):
if _TRACER_PROVIDER is None:
provider = TracerProvider()
trace.set_tracer_provider(provider)
_TRACER_PROVIDER = provider
# Use single OTLP endpoint for all telemetry signals
# Let OpenTelemetry SDK handle endpoint construction automatically
# The SDK will read OTEL_EXPORTER_OTLP_ENDPOINT and construct appropriate URLs
# https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter
span_exporter = OTLPSpanExporter()
span_processor = BatchSpanProcessor(span_exporter)
trace.get_tracer_provider().add_span_processor(span_processor)
metric_reader = PeriodicExportingMetricReader(OTLPMetricExporter())
metric_provider = MeterProvider(metric_readers=[metric_reader])
metrics.set_meter_provider(metric_provider)
self.is_otel_endpoint_set = True
else:
logger.warning("OTEL_EXPORTER_OTLP_ENDPOINT is not set, skipping telemetry")
self.is_otel_endpoint_set = False
self.meter = metrics.get_meter(__name__)
self._lock = _global_lock
async def initialize(self) -> None:
pass
async def shutdown(self) -> None:
if self.is_otel_endpoint_set:
trace.get_tracer_provider().force_flush()
async def log_event(self, event: Event, ttl_seconds: int = 604800) -> None:
if isinstance(event, UnstructuredLogEvent):
self._log_unstructured(event, ttl_seconds)
elif isinstance(event, MetricEvent):
self._log_metric(event)
elif isinstance(event, StructuredLogEvent):
self._log_structured(event, ttl_seconds)
else:
raise ValueError(f"Unknown event type: {event}")
def _log_unstructured(self, event: UnstructuredLogEvent, ttl_seconds: int) -> None:
with self._lock:
# Use global storage instead of instance storage
span_id = int(event.span_id, 16)
span = _GLOBAL_STORAGE["active_spans"].get(span_id)
if span:
timestamp_ns = int(event.timestamp.timestamp() * 1e9)
span.add_event(
name=event.type.value,
attributes={
"message": event.message,
"severity": event.severity.value,
"__ttl__": ttl_seconds,
**(event.attributes or {}),
},
timestamp=timestamp_ns,
)
else:
print(f"Warning: No active span found for span_id {span_id}. Dropping event: {event}")
def _get_or_create_counter(self, name: str, unit: str) -> metrics.Counter:
assert self.meter is not None
if name not in _GLOBAL_STORAGE["counters"]:
_GLOBAL_STORAGE["counters"][name] = self.meter.create_counter(
name=name,
unit=unit,
description=f"Counter for {name}",
)
return _GLOBAL_STORAGE["counters"][name]
def _get_or_create_gauge(self, name: str, unit: str) -> metrics.ObservableGauge:
assert self.meter is not None
if name not in _GLOBAL_STORAGE["gauges"]:
_GLOBAL_STORAGE["gauges"][name] = self.meter.create_gauge(
name=name,
unit=unit,
description=f"Gauge for {name}",
)
return _GLOBAL_STORAGE["gauges"][name]
def _log_metric(self, event: MetricEvent) -> None:
# Add metric as an event to the current span
try:
with self._lock:
# Only try to add to span if we have a valid span_id
if event.span_id:
try:
span_id = int(event.span_id, 16)
span = _GLOBAL_STORAGE["active_spans"].get(span_id)
if span:
timestamp_ns = int(event.timestamp.timestamp() * 1e9)
span.add_event(
name=f"metric.{event.metric}",
attributes={
"value": event.value,
"unit": event.unit,
**(event.attributes or {}),
},
timestamp=timestamp_ns,
)
except (ValueError, KeyError):
# Invalid span_id or span not found, but we already logged to console above
pass
except Exception:
# Lock acquisition failed
logger.debug("Failed to acquire lock to add metric to span")
# Log to OpenTelemetry meter if available
if self.meter is None:
return
if isinstance(event.value, int):
counter = self._get_or_create_counter(event.metric, event.unit)
counter.add(event.value, attributes=event.attributes)
elif isinstance(event.value, float):
up_down_counter = self._get_or_create_up_down_counter(event.metric, event.unit)
up_down_counter.add(event.value, attributes=event.attributes)
def _get_or_create_up_down_counter(self, name: str, unit: str) -> metrics.UpDownCounter:
assert self.meter is not None
if name not in _GLOBAL_STORAGE["up_down_counters"]:
_GLOBAL_STORAGE["up_down_counters"][name] = self.meter.create_up_down_counter(
name=name,
unit=unit,
description=f"UpDownCounter for {name}",
)
return _GLOBAL_STORAGE["up_down_counters"][name]
def _log_structured(self, event: StructuredLogEvent, ttl_seconds: int) -> None:
with self._lock:
span_id = int(event.span_id, 16)
tracer = trace.get_tracer(__name__)
if event.attributes is None:
event.attributes = {}
event.attributes["__ttl__"] = ttl_seconds
# Extract these W3C trace context attributes so they are not written to
# underlying storage, as we just need them to propagate the trace context.
traceparent = event.attributes.pop("traceparent", None)
tracestate = event.attributes.pop("tracestate", None)
if traceparent:
# If we have a traceparent header value, we're not the root span.
for root_attribute in ROOT_SPAN_MARKERS:
event.attributes.pop(root_attribute, None)
if isinstance(event.payload, SpanStartPayload):
# Check if span already exists to prevent duplicates
if span_id in _GLOBAL_STORAGE["active_spans"]:
return
context = None
if event.payload.parent_span_id:
parent_span_id = int(event.payload.parent_span_id, 16)
parent_span = _GLOBAL_STORAGE["active_spans"].get(parent_span_id)
context = trace.set_span_in_context(parent_span)
elif traceparent:
carrier = {
"traceparent": traceparent,
"tracestate": tracestate,
}
context = TraceContextTextMapPropagator().extract(carrier=carrier)
span = tracer.start_span(
name=event.payload.name,
context=context,
attributes=event.attributes or {},
)
_GLOBAL_STORAGE["active_spans"][span_id] = span
elif isinstance(event.payload, SpanEndPayload):
span = _GLOBAL_STORAGE["active_spans"].get(span_id)
if span:
if event.attributes:
span.set_attributes(event.attributes)
status = (
trace.Status(status_code=trace.StatusCode.OK)
if event.payload.status == SpanStatus.OK
else trace.Status(status_code=trace.StatusCode.ERROR)
)
span.set_status(status)
span.end()
_GLOBAL_STORAGE["active_spans"].pop(span_id, None)
else:
raise ValueError(f"Unknown structured log event: {event}")
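For reference, a minimal sketch of the env-driven OTLP wiring the removed adapter relied on: with OTEL_EXPORTER_OTLP_ENDPOINT set, the HTTP exporters derive the /v1/traces and /v1/metrics URLs themselves, so only one endpoint needs configuring. The endpoint value below is a placeholder.

import os
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

os.environ.setdefault("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318")

provider = TracerProvider()
provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))  # exporter reads the env var
trace.set_tracer_provider(provider)

with trace.get_tracer(__name__).start_as_current_span("demo") as span:
    span.set_attribute("example", True)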
View file
@ -1,40 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from collections.abc import AsyncGenerator
from contextvars import ContextVar
def preserve_contexts_async_generator[T](
gen: AsyncGenerator[T, None], context_vars: list[ContextVar]
) -> AsyncGenerator[T, None]:
"""
Wraps an async generator to preserve context variables across iterations.
This is needed because we start a new asyncio event loop for each streaming request,
and we need to preserve the context across the event loop boundary.
"""
# Capture initial context values
initial_context_values = {context_var.name: context_var.get() for context_var in context_vars}
async def wrapper() -> AsyncGenerator[T, None]:
while True:
try:
# Restore context values before any await
for context_var in context_vars:
context_var.set(initial_context_values[context_var.name])
item = await gen.__anext__()
# Update our tracked values with any changes made during this iteration
for context_var in context_vars:
initial_context_values[context_var.name] = context_var.get()
yield item
except StopAsyncIteration:
break
return wrapper()
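A minimal usage sketch of the wrapper above; the request_id context variable and the generator are illustrative only:
import asyncio
from contextvars import ContextVar

# Hypothetical per-request context variable.
request_id_var: ContextVar[str] = ContextVar("request_id", default="unset")

async def stream_chunks():
    # Each chunk reads the context variable at yield time.
    for i in range(3):
        yield f"{request_id_var.get()}:chunk-{i}"

async def main():
    request_id_var.set("req-42")
    wrapped = preserve_contexts_async_generator(stream_chunks(), [request_id_var])
    async for chunk in wrapped:
        print(chunk)  # prints "req-42:chunk-0", "req-42:chunk-1", ...

asyncio.run(main())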

View file

@ -1,61 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from . import NVIDIAConfig
from .utils import _is_nvidia_hosted
logger = get_logger(name=__name__, category="inference::nvidia")
class NVIDIAInferenceAdapter(OpenAIMixin):
config: NVIDIAConfig
"""
NVIDIA Inference Adapter for Llama Stack.
"""
# source: https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html
embedding_model_metadata: dict[str, dict[str, int]] = {
"nvidia/llama-3.2-nv-embedqa-1b-v2": {"embedding_dimension": 2048, "context_length": 8192},
"nvidia/nv-embedqa-e5-v5": {"embedding_dimension": 512, "context_length": 1024},
"nvidia/nv-embedqa-mistral-7b-v2": {"embedding_dimension": 512, "context_length": 4096},
"snowflake/arctic-embed-l": {"embedding_dimension": 512, "context_length": 1024},
}
async def initialize(self) -> None:
logger.info(f"Initializing NVIDIAInferenceAdapter({self.config.url})...")
if _is_nvidia_hosted(self.config):
if not self.config.auth_credential:
raise RuntimeError(
"API key is required for hosted NVIDIA NIM. Either provide an API key or use a self-hosted NIM."
)
def get_api_key(self) -> str:
"""
Get the API key for OpenAI mixin.
:return: The NVIDIA API key
"""
if self.config.auth_credential:
return self.config.auth_credential.get_secret_value()
if not _is_nvidia_hosted(self.config):
return "NO KEY REQUIRED"
return None
def get_base_url(self) -> str:
"""
Get the base URL for OpenAI mixin.
:return: The NVIDIA API base URL
"""
return f"{self.config.url}/v1" if self.config.append_api_version else self.config.url
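As an illustration, the URL logic above reduces to the following; the endpoint value is an assumption, not a shipped default:
# Sketch of get_base_url(): append "/v1" only when append_api_version is set.
url = "https://nim.example.com"  # hypothetical self-hosted NIM endpoint
append_api_version = True
base_url = f"{url}/v1" if append_api_version else url
print(base_url)  # -> https://nim.example.com/v1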

View file

@ -31,7 +31,7 @@ dependencies = [
"jinja2>=3.1.6", "jinja2>=3.1.6",
"jsonschema", "jsonschema",
"llama-stack-client>=0.3.0", "llama-stack-client>=0.3.0",
"openai>=1.107", # for expires_after support "openai>=2.5.0",
"prompt-toolkit", "prompt-toolkit",
"python-dotenv", "python-dotenv",
"pyjwt[crypto]>=2.10.0", # Pull crypto to support RS256 for jwt. Requires 2.10.0+ for ssl_context support. "pyjwt[crypto]>=2.10.0", # Pull crypto to support RS256 for jwt. Requires 2.10.0+ for ssl_context support.
@ -67,17 +67,48 @@ dev = [
"pytest-cov", "pytest-cov",
"pytest-html", "pytest-html",
"pytest-json-report", "pytest-json-report",
"pytest-socket", # For blocking network access in unit tests "pytest-socket", # For blocking network access in unit tests
"nbval", # For notebook testing "nbval", # For notebook testing
"black", "black",
"ruff", "ruff",
"mypy",
"pre-commit",
"ruamel.yaml", # needed for openapi generator
]
# Type checking dependencies - includes type stubs and optional runtime dependencies
# needed for complete mypy coverage across all optional features
type_checking = [
"types-requests", "types-requests",
"types-setuptools", "types-setuptools",
"pre-commit", "types-jsonschema",
"ruamel.yaml", # needed for openapi generator "pandas-stubs",
"types-psutil",
"types-tqdm",
"boto3-stubs[s3]",
"streamlit",
"streamlit-option-menu",
"pandas",
"anthropic",
"databricks-sdk",
"fairscale",
"torchtune",
"trl",
"peft",
"datasets",
"together",
"nest-asyncio",
"pymongo",
"torchvision",
"sqlite-vec",
"faiss-cpu",
"lm-format-enforcer",
"mcp",
"ollama",
] ]
# These are the dependencies required for running unit tests. # These are the dependencies required for running unit tests.
unit = [ unit = [
"anthropic",
"databricks-sdk",
"sqlite-vec", "sqlite-vec",
"ollama", "ollama",
"aiosqlite", "aiosqlite",
@ -151,7 +182,7 @@ llama = "llama_stack.cli.llama:main"
install-wheel-from-presigned = "llama_stack.cli.scripts.run:install_wheel_from_presigned" install-wheel-from-presigned = "llama_stack.cli.scripts.run:install_wheel_from_presigned"
[tool.setuptools.packages.find] [tool.setuptools.packages.find]
where = ["."] where = ["src"]
include = ["llama_stack", "llama_stack.*"] include = ["llama_stack", "llama_stack.*"]
[[tool.uv.index]] [[tool.uv.index]]
@ -218,17 +249,17 @@ unfixable = [
# Ignore the following errors for the following files # Ignore the following errors for the following files
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
"tests/**/*.py" = ["DTZ"] # Ignore datetime rules for tests "tests/**/*.py" = ["DTZ"] # Ignore datetime rules for tests
"llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py" = ["RUF001"] "src/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py" = ["RUF001"]
"llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py" = [ "src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py" = [
"RUF001", "RUF001",
"PLE2515", "PLE2515",
] ]
"llama_stack/apis/**/__init__.py" = [ "src/llama_stack/apis/**/__init__.py" = [
"F403", "F403",
] # Using import * is acceptable (or at least tolerated) in an __init__.py of a package API ] # Using import * is acceptable (or at least tolerated) in an __init__.py of a package API
[tool.mypy] [tool.mypy]
mypy_path = ["llama_stack"] mypy_path = ["src"]
packages = ["llama_stack"] packages = ["llama_stack"]
plugins = ['pydantic.mypy'] plugins = ['pydantic.mypy']
disable_error_code = [] disable_error_code = []
@ -240,82 +271,91 @@ follow_imports = "silent"
# to exclude the entire directory. # to exclude the entire directory.
exclude = [ exclude = [
# As we fix more and more of these, we should remove them from the list # As we fix more and more of these, we should remove them from the list
"^llama_stack.core/build\\.py$", "^src/llama_stack/core/build\\.py$",
"^llama_stack.core/client\\.py$", "^src/llama_stack/core/client\\.py$",
"^llama_stack.core/request_headers\\.py$", "^src/llama_stack/core/request_headers\\.py$",
"^llama_stack.core/routers/", "^src/llama_stack/core/routers/",
"^llama_stack.core/routing_tables/", "^src/llama_stack/core/routing_tables/",
"^llama_stack.core/server/endpoints\\.py$", "^src/llama_stack/core/server/endpoints\\.py$",
"^llama_stack.core/server/server\\.py$", "^src/llama_stack/core/server/server\\.py$",
"^llama_stack.core/stack\\.py$", "^src/llama_stack/core/stack\\.py$",
"^llama_stack.core/store/registry\\.py$", "^src/llama_stack/core/store/registry\\.py$",
"^llama_stack.core/utils/exec\\.py$", "^src/llama_stack/core/utils/exec\\.py$",
"^llama_stack.core/utils/prompt_for_config\\.py$", "^src/llama_stack/core/utils/prompt_for_config\\.py$",
"^llama_stack/models/llama/llama3/interface\\.py$", "^src/llama_stack/models/llama/llama3/interface\\.py$",
"^llama_stack/models/llama/llama3/tokenizer\\.py$", "^src/llama_stack/models/llama/llama3/tokenizer\\.py$",
"^llama_stack/models/llama/llama3/tool_utils\\.py$", "^src/llama_stack/models/llama/llama3/tool_utils\\.py$",
"^llama_stack/providers/inline/agents/meta_reference/", "^src/llama_stack/providers/inline/datasetio/localfs/",
"^llama_stack/providers/inline/datasetio/localfs/", "^src/llama_stack/providers/inline/eval/meta_reference/eval\\.py$",
"^llama_stack/providers/inline/eval/meta_reference/eval\\.py$", "^src/llama_stack/providers/inline/inference/meta_reference/inference\\.py$",
"^llama_stack/providers/inline/inference/meta_reference/inference\\.py$", "^src/llama_stack/models/llama/llama3/generation\\.py$",
"^llama_stack/models/llama/llama3/generation\\.py$", "^src/llama_stack/models/llama/llama3/multimodal/model\\.py$",
"^llama_stack/models/llama/llama3/multimodal/model\\.py$", "^src/llama_stack/models/llama/llama4/",
"^llama_stack/models/llama/llama4/", "^src/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers\\.py$",
"^llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers\\.py$", "^src/llama_stack/providers/inline/post_training/common/validator\\.py$",
"^llama_stack/providers/inline/post_training/common/validator\\.py$", "^src/llama_stack/providers/inline/safety/code_scanner/",
"^llama_stack/providers/inline/safety/code_scanner/", "^src/llama_stack/providers/inline/safety/llama_guard/",
"^llama_stack/providers/inline/safety/llama_guard/", "^src/llama_stack/providers/inline/scoring/basic/",
"^llama_stack/providers/inline/scoring/basic/", "^src/llama_stack/providers/inline/scoring/braintrust/",
"^llama_stack/providers/inline/scoring/braintrust/", "^src/llama_stack/providers/inline/scoring/llm_as_judge/",
"^llama_stack/providers/inline/scoring/llm_as_judge/", "^src/llama_stack/providers/remote/agents/sample/",
"^llama_stack/providers/remote/agents/sample/", "^src/llama_stack/providers/remote/datasetio/huggingface/",
"^llama_stack/providers/remote/datasetio/huggingface/", "^src/llama_stack/providers/remote/datasetio/nvidia/",
"^llama_stack/providers/remote/datasetio/nvidia/", "^src/llama_stack/providers/remote/inference/bedrock/",
"^llama_stack/providers/remote/inference/bedrock/", "^src/llama_stack/providers/remote/inference/nvidia/",
"^llama_stack/providers/remote/inference/nvidia/", "^src/llama_stack/providers/remote/inference/passthrough/",
"^llama_stack/providers/remote/inference/passthrough/", "^src/llama_stack/providers/remote/inference/runpod/",
"^llama_stack/providers/remote/inference/runpod/", "^src/llama_stack/providers/remote/inference/tgi/",
"^llama_stack/providers/remote/inference/tgi/", "^src/llama_stack/providers/remote/inference/watsonx/",
"^llama_stack/providers/remote/inference/watsonx/", "^src/llama_stack/providers/remote/safety/bedrock/",
"^llama_stack/providers/remote/safety/bedrock/", "^src/llama_stack/providers/remote/safety/nvidia/",
"^llama_stack/providers/remote/safety/nvidia/", "^src/llama_stack/providers/remote/safety/sambanova/",
"^llama_stack/providers/remote/safety/sambanova/", "^src/llama_stack/providers/remote/safety/sample/",
"^llama_stack/providers/remote/safety/sample/", "^src/llama_stack/providers/remote/tool_runtime/bing_search/",
"^llama_stack/providers/remote/tool_runtime/bing_search/", "^src/llama_stack/providers/remote/tool_runtime/brave_search/",
"^llama_stack/providers/remote/tool_runtime/brave_search/", "^src/llama_stack/providers/remote/tool_runtime/model_context_protocol/",
"^llama_stack/providers/remote/tool_runtime/model_context_protocol/", "^src/llama_stack/providers/remote/tool_runtime/tavily_search/",
"^llama_stack/providers/remote/tool_runtime/tavily_search/", "^src/llama_stack/providers/remote/tool_runtime/wolfram_alpha/",
"^llama_stack/providers/remote/tool_runtime/wolfram_alpha/", "^src/llama_stack/providers/remote/post_training/nvidia/",
"^llama_stack/providers/remote/post_training/nvidia/", "^src/llama_stack/providers/remote/vector_io/chroma/",
"^llama_stack/providers/remote/vector_io/chroma/", "^src/llama_stack/providers/remote/vector_io/milvus/",
"^llama_stack/providers/remote/vector_io/milvus/", "^src/llama_stack/providers/remote/vector_io/pgvector/",
"^llama_stack/providers/remote/vector_io/pgvector/", "^src/llama_stack/providers/remote/vector_io/qdrant/",
"^llama_stack/providers/remote/vector_io/qdrant/", "^src/llama_stack/providers/remote/vector_io/sample/",
"^llama_stack/providers/remote/vector_io/sample/", "^src/llama_stack/providers/remote/vector_io/weaviate/",
"^llama_stack/providers/remote/vector_io/weaviate/", "^src/llama_stack/providers/utils/bedrock/client\\.py$",
"^llama_stack/providers/utils/bedrock/client\\.py$", "^src/llama_stack/providers/utils/bedrock/refreshable_boto_session\\.py$",
"^llama_stack/providers/utils/bedrock/refreshable_boto_session\\.py$", "^src/llama_stack/providers/utils/inference/embedding_mixin\\.py$",
"^llama_stack/providers/utils/inference/embedding_mixin\\.py$", "^src/llama_stack/providers/utils/inference/litellm_openai_mixin\\.py$",
"^llama_stack/providers/utils/inference/litellm_openai_mixin\\.py$", "^src/llama_stack/providers/utils/inference/model_registry\\.py$",
"^llama_stack/providers/utils/inference/model_registry\\.py$", "^src/llama_stack/providers/utils/inference/openai_compat\\.py$",
"^llama_stack/providers/utils/inference/openai_compat\\.py$", "^src/llama_stack/providers/utils/inference/prompt_adapter\\.py$",
"^llama_stack/providers/utils/inference/prompt_adapter\\.py$", "^src/llama_stack/providers/utils/kvstore/kvstore\\.py$",
"^llama_stack/providers/utils/kvstore/kvstore\\.py$", "^src/llama_stack/providers/utils/kvstore/postgres/postgres\\.py$",
"^llama_stack/providers/utils/kvstore/postgres/postgres\\.py$", "^src/llama_stack/providers/utils/kvstore/redis/redis\\.py$",
"^llama_stack/providers/utils/kvstore/redis/redis\\.py$", "^src/llama_stack/providers/utils/memory/vector_store\\.py$",
"^llama_stack/providers/utils/memory/vector_store\\.py$", "^src/llama_stack/providers/utils/scoring/aggregation_utils\\.py$",
"^llama_stack/providers/utils/scoring/aggregation_utils\\.py$", "^src/llama_stack/providers/utils/scoring/base_scoring_fn\\.py$",
"^llama_stack/providers/utils/scoring/base_scoring_fn\\.py$", "^src/llama_stack/providers/utils/telemetry/dataset_mixin\\.py$",
"^llama_stack/providers/utils/telemetry/dataset_mixin\\.py$", "^src/llama_stack/providers/utils/telemetry/trace_protocol\\.py$",
"^llama_stack/providers/utils/telemetry/trace_protocol\\.py$", "^src/llama_stack/providers/utils/telemetry/tracing\\.py$",
"^llama_stack/providers/utils/telemetry/tracing\\.py$", "^src/llama_stack/strong_typing/auxiliary\\.py$",
"^llama_stack/strong_typing/auxiliary\\.py$", "^src/llama_stack/distributions/template\\.py$",
"^llama_stack/distributions/template\\.py$",
] ]
[[tool.mypy.overrides]] [[tool.mypy.overrides]]
# packages that lack typing annotations, do not have stubs, or are unavailable. # packages that lack typing annotations, do not have stubs, or are unavailable.
module = ["yaml", "fire"] module = [
"yaml",
"fire",
"torchtune.*",
"fairscale.*",
"torchvision.*",
"datasets",
"nest_asyncio",
"streamlit_option_menu",
"lmformatenforcer.*",
]
ignore_missing_imports = true ignore_missing_imports = true
[tool.pydantic-mypy] [tool.pydantic-mypy]

View file

@ -16,7 +16,7 @@ if (( BASH_VERSINFO[0] < 4 )); then
exit 1 exit 1
fi fi
PACKAGE_DIR="${1:-llama_stack}" PACKAGE_DIR="${1:-src/llama_stack}"
if [ ! -d "$PACKAGE_DIR" ]; then if [ ! -d "$PACKAGE_DIR" ]; then
echo "ERROR: Package directory '$PACKAGE_DIR' does not exist" echo "ERROR: Package directory '$PACKAGE_DIR' does not exist"

View file

@ -55,7 +55,7 @@ def process_distro(distro_dir: Path, progress, change_tracker: ChangedPathTracke
if template_func := getattr(module, "get_distribution_template", None): if template_func := getattr(module, "get_distribution_template", None):
distro = template_func() distro = template_func()
yaml_output_dir = REPO_ROOT / "llama_stack" / "distributions" / distro.name yaml_output_dir = REPO_ROOT / "src" / "llama_stack" / "distributions" / distro.name
doc_output_dir = REPO_ROOT / "docs/docs/distributions" / f"{distro.distro_type}_distro" doc_output_dir = REPO_ROOT / "docs/docs/distributions" / f"{distro.distro_type}_distro"
change_tracker.add_paths(yaml_output_dir, doc_output_dir) change_tracker.add_paths(yaml_output_dir, doc_output_dir)
distro.save_distribution( distro.save_distribution(
@ -93,7 +93,7 @@ def pre_import_distros(distro_dirs: list[Path]) -> None:
def main(): def main():
distros_dir = REPO_ROOT / "llama_stack" / "distributions" distros_dir = REPO_ROOT / "src" / "llama_stack" / "distributions"
change_tracker = ChangedPathTracker() change_tracker = ChangedPathTracker()
with Progress( with Progress(

View file

@ -30,8 +30,10 @@ materialize_telemetry_configs() {
local otel_cfg="${dest}/otel-collector-config.yaml" local otel_cfg="${dest}/otel-collector-config.yaml"
local prom_cfg="${dest}/prometheus.yml" local prom_cfg="${dest}/prometheus.yml"
local graf_cfg="${dest}/grafana-datasources.yaml" local graf_cfg="${dest}/grafana-datasources.yaml"
local graf_dash_cfg="${dest}/grafana-dashboards.yaml"
local dash_json="${dest}/llama-stack-dashboard.json"
for asset in "$otel_cfg" "$prom_cfg" "$graf_cfg"; do for asset in "$otel_cfg" "$prom_cfg" "$graf_cfg" "$graf_dash_cfg" "$dash_json"; do
if [ -e "$asset" ]; then if [ -e "$asset" ]; then
die "Telemetry asset ${asset} already exists; refusing to overwrite" die "Telemetry asset ${asset} already exists; refusing to overwrite"
fi fi
@ -103,6 +105,7 @@ datasources:
type: prometheus type: prometheus
access: proxy access: proxy
url: http://prometheus:9090 url: http://prometheus:9090
uid: prometheus
isDefault: true isDefault: true
editable: true editable: true
@ -112,6 +115,224 @@ datasources:
url: http://jaeger:16686 url: http://jaeger:16686
editable: true editable: true
EOF EOF
cat <<'EOF' > "$graf_dash_cfg"
apiVersion: 1
providers:
- name: 'Llama Stack'
orgId: 1
folder: ''
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /etc/grafana/provisioning/dashboards
EOF
# Copy the dashboard JSON inline to avoid line-length issues
cat > "$dash_json" <<'DASHBOARD_JSON'
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{"color": "green", "value": null}]
}
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
"id": 1,
"options": {
"legend": {"calcs": [], "displayMode": "table", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "none"}
},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "llama_stack_completion_tokens_total",
"legendFormat": "{{model_id}} ({{provider_id}})",
"refId": "A"
}
],
"title": "Completion Tokens",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
}
},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"id": 2,
"options": {
"legend": {"calcs": [], "displayMode": "table", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "none"}
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "llama_stack_prompt_tokens_total", "legendFormat": "Prompt - {{model_id}}", "refId": "A"},
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "llama_stack_tokens_total", "legendFormat": "Total - {{model_id}}", "refId": "B"}
],
"title": "Prompt & Total Tokens",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "ms"
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"id": 3,
"options": {
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "none"}
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "histogram_quantile(0.95, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))", "legendFormat": "p95", "refId": "A"},
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "histogram_quantile(0.99, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))", "legendFormat": "p99", "refId": "B"}
],
"title": "HTTP Request Duration (p95, p99)",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
}
},
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 8},
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
"textMode": "auto"
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "sum(llama_stack_http_server_duration_milliseconds_count)", "refId": "A"}
],
"title": "Total Requests",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
}
},
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 8},
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
"textMode": "auto"
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "sum(llama_stack_http_server_active_requests)", "refId": "A"}
],
"title": "Active Requests",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "reqps"
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
"id": 6,
"options": {
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "none"}
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "rate(llama_stack_http_server_duration_milliseconds_count[5m])", "legendFormat": "{{http_target}} - {{http_status_code}}", "refId": "A"}
],
"title": "Request Rate",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
"unit": "Bps"
}
},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
"id": 7,
"options": {
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
"tooltip": {"mode": "multi", "sort": "none"}
},
"targets": [
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "rate(llama_stack_http_server_request_size_bytes_sum[5m])", "legendFormat": "Request", "refId": "A"},
{"datasource": {"type": "prometheus", "uid": "$(DS_PROMETHEUS}"}, "expr": "rate(llama_stack_http_server_response_size_bytes_sum[5m])", "legendFormat": "Response", "refId": "B"}
],
"title": "Request/Response Sizes",
"type": "timeseries"
}
],
"refresh": "5s",
"schemaVersion": 38,
"tags": ["llama-stack"],
"templating": {"list": []},
"time": {"from": "now-15m", "to": "now"},
"timepicker": {},
"timezone": "browser",
"title": "Llama Stack Metrics",
"uid": "llama-stack-metrics",
"version": 0,
"weekStart": ""
}
DASHBOARD_JSON
} }
# Cleanup function to remove temporary files # Cleanup function to remove temporary files
@ -372,6 +593,8 @@ if [ "$WITH_TELEMETRY" = true ]; then
-e GF_SECURITY_ADMIN_PASSWORD=admin \ -e GF_SECURITY_ADMIN_PASSWORD=admin \
-e GF_USERS_ALLOW_SIGN_UP=false \ -e GF_USERS_ALLOW_SIGN_UP=false \
-v "${TELEMETRY_ASSETS_DIR}/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \ -v "${TELEMETRY_ASSETS_DIR}/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \
-v "${TELEMETRY_ASSETS_DIR}/grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:Z" \
-v "${TELEMETRY_ASSETS_DIR}/llama-stack-dashboard.json:/etc/grafana/provisioning/dashboards/llama-stack-dashboard.json:Z" \
docker.io/grafana/grafana:11.0.0 > /dev/null 2>&1; then docker.io/grafana/grafana:11.0.0 > /dev/null 2>&1; then
die "Grafana startup failed" die "Grafana startup failed"
fi fi

View file

@ -208,6 +208,15 @@ if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
echo "=== Starting Llama Stack Server ===" echo "=== Starting Llama Stack Server ==="
export LLAMA_STACK_LOG_WIDTH=120 export LLAMA_STACK_LOG_WIDTH=120
# Configure telemetry collector for server mode
# Use a fixed port for the OTEL collector so the server can connect to it
COLLECTOR_PORT=4317
export LLAMA_STACK_TEST_COLLECTOR_PORT="${COLLECTOR_PORT}"
export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:${COLLECTOR_PORT}"
export OTEL_EXPORTER_OTLP_PROTOCOL="http/protobuf"
export OTEL_BSP_SCHEDULE_DELAY="200"
export OTEL_BSP_EXPORT_TIMEOUT="2000"
# remove "server:" from STACK_CONFIG # remove "server:" from STACK_CONFIG
stack_config=$(echo "$STACK_CONFIG" | sed 's/^server://') stack_config=$(echo "$STACK_CONFIG" | sed 's/^server://')
nohup llama stack run $stack_config > server.log 2>&1 & nohup llama stack run $stack_config > server.log 2>&1 &
@ -284,10 +293,15 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
docker stop "$container_name" 2>/dev/null || true docker stop "$container_name" 2>/dev/null || true
docker rm "$container_name" 2>/dev/null || true docker rm "$container_name" 2>/dev/null || true
# Configure telemetry collector port shared between host and container
COLLECTOR_PORT=4317
export LLAMA_STACK_TEST_COLLECTOR_PORT="${COLLECTOR_PORT}"
# Build environment variables for docker run # Build environment variables for docker run
DOCKER_ENV_VARS="" DOCKER_ENV_VARS=""
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_INFERENCE_MODE=$INFERENCE_MODE" DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_INFERENCE_MODE=$INFERENCE_MODE"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_STACK_CONFIG_TYPE=server" DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_STACK_CONFIG_TYPE=server"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:${COLLECTOR_PORT}"
# Pass through API keys if they exist # Pass through API keys if they exist
[ -n "${TOGETHER_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e TOGETHER_API_KEY=$TOGETHER_API_KEY" [ -n "${TOGETHER_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e TOGETHER_API_KEY=$TOGETHER_API_KEY"
@ -308,8 +322,20 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
fi fi
echo "Using image: $IMAGE_NAME" echo "Using image: $IMAGE_NAME"
docker run -d --network host --name "$container_name" \ # On macOS/Darwin, --network host doesn't work as expected due to Docker running in a VM
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ # Use regular port mapping instead
NETWORK_MODE=""
PORT_MAPPINGS=""
if [[ "$(uname)" != "Darwin" ]] && [[ "$(uname)" != *"MINGW"* ]]; then
NETWORK_MODE="--network host"
else
# On non-Linux (macOS, Windows), need explicit port mappings for both app and telemetry
PORT_MAPPINGS="-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT -p $COLLECTOR_PORT:$COLLECTOR_PORT"
echo "Using bridge networking with port mapping (non-Linux)"
fi
docker run -d $NETWORK_MODE --name "$container_name" \
$PORT_MAPPINGS \
$DOCKER_ENV_VARS \ $DOCKER_ENV_VARS \
"$IMAGE_NAME" \ "$IMAGE_NAME" \
--port $LLAMA_STACK_PORT --port $LLAMA_STACK_PORT

View file

@ -6,7 +6,7 @@
# the root directory of this source tree. # the root directory of this source tree.
set -e set -e
cd llama_stack/ui cd src/llama_stack/ui
if [ ! -d node_modules ] || [ ! -x node_modules/.bin/prettier ] || [ ! -x node_modules/.bin/eslint ]; then if [ ! -d node_modules ] || [ ! -x node_modules/.bin/prettier ] || [ ! -x node_modules/.bin/eslint ]; then
echo "UI dependencies not installed, skipping prettier/linter check" echo "UI dependencies not installed, skipping prettier/linter check"

View file

@ -0,0 +1,12 @@
apiVersion: 1
providers:
- name: 'Llama Stack'
orgId: 1
folder: ''
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /etc/grafana/provisioning/dashboards

View file

@ -5,6 +5,7 @@ datasources:
type: prometheus type: prometheus
access: proxy access: proxy
url: http://prometheus:9090 url: http://prometheus:9090
uid: prometheus
isDefault: true isDefault: true
editable: true editable: true

View file

@ -0,0 +1,457 @@
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "llama_stack_completion_tokens_total",
"legendFormat": "{{model_id}} ({{provider_id}})",
"refId": "A"
}
],
"title": "Completion Tokens",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"legend": {
"calcs": [],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "llama_stack_prompt_tokens_total",
"legendFormat": "Prompt - {{model_id}}",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "llama_stack_tokens_total",
"legendFormat": "Total - {{model_id}}",
"refId": "B"
}
],
"title": "Prompt & Total Tokens",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "ms"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 3,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.95, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))",
"legendFormat": "p95",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.99, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))",
"legendFormat": "p99",
"refId": "B"
}
],
"title": "HTTP Request Duration (p95, p99)",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
}
},
"gridPos": {
"h": 8,
"w": 6,
"x": 12,
"y": 8
},
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(llama_stack_http_server_duration_milliseconds_count)",
"refId": "A"
}
],
"title": "Total Requests",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
}
}
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 8
},
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(llama_stack_http_server_active_requests)",
"refId": "A"
}
],
"title": "Active Requests",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "reqps"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 6,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(llama_stack_http_server_duration_milliseconds_count[5m])",
"legendFormat": "{{http_target}} - {{http_status_code}}",
"refId": "A"
}
],
"title": "Request Rate",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"showPoints": "auto",
"fillOpacity": 10
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "Bps"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 7,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(llama_stack_http_server_request_size_bytes_sum[5m])",
"legendFormat": "Request",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(llama_stack_http_server_response_size_bytes_sum[5m])",
"legendFormat": "Response",
"refId": "B"
}
],
"title": "Request/Response Sizes",
"type": "timeseries"
}
],
"refresh": "5s",
"schemaVersion": 38,
"tags": [
"llama-stack"
],
"templating": {
"list": []
},
"time": {
"from": "now-15m",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "Llama Stack Metrics",
"uid": "llama-stack-metrics",
"version": 0,
"weekStart": ""
}

View file

@ -135,6 +135,8 @@ $CONTAINER_RUNTIME run -d --name grafana \
-e GF_SECURITY_ADMIN_PASSWORD=admin \ -e GF_SECURITY_ADMIN_PASSWORD=admin \
-e GF_USERS_ALLOW_SIGN_UP=false \ -e GF_USERS_ALLOW_SIGN_UP=false \
-v "$SCRIPT_DIR/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \ -v "$SCRIPT_DIR/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \
-v "$SCRIPT_DIR/grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:Z" \
-v "$SCRIPT_DIR/llama-stack-dashboard.json:/etc/grafana/provisioning/dashboards/llama-stack-dashboard.json:Z" \
docker.io/grafana/grafana:11.0.0 docker.io/grafana/grafana:11.0.0
# Wait for services to start # Wait for services to start

View file

@ -27,4 +27,4 @@ fi
# Run unit tests with coverage # Run unit tests with coverage
uv run --python "$PYTHON_VERSION" --with-editable . --group unit \ uv run --python "$PYTHON_VERSION" --with-editable . --group unit \
coverage run --source=llama_stack -m pytest -s -v tests/unit/ "$@" coverage run --source=src/llama_stack -m pytest -s -v tests/unit/ "$@"

View file

@ -38,6 +38,7 @@ from .openai_responses import (
OpenAIResponseInputTool, OpenAIResponseInputTool,
OpenAIResponseObject, OpenAIResponseObject,
OpenAIResponseObjectStream, OpenAIResponseObjectStream,
OpenAIResponsePrompt,
OpenAIResponseText, OpenAIResponseText,
) )
@ -149,13 +150,13 @@ class ShieldCallStep(StepCommon):
class MemoryRetrievalStep(StepCommon): class MemoryRetrievalStep(StepCommon):
"""A memory retrieval step in an agent turn. """A memory retrieval step in an agent turn.
:param vector_db_ids: The IDs of the vector databases to retrieve context from. :param vector_store_ids: The IDs of the vector databases to retrieve context from.
:param inserted_context: The context retrieved from the vector databases. :param inserted_context: The context retrieved from the vector databases.
""" """
step_type: Literal[StepType.memory_retrieval] = StepType.memory_retrieval step_type: Literal[StepType.memory_retrieval] = StepType.memory_retrieval
# TODO: should this be List[str]? # TODO: should this be List[str]?
vector_db_ids: str vector_store_ids: str
inserted_context: InterleavedContent inserted_context: InterleavedContent
@ -810,6 +811,7 @@ class Agents(Protocol):
self, self,
input: str | list[OpenAIResponseInput], input: str | list[OpenAIResponseInput],
model: str, model: str,
prompt: OpenAIResponsePrompt | None = None,
instructions: str | None = None, instructions: str | None = None,
previous_response_id: str | None = None, previous_response_id: str | None = None,
conversation: str | None = None, conversation: str | None = None,
@ -831,6 +833,7 @@ class Agents(Protocol):
:param input: Input message(s) to create the response. :param input: Input message(s) to create the response.
:param model: The underlying LLM used for completions. :param model: The underlying LLM used for completions.
:param prompt: (Optional) Prompt object with ID, version, and variables.
:param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses. :param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
:param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation. :param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
:param include: (Optional) Additional fields to include in the response. :param include: (Optional) Additional fields to include in the response.

View file

@ -4,9 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from collections.abc import Sequence
from typing import Annotated, Any, Literal from typing import Annotated, Any, Literal
from pydantic import BaseModel, Field from pydantic import BaseModel, Field, model_validator
from typing_extensions import TypedDict from typing_extensions import TypedDict
from llama_stack.apis.vector_io import SearchRankingOptions as FileSearchRankingOptions from llama_stack.apis.vector_io import SearchRankingOptions as FileSearchRankingOptions
@ -46,23 +47,66 @@ class OpenAIResponseInputMessageContentImage(BaseModel):
:param detail: Level of detail for image processing, can be "low", "high", or "auto" :param detail: Level of detail for image processing, can be "low", "high", or "auto"
:param type: Content type identifier, always "input_image" :param type: Content type identifier, always "input_image"
:param file_id: (Optional) The ID of the file to be sent to the model.
:param image_url: (Optional) URL of the image content :param image_url: (Optional) URL of the image content
""" """
detail: Literal["low"] | Literal["high"] | Literal["auto"] = "auto" detail: Literal["low"] | Literal["high"] | Literal["auto"] = "auto"
type: Literal["input_image"] = "input_image" type: Literal["input_image"] = "input_image"
# TODO: handle file_id file_id: str | None = None
image_url: str | None = None image_url: str | None = None
# TODO: handle file content types @json_schema_type
class OpenAIResponseInputMessageContentFile(BaseModel):
"""File content for input messages in OpenAI response format.
:param type: The type of the input item. Always `input_file`.
:param file_data: The data of the file to be sent to the model.
:param file_id: (Optional) The ID of the file to be sent to the model.
:param file_url: The URL of the file to be sent to the model.
:param filename: The name of the file to be sent to the model.
"""
type: Literal["input_file"] = "input_file"
file_data: str | None = None
file_id: str | None = None
file_url: str | None = None
filename: str | None = None
@model_validator(mode="after")
def validate_file_source(self) -> "OpenAIResponseInputMessageContentFile":
if not any([self.file_data, self.file_id, self.file_url, self.filename]):
raise ValueError(
"At least one of 'file_data', 'file_id', 'file_url', or 'filename' must be provided for file content"
)
return self
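A minimal sketch of how this validator behaves (pydantic wraps the ValueError in a ValidationError; the file id is a made-up value):
from pydantic import ValidationError

# Valid: at least one file source is supplied.
ok = OpenAIResponseInputMessageContentFile(file_id="file-abc123")

# Invalid: no file_data, file_id, file_url, or filename at all.
try:
    OpenAIResponseInputMessageContentFile()
except ValidationError as err:
    print(err.errors()[0]["msg"])  # mentions that at least one source must be provided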
OpenAIResponseInputMessageContent = Annotated[ OpenAIResponseInputMessageContent = Annotated[
OpenAIResponseInputMessageContentText | OpenAIResponseInputMessageContentImage, OpenAIResponseInputMessageContentText
| OpenAIResponseInputMessageContentImage
| OpenAIResponseInputMessageContentFile,
Field(discriminator="type"), Field(discriminator="type"),
] ]
register_schema(OpenAIResponseInputMessageContent, name="OpenAIResponseInputMessageContent") register_schema(OpenAIResponseInputMessageContent, name="OpenAIResponseInputMessageContent")
@json_schema_type
class OpenAIResponsePrompt(BaseModel):
"""OpenAI compatible Prompt object that is used in OpenAI responses.
:param id: Unique identifier of the prompt template
:param variables: Dictionary mapping variable names to OpenAIResponseInputMessageContent structures for template substitution. Substitution values can be strings or other Response input types
such as images or files.
:param version: Version number of the prompt to use (defaults to latest if not specified)
"""
id: str
variables: dict[str, OpenAIResponseInputMessageContent] | None = None
version: str | None = None
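An illustrative construction of the prompt reference defined above; the id, version, and variable name are hypothetical:
prompt = OpenAIResponsePrompt(
    id="pmpt_weekly_summary",  # hypothetical prompt template id
    version="2",
    variables={
        # Any OpenAIResponseInputMessageContent works as a substitution value.
        "attachment": OpenAIResponseInputMessageContentFile(file_id="file-abc123"),
    },
)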
@json_schema_type @json_schema_type
class OpenAIResponseAnnotationFileCitation(BaseModel): class OpenAIResponseAnnotationFileCitation(BaseModel):
"""File citation annotation for referencing specific files in response content. """File citation annotation for referencing specific files in response content.
@ -159,7 +203,7 @@ class OpenAIResponseMessage(BaseModel):
scenarios. scenarios.
""" """
content: str | list[OpenAIResponseInputMessageContent] | list[OpenAIResponseOutputMessageContent] content: str | Sequence[OpenAIResponseInputMessageContent] | Sequence[OpenAIResponseOutputMessageContent]
role: Literal["system"] | Literal["developer"] | Literal["user"] | Literal["assistant"] role: Literal["system"] | Literal["developer"] | Literal["user"] | Literal["assistant"]
type: Literal["message"] = "message" type: Literal["message"] = "message"
@ -211,10 +255,10 @@ class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
""" """
id: str id: str
queries: list[str] queries: Sequence[str]
status: str status: str
type: Literal["file_search_call"] = "file_search_call" type: Literal["file_search_call"] = "file_search_call"
results: list[OpenAIResponseOutputMessageFileSearchToolCallResults] | None = None results: Sequence[OpenAIResponseOutputMessageFileSearchToolCallResults] | None = None
@json_schema_type @json_schema_type
@ -538,6 +582,7 @@ class OpenAIResponseObject(BaseModel):
:param output: List of generated output items (messages, tool calls, etc.) :param output: List of generated output items (messages, tool calls, etc.)
:param parallel_tool_calls: Whether tool calls can be executed in parallel :param parallel_tool_calls: Whether tool calls can be executed in parallel
:param previous_response_id: (Optional) ID of the previous response in a conversation :param previous_response_id: (Optional) ID of the previous response in a conversation
:param prompt: (Optional) Reference to a prompt template and its variables.
:param status: Current status of the response generation :param status: Current status of the response generation
:param temperature: (Optional) Sampling temperature used for generation :param temperature: (Optional) Sampling temperature used for generation
:param text: Text formatting configuration for the response :param text: Text formatting configuration for the response
@ -553,16 +598,17 @@ class OpenAIResponseObject(BaseModel):
id: str id: str
model: str model: str
object: Literal["response"] = "response" object: Literal["response"] = "response"
output: list[OpenAIResponseOutput] output: Sequence[OpenAIResponseOutput]
parallel_tool_calls: bool = False parallel_tool_calls: bool = False
previous_response_id: str | None = None previous_response_id: str | None = None
prompt: OpenAIResponsePrompt | None = None
status: str status: str
temperature: float | None = None temperature: float | None = None
# Default to text format to avoid breaking the loading of old responses # Default to text format to avoid breaking the loading of old responses
# before the field was added. New responses will have this set always. # before the field was added. New responses will have this set always.
text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text"))
top_p: float | None = None top_p: float | None = None
tools: list[OpenAIResponseTool] | None = None tools: Sequence[OpenAIResponseTool] | None = None
truncation: str | None = None truncation: str | None = None
usage: OpenAIResponseUsage | None = None usage: OpenAIResponseUsage | None = None
instructions: str | None = None instructions: str | None = None
@ -1254,14 +1300,9 @@ class OpenAIResponseInputFunctionToolCallOutput(BaseModel):
OpenAIResponseInput = Annotated[ OpenAIResponseInput = Annotated[
# Responses API allows output messages to be passed in as input # Responses API allows output messages to be passed in as input
OpenAIResponseOutputMessageWebSearchToolCall OpenAIResponseOutput
| OpenAIResponseOutputMessageFileSearchToolCall
| OpenAIResponseOutputMessageFunctionToolCall
| OpenAIResponseInputFunctionToolCallOutput | OpenAIResponseInputFunctionToolCallOutput
| OpenAIResponseMCPApprovalRequest
| OpenAIResponseMCPApprovalResponse | OpenAIResponseMCPApprovalResponse
| OpenAIResponseOutputMessageMCPCall
| OpenAIResponseOutputMessageMCPListTools
| OpenAIResponseMessage, | OpenAIResponseMessage,
Field(union_mode="left_to_right"), Field(union_mode="left_to_right"),
] ]
@ -1275,7 +1316,7 @@ class ListOpenAIResponseInputItem(BaseModel):
:param object: Object type identifier, always "list" :param object: Object type identifier, always "list"
""" """
data: list[OpenAIResponseInput] data: Sequence[OpenAIResponseInput]
object: Literal["list"] = "list" object: Literal["list"] = "list"
@ -1286,7 +1327,7 @@ class OpenAIResponseObjectWithInput(OpenAIResponseObject):
:param input: List of input items that led to this response :param input: List of input items that led to this response
""" """
input: list[OpenAIResponseInput] input: Sequence[OpenAIResponseInput]
def to_response_object(self) -> OpenAIResponseObject: def to_response_object(self) -> OpenAIResponseObject:
"""Convert to OpenAIResponseObject by excluding input field.""" """Convert to OpenAIResponseObject by excluding input field."""
@ -1304,7 +1345,7 @@ class ListOpenAIResponseObject(BaseModel):
:param object: Object type identifier, always "list" :param object: Object type identifier, always "list"
""" """
data: list[OpenAIResponseObjectWithInput] data: Sequence[OpenAIResponseObjectWithInput]
has_more: bool has_more: bool
first_id: str first_id: str
last_id: str last_id: str

View file

@ -21,8 +21,8 @@ from typing_extensions import TypedDict
from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
from llama_stack.apis.common.responses import Order from llama_stack.apis.common.responses import Order
from llama_stack.apis.models import Model from llama_stack.apis.models import Model
from llama_stack.apis.telemetry import MetricResponseMixin
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
from llama_stack.core.telemetry.telemetry import MetricResponseMixin
from llama_stack.core.telemetry.trace_protocol import trace_protocol from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.models.llama.datatypes import ( from llama_stack.models.llama.datatypes import (
BuiltinTool, BuiltinTool,
@ -97,7 +97,7 @@ class SamplingParams(BaseModel):
strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy) strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy)
max_tokens: int | None = 0 max_tokens: int | None = None
repetition_penalty: float | None = 1.0 repetition_penalty: float | None = 1.0
stop: list[str] | None = None stop: list[str] | None = None

Some files were not shown because too many files have changed in this diff.