From a8aa815b6a194fad76e4e5f7e73faf588b5d0e01 Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe
Date: Tue, 9 Sep 2025 15:50:56 -0700
Subject: [PATCH 01/15] feat(tests): migrate to global "setups" system for test configuration (#3390)

This PR refactors the integration test system to use global "setups", which
provides a better separation of concerns: **suites = what to test, setups =
how to configure.**

NOTE: if you have naming suggestions, please provide feedback

Changes:
- New `tests/integration/setups.py` with global, reusable configurations
  (ollama, vllm, gpt, claude)
- Modified `scripts/integration-tests.sh` options to match the underlying
  pytest options
- Updated documentation to reflect the new global setup system

The main benefit is that setups can be reused across multiple suites (e.g.,
use "gpt" with any suite), even though sometimes they may be specifically
tailored for a suite (vision <> ollama-vision). It is now easier to add new
configurations without modifying existing suites.

Usage examples:
- `pytest tests/integration --suite=responses --setup=gpt`
- `pytest tests/integration --suite=vision` # auto-selects "ollama-vision" setup
- `pytest tests/integration --suite=base --setup=vllm`
---
 .../actions/run-and-record-tests/action.yml   |  42 +++---
 .github/actions/setup-ollama/action.yml       |   4 +-
 .../actions/setup-test-environment/action.yml |  14 +-
 .github/workflows/integration-tests.yml       |  20 +--
 .../workflows/record-integration-tests.yml    |  32 ++---
 scripts/get_setup_env.py                      |  71 ++++++++++
 scripts/github/schedule-record-workflow.sh    |  57 ++++----
 scripts/integration-tests.sh                  | 106 ++++++++-------
 tests/integration/README.md                   |  48 ++++---
 tests/integration/conftest.py                 |  71 ++++++----
 tests/integration/suites.py                   | 126 +++++++++++++-----
 11 files changed, 385 insertions(+), 206 deletions(-)
 create mode 100644 scripts/get_setup_env.py

diff --git a/.github/actions/run-and-record-tests/action.yml b/.github/actions/run-and-record-tests/action.yml
index 7f028b104..a3eb31d9f 100644
--- a/.github/actions/run-and-record-tests/action.yml
+++ b/.github/actions/run-and-record-tests/action.yml
@@ -5,21 +5,22 @@ inputs:
   stack-config:
     description: 'Stack configuration to use'
     required: true
-  provider:
-    description: 'Provider to use for tests'
-    required: true
+  setup:
+    description: 'Setup to use for tests (e.g., ollama, gpt, vllm)'
+    required: false
+    default: ''
   inference-mode:
     description: 'Inference mode (record or replay)'
     required: true
-  test-suite:
+  suite:
     description: 'Test suite to use: base, responses, vision, etc.'
required: false default: '' - test-subdirs: - description: 'Comma-separated list of test subdirectories to run; overrides test-suite' + subdirs: + description: 'Comma-separated list of test subdirectories to run; overrides suite' required: false default: '' - test-pattern: + pattern: description: 'Regex pattern to pass to pytest -k' required: false default: '' @@ -37,14 +38,23 @@ runs: - name: Run Integration Tests shell: bash run: | - uv run --no-sync ./scripts/integration-tests.sh \ - --stack-config '${{ inputs.stack-config }}' \ - --provider '${{ inputs.provider }}' \ - --test-subdirs '${{ inputs.test-subdirs }}' \ - --test-pattern '${{ inputs.test-pattern }}' \ - --inference-mode '${{ inputs.inference-mode }}' \ - --test-suite '${{ inputs.test-suite }}' \ - | tee pytest-${{ inputs.inference-mode }}.log + SCRIPT_ARGS="--stack-config ${{ inputs.stack-config }} --inference-mode ${{ inputs.inference-mode }}" + + # Add optional arguments only if they are provided + if [ -n '${{ inputs.setup }}' ]; then + SCRIPT_ARGS="$SCRIPT_ARGS --setup ${{ inputs.setup }}" + fi + if [ -n '${{ inputs.suite }}' ]; then + SCRIPT_ARGS="$SCRIPT_ARGS --suite ${{ inputs.suite }}" + fi + if [ -n '${{ inputs.subdirs }}' ]; then + SCRIPT_ARGS="$SCRIPT_ARGS --subdirs ${{ inputs.subdirs }}" + fi + if [ -n '${{ inputs.pattern }}' ]; then + SCRIPT_ARGS="$SCRIPT_ARGS --pattern ${{ inputs.pattern }}" + fi + + uv run --no-sync ./scripts/integration-tests.sh $SCRIPT_ARGS | tee pytest-${{ inputs.inference-mode }}.log - name: Commit and push recordings @@ -58,7 +68,7 @@ runs: echo "New recordings detected, committing and pushing" git add tests/integration/recordings/ - git commit -m "Recordings update from CI (test-suite: ${{ inputs.test-suite }})" + git commit -m "Recordings update from CI (suite: ${{ inputs.suite }})" git fetch origin ${{ github.ref_name }} git rebase origin/${{ github.ref_name }} echo "Rebased successfully" diff --git a/.github/actions/setup-ollama/action.yml b/.github/actions/setup-ollama/action.yml index dc2f87e8c..5c95d131d 100644 --- a/.github/actions/setup-ollama/action.yml +++ b/.github/actions/setup-ollama/action.yml @@ -1,7 +1,7 @@ name: Setup Ollama description: Start Ollama inputs: - test-suite: + suite: description: 'Test suite to use: base, responses, vision, etc.' required: false default: '' @@ -11,7 +11,7 @@ runs: - name: Start Ollama shell: bash run: | - if [ "${{ inputs.test-suite }}" == "vision" ]; then + if [ "${{ inputs.suite }}" == "vision" ]; then image="ollama-with-vision-model" else image="ollama-with-models" diff --git a/.github/actions/setup-test-environment/action.yml b/.github/actions/setup-test-environment/action.yml index 3be76f009..478e8f598 100644 --- a/.github/actions/setup-test-environment/action.yml +++ b/.github/actions/setup-test-environment/action.yml @@ -8,11 +8,11 @@ inputs: client-version: description: 'Client version (latest or published)' required: true - provider: - description: 'Provider to setup (ollama or vllm)' - required: true + setup: + description: 'Setup to configure (ollama, vllm, gpt, etc.)' + required: false default: 'ollama' - test-suite: + suite: description: 'Test suite to use: base, responses, vision, etc.' 
required: false default: '' @@ -30,13 +30,13 @@ runs: client-version: ${{ inputs.client-version }} - name: Setup ollama - if: ${{ inputs.provider == 'ollama' && inputs.inference-mode == 'record' }} + if: ${{ (inputs.setup == 'ollama' || inputs.setup == 'ollama-vision') && inputs.inference-mode == 'record' }} uses: ./.github/actions/setup-ollama with: - test-suite: ${{ inputs.test-suite }} + suite: ${{ inputs.suite }} - name: Setup vllm - if: ${{ inputs.provider == 'vllm' && inputs.inference-mode == 'record' }} + if: ${{ inputs.setup == 'vllm' && inputs.inference-mode == 'record' }} uses: ./.github/actions/setup-vllm - name: Build Llama Stack diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index bb53eea2f..711eccd9e 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -28,8 +28,8 @@ on: description: 'Test against both the latest and published versions' type: boolean default: false - test-provider: - description: 'Test against a specific provider' + test-setup: + description: 'Test against a specific setup' type: string default: 'ollama' @@ -42,18 +42,18 @@ jobs: run-replay-mode-tests: runs-on: ubuntu-latest - name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.test-suite) }} + name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.setup, matrix.python-version, matrix.client-version, matrix.suite) }} strategy: fail-fast: false matrix: client-type: [library, server] - # Use vllm on weekly schedule, otherwise use test-provider input (defaults to ollama) - provider: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-provider || 'ollama')) }} + # Use vllm on weekly schedule, otherwise use test-setup input (defaults to ollama) + setup: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-setup || 'ollama')) }} # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12 python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }} client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }} - test-suite: [base, vision] + suite: [base, vision] steps: - name: Checkout repository @@ -64,14 +64,14 @@ jobs: with: python-version: ${{ matrix.python-version }} client-version: ${{ matrix.client-version }} - provider: ${{ matrix.provider }} - test-suite: ${{ matrix.test-suite }} + setup: ${{ matrix.setup }} + suite: ${{ matrix.suite }} inference-mode: 'replay' - name: Run tests uses: ./.github/actions/run-and-record-tests with: stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }} - provider: ${{ matrix.provider }} + setup: ${{ matrix.setup }} inference-mode: 'replay' - test-suite: ${{ matrix.test-suite }} + suite: ${{ matrix.suite }} diff --git a/.github/workflows/record-integration-tests.yml b/.github/workflows/record-integration-tests.yml index 01797a54b..65a04f125 100644 --- a/.github/workflows/record-integration-tests.yml +++ b/.github/workflows/record-integration-tests.yml @@ -10,19 +10,19 @@ run-name: Run the integration test suite from tests/integration on: workflow_dispatch: 
inputs: - test-provider: - description: 'Test against a specific provider' + test-setup: + description: 'Test against a specific setup' type: string default: 'ollama' - test-suite: + suite: description: 'Test suite to use: base, responses, vision, etc.' type: string default: '' - test-subdirs: - description: 'Comma-separated list of test subdirectories to run; overrides test-suite' + subdirs: + description: 'Comma-separated list of test subdirectories to run; overrides suite' type: string default: '' - test-pattern: + pattern: description: 'Regex pattern to pass to pytest -k' type: string default: '' @@ -39,10 +39,10 @@ jobs: run: | echo "::group::Workflow Inputs" echo "branch: ${{ github.ref_name }}" - echo "test-provider: ${{ inputs.test-provider }}" - echo "test-suite: ${{ inputs.test-suite }}" - echo "test-subdirs: ${{ inputs.test-subdirs }}" - echo "test-pattern: ${{ inputs.test-pattern }}" + echo "test-setup: ${{ inputs.test-setup }}" + echo "suite: ${{ inputs.suite }}" + echo "subdirs: ${{ inputs.subdirs }}" + echo "pattern: ${{ inputs.pattern }}" echo "::endgroup::" - name: Checkout repository @@ -55,16 +55,16 @@ jobs: with: python-version: "3.12" # Use single Python version for recording client-version: "latest" - provider: ${{ inputs.test-provider || 'ollama' }} - test-suite: ${{ inputs.test-suite }} + setup: ${{ inputs.test-setup || 'ollama' }} + suite: ${{ inputs.suite }} inference-mode: 'record' - name: Run and record tests uses: ./.github/actions/run-and-record-tests with: stack-config: 'server:ci-tests' # recording must be done with server since more tests are run - provider: ${{ inputs.test-provider || 'ollama' }} + setup: ${{ inputs.test-setup || 'ollama' }} inference-mode: 'record' - test-suite: ${{ inputs.test-suite }} - test-subdirs: ${{ inputs.test-subdirs }} - test-pattern: ${{ inputs.test-pattern }} + suite: ${{ inputs.suite }} + subdirs: ${{ inputs.subdirs }} + pattern: ${{ inputs.pattern }} diff --git a/scripts/get_setup_env.py b/scripts/get_setup_env.py new file mode 100644 index 000000000..fad601e76 --- /dev/null +++ b/scripts/get_setup_env.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +""" +Small helper script to extract environment variables from a test setup. +Used by integration-tests.sh to set environment variables before starting the server. +""" + +import argparse +import sys + +from tests.integration.suites import SETUP_DEFINITIONS, SUITE_DEFINITIONS + + +def get_setup_env_vars(setup_name, suite_name=None): + """ + Get environment variables for a setup, with optional suite default fallback. + + Args: + setup_name: Name of the setup (e.g., 'ollama', 'gpt') + suite_name: Optional suite name to get default setup if setup_name is None + + Returns: + Dictionary of environment variables + """ + # If no setup specified, try to get default from suite + if not setup_name and suite_name: + suite = SUITE_DEFINITIONS.get(suite_name) + if suite and suite.default_setup: + setup_name = suite.default_setup + + if not setup_name: + return {} + + setup = SETUP_DEFINITIONS.get(setup_name) + if not setup: + print( + f"Error: Unknown setup '{setup_name}'. 
Available: {', '.join(sorted(SETUP_DEFINITIONS.keys()))}", + file=sys.stderr, + ) + sys.exit(1) + + return setup.env + + +def main(): + parser = argparse.ArgumentParser(description="Extract environment variables from a test setup") + parser.add_argument("--setup", help="Setup name (e.g., ollama, gpt)") + parser.add_argument("--suite", help="Suite name to get default setup from if --setup not provided") + parser.add_argument("--format", choices=["bash", "json"], default="bash", help="Output format (default: bash)") + + args = parser.parse_args() + + env_vars = get_setup_env_vars(args.setup, args.suite) + + if args.format == "bash": + # Output as bash export statements + for key, value in env_vars.items(): + print(f"export {key}='{value}'") + elif args.format == "json": + import json + + print(json.dumps(env_vars)) + + +if __name__ == "__main__": + main() diff --git a/scripts/github/schedule-record-workflow.sh b/scripts/github/schedule-record-workflow.sh index 09e055611..c292e53e6 100755 --- a/scripts/github/schedule-record-workflow.sh +++ b/scripts/github/schedule-record-workflow.sh @@ -14,7 +14,7 @@ set -euo pipefail # Default values BRANCH="" TEST_SUBDIRS="" -TEST_PROVIDER="ollama" +TEST_SETUP="ollama" TEST_SUITE="base" TEST_PATTERN="" @@ -27,24 +27,24 @@ Trigger the integration test recording workflow remotely. This way you do not ne OPTIONS: -b, --branch BRANCH Branch to run the workflow on (defaults to current branch) - -p, --test-provider PROVIDER Test provider to use: vllm or ollama (default: ollama) - -t, --test-suite SUITE Test suite to use: base, responses, vision, etc. (default: base) - -s, --test-subdirs DIRS Comma-separated list of test subdirectories to run (overrides suite) - -k, --test-pattern PATTERN Regex pattern to pass to pytest -k + -t, --suite SUITE Test suite to use: base, responses, vision, etc. (default: base) + -p, --setup SETUP Test setup to use: vllm, ollama, gpt, etc. 
(default: ollama) + -s, --subdirs DIRS Comma-separated list of test subdirectories to run (overrides suite) + -k, --pattern PATTERN Regex pattern to pass to pytest -k -h, --help Show this help message EXAMPLES: # Record tests for current branch with agents subdirectory - $0 --test-subdirs "agents" + $0 --subdirs "agents" # Record tests for specific branch with vision tests - $0 -b my-feature-branch --test-suite vision + $0 -b my-feature-branch --suite vision - # Record multiple test subdirectories with specific provider - $0 --test-subdirs "agents,inference" --test-provider vllm + # Record multiple test subdirectories with specific setup + $0 --subdirs "agents,inference" --setup vllm # Record tests matching a specific pattern - $0 --test-subdirs "inference" --test-pattern "test_streaming" + $0 --subdirs "inference" --pattern "test_streaming" EOF } @@ -63,19 +63,19 @@ while [[ $# -gt 0 ]]; do BRANCH="$2" shift 2 ;; - -s|--test-subdirs) + -s|--subdirs) TEST_SUBDIRS="$2" shift 2 ;; - -p|--test-provider) - TEST_PROVIDER="$2" + -p|--setup) + TEST_SETUP="$2" shift 2 ;; - -t|--test-suite) + -t|--suite) TEST_SUITE="$2" shift 2 ;; - -k|--test-pattern) + -k|--pattern) TEST_PATTERN="$2" shift 2 ;; @@ -93,21 +93,16 @@ done # Validate required parameters if [[ -z "$TEST_SUBDIRS" && -z "$TEST_SUITE" ]]; then - echo "Error: --test-subdirs or --test-suite is required" + echo "Error: --subdirs or --suite is required" echo "Please specify which test subdirectories to run or test suite to use, e.g.:" - echo " $0 --test-subdirs \"agents,inference\"" - echo " $0 --test-suite vision" + echo " $0 --subdirs \"agents,inference\"" + echo " $0 --suite vision" echo "" exit 1 fi -# Validate test provider -if [[ "$TEST_PROVIDER" != "vllm" && "$TEST_PROVIDER" != "ollama" ]]; then - echo "❌ Error: Invalid test provider '$TEST_PROVIDER'" - echo " Supported providers: vllm, ollama" - echo " Example: $0 --test-subdirs \"agents\" --test-provider vllm" - exit 1 -fi +# Validate test setup (optional - setups are validated by the workflow itself) +# Common setups: ollama, vllm, gpt, etc. # Check if required tools are installed if ! command -v gh &> /dev/null; then @@ -237,7 +232,7 @@ fi # Build the workflow dispatch command echo "Triggering integration test recording workflow..." 
echo "Branch: $BRANCH" -echo "Test provider: $TEST_PROVIDER" +echo "Test setup: $TEST_SETUP" echo "Test subdirs: $TEST_SUBDIRS" echo "Test suite: $TEST_SUITE" echo "Test pattern: ${TEST_PATTERN:-"(none)"}" @@ -245,16 +240,16 @@ echo "" # Prepare inputs for gh workflow run if [[ -n "$TEST_SUBDIRS" ]]; then - INPUTS="-f test-subdirs='$TEST_SUBDIRS'" + INPUTS="-f subdirs='$TEST_SUBDIRS'" fi -if [[ -n "$TEST_PROVIDER" ]]; then - INPUTS="$INPUTS -f test-provider='$TEST_PROVIDER'" +if [[ -n "$TEST_SETUP" ]]; then + INPUTS="$INPUTS -f test-setup='$TEST_SETUP'" fi if [[ -n "$TEST_SUITE" ]]; then - INPUTS="$INPUTS -f test-suite='$TEST_SUITE'" + INPUTS="$INPUTS -f suite='$TEST_SUITE'" fi if [[ -n "$TEST_PATTERN" ]]; then - INPUTS="$INPUTS -f test-pattern='$TEST_PATTERN'" + INPUTS="$INPUTS -f pattern='$TEST_PATTERN'" fi # Run the workflow diff --git a/scripts/integration-tests.sh b/scripts/integration-tests.sh index ab7e37579..eee60951d 100755 --- a/scripts/integration-tests.sh +++ b/scripts/integration-tests.sh @@ -13,10 +13,10 @@ set -euo pipefail # Default values STACK_CONFIG="" -PROVIDER="" +TEST_SUITE="base" +TEST_SETUP="" TEST_SUBDIRS="" TEST_PATTERN="" -TEST_SUITE="base" INFERENCE_MODE="replay" EXTRA_PARAMS="" @@ -27,29 +27,30 @@ Usage: $0 [OPTIONS] Options: --stack-config STRING Stack configuration to use (required) - --provider STRING Provider to use (ollama, vllm, etc.) (required) - --test-suite STRING Comma-separated list of test suites to run (default: 'base') + --suite STRING Test suite to run (default: 'base') + --setup STRING Test setup (models, env) to use (e.g., 'ollama', 'ollama-vision', 'gpt', 'vllm') --inference-mode STRING Inference mode: record or replay (default: replay) - --test-subdirs STRING Comma-separated list of test subdirectories to run (overrides suite) - --test-pattern STRING Regex pattern to pass to pytest -k + --subdirs STRING Comma-separated list of test subdirectories to run (overrides suite) + --pattern STRING Regex pattern to pass to pytest -k --help Show this help message -Suites are defined in tests/integration/suites.py. They are used to narrow the collection of tests and provide default model options. +Suites are defined in tests/integration/suites.py and define which tests to run. +Setups are defined in tests/integration/setups.py and provide global configuration (models, env). You can also specify subdirectories (of tests/integration) to select tests from, which will override the suite. 
Examples: # Basic inference tests with ollama - $0 --stack-config server:ci-tests --provider ollama + $0 --stack-config server:ci-tests --suite base --setup ollama # Multiple test directories with vllm - $0 --stack-config server:ci-tests --provider vllm --test-subdirs 'inference,agents' + $0 --stack-config server:ci-tests --subdirs 'inference,agents' --setup vllm # Vision tests with ollama - $0 --stack-config server:ci-tests --provider ollama --test-suite vision + $0 --stack-config server:ci-tests --suite vision # default setup for this suite is ollama-vision # Record mode for updating test recordings - $0 --stack-config server:ci-tests --provider ollama --inference-mode record + $0 --stack-config server:ci-tests --suite base --inference-mode record EOF } @@ -60,15 +61,15 @@ while [[ $# -gt 0 ]]; do STACK_CONFIG="$2" shift 2 ;; - --provider) - PROVIDER="$2" + --setup) + TEST_SETUP="$2" shift 2 ;; - --test-subdirs) + --subdirs) TEST_SUBDIRS="$2" shift 2 ;; - --test-suite) + --suite) TEST_SUITE="$2" shift 2 ;; @@ -76,7 +77,7 @@ while [[ $# -gt 0 ]]; do INFERENCE_MODE="$2" shift 2 ;; - --test-pattern) + --pattern) TEST_PATTERN="$2" shift 2 ;; @@ -96,11 +97,13 @@ done # Validate required parameters if [[ -z "$STACK_CONFIG" ]]; then echo "Error: --stack-config is required" + usage exit 1 fi -if [[ -z "$PROVIDER" ]]; then - echo "Error: --provider is required" +if [[ -z "$TEST_SETUP" && -n "$TEST_SUBDIRS" ]]; then + echo "Error: --test-setup is required when --test-subdirs is provided" + usage exit 1 fi @@ -111,7 +114,7 @@ fi echo "=== Llama Stack Integration Test Runner ===" echo "Stack Config: $STACK_CONFIG" -echo "Provider: $PROVIDER" +echo "Setup: $TEST_SETUP" echo "Inference Mode: $INFERENCE_MODE" echo "Test Suite: $TEST_SUITE" echo "Test Subdirs: $TEST_SUBDIRS" @@ -129,21 +132,25 @@ echo "" # Set environment variables export LLAMA_STACK_CLIENT_TIMEOUT=300 -export LLAMA_STACK_TEST_INFERENCE_MODE="$INFERENCE_MODE" - -# Configure provider-specific settings -if [[ "$PROVIDER" == "ollama" ]]; then - export OLLAMA_URL="http://0.0.0.0:11434" - export TEXT_MODEL="ollama/llama3.2:3b-instruct-fp16" - export SAFETY_MODEL="ollama/llama-guard3:1b" - EXTRA_PARAMS="--safety-shield=llama-guard" -else - export VLLM_URL="http://localhost:8000/v1" - export TEXT_MODEL="vllm/meta-llama/Llama-3.2-1B-Instruct" - EXTRA_PARAMS="" -fi THIS_DIR=$(dirname "$0") + +if [[ -n "$TEST_SETUP" ]]; then + EXTRA_PARAMS="--setup=$TEST_SETUP" +fi + +# Apply setup-specific environment variables (needed for server startup and tests) +echo "=== Applying Setup Environment Variables ===" + +# the server needs this +export LLAMA_STACK_TEST_INFERENCE_MODE="$INFERENCE_MODE" + +SETUP_ENV=$(PYTHONPATH=$THIS_DIR/.. python "$THIS_DIR/get_setup_env.py" --suite "$TEST_SUITE" --setup "$TEST_SETUP" --format bash) +echo "Setting up environment variables:" +echo "$SETUP_ENV" +eval "$SETUP_ENV" +echo "" + ROOT_DIR="$THIS_DIR/.." cd $ROOT_DIR @@ -162,6 +169,18 @@ fi # Start Llama Stack Server if needed if [[ "$STACK_CONFIG" == *"server:"* ]]; then + stop_server() { + echo "Stopping Llama Stack Server..." + pids=$(lsof -i :8321 | awk 'NR>1 {print $2}') + if [[ -n "$pids" ]]; then + echo "Killing Llama Stack Server processes: $pids" + kill -9 $pids + else + echo "No Llama Stack Server processes found ?!" 
+ fi + echo "Llama Stack Server stopped" + } + # check if server is already running if curl -s http://localhost:8321/v1/health 2>/dev/null | grep -q "OK"; then echo "Llama Stack Server is already running, skipping start" @@ -185,14 +204,16 @@ if [[ "$STACK_CONFIG" == *"server:"* ]]; then done echo "" fi + + trap stop_server EXIT ERR INT TERM fi # Run tests echo "=== Running Integration Tests ===" EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag" -# Additional exclusions for vllm provider -if [[ "$PROVIDER" == "vllm" ]]; then +# Additional exclusions for vllm setup +if [[ "$TEST_SETUP" == "vllm" ]]; then EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls" fi @@ -229,20 +250,22 @@ if [[ -n "$TEST_SUBDIRS" ]]; then echo "Total test files: $(echo $TEST_FILES | wc -w)" PYTEST_TARGET="$TEST_FILES" - EXTRA_PARAMS="$EXTRA_PARAMS --text-model=$TEXT_MODEL --embedding-model=sentence-transformers/all-MiniLM-L6-v2" else PYTEST_TARGET="tests/integration/" EXTRA_PARAMS="$EXTRA_PARAMS --suite=$TEST_SUITE" fi set +e +set -x pytest -s -v $PYTEST_TARGET \ --stack-config="$STACK_CONFIG" \ + --inference-mode="$INFERENCE_MODE" \ -k "$PYTEST_PATTERN" \ $EXTRA_PARAMS \ --color=yes \ --capture=tee-sys exit_code=$? +set +x set -e if [ $exit_code -eq 0 ]; then @@ -260,18 +283,5 @@ echo "=== System Resources After Tests ===" free -h 2>/dev/null || echo "free command not available" df -h -# stop server -if [[ "$STACK_CONFIG" == *"server:"* ]]; then - echo "Stopping Llama Stack Server..." - pids=$(lsof -i :8321 | awk 'NR>1 {print $2}') - if [[ -n "$pids" ]]; then - echo "Killing Llama Stack Server processes: $pids" - kill -9 $pids - else - echo "No Llama Stack Server processes found ?!" - fi - echo "Llama Stack Server stopped" -fi - echo "" echo "=== Integration Tests Complete ===" diff --git a/tests/integration/README.md b/tests/integration/README.md index b05beeb98..467f97e02 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -6,9 +6,7 @@ Integration tests verify complete workflows across different providers using Lla ```bash # Run all integration tests with existing recordings -LLAMA_STACK_TEST_INFERENCE_MODE=replay \ - LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \ - uv run --group test \ +uv run --group test \ pytest -sv tests/integration/ --stack-config=starter ``` @@ -42,25 +40,35 @@ Model parameters can be influenced by the following options: Each of these are comma-separated lists and can be used to generate multiple parameter combinations. Note that tests will be skipped if no model is specified. -### Suites (fast selection + sane defaults) +### Suites and Setups -- `--suite`: comma-separated list of named suites that both narrow which tests are collected and prefill common model options (unless you pass them explicitly). +- `--suite`: single named suite that narrows which tests are collected. - Available suites: - - `responses`: collects tests under `tests/integration/responses`; this is a separate suite because it needs a strong tool-calling model. - - `vision`: collects only `tests/integration/inference/test_vision_inference.py`; defaults `--vision-model=ollama/llama3.2-vision:11b`, `--embedding-model=sentence-transformers/all-MiniLM-L6-v2`. -- Explicit flags always win. For example, `--suite=responses --text-model=` overrides the suite’s text model. 
+ - `base`: collects most tests (excludes responses and post_training) + - `responses`: collects tests under `tests/integration/responses` (needs strong tool-calling models) + - `vision`: collects only `tests/integration/inference/test_vision_inference.py` +- `--setup`: global configuration that can be used with any suite. Setups prefill model/env defaults; explicit CLI flags always win. + - Available setups: + - `ollama`: Local Ollama provider with lightweight models (sets OLLAMA_URL, uses llama3.2:3b-instruct-fp16) + - `vllm`: VLLM provider for efficient local inference (sets VLLM_URL, uses Llama-3.2-1B-Instruct) + - `gpt`: OpenAI GPT models for high-quality responses (uses gpt-4o) + - `claude`: Anthropic Claude models for high-quality responses (uses claude-3-5-sonnet) -Examples: +Examples ```bash -# Fast responses run with defaults -pytest -s -v tests/integration --stack-config=server:starter --suite=responses +# Fast responses run with a strong tool-calling model +pytest -s -v tests/integration --stack-config=server:starter --suite=responses --setup=gpt -# Fast single-file vision run with defaults -pytest -s -v tests/integration --stack-config=server:starter --suite=vision +# Fast single-file vision run with Ollama defaults +pytest -s -v tests/integration --stack-config=server:starter --suite=vision --setup=ollama -# Combine suites and override a default -pytest -s -v tests/integration --stack-config=server:starter --suite=responses,vision --embedding-model=text-embedding-3-small +# Base suite with VLLM for performance +pytest -s -v tests/integration --stack-config=server:starter --suite=base --setup=vllm + +# Override a default from setup +pytest -s -v tests/integration --stack-config=server:starter \ + --suite=responses --setup=gpt --embedding-model=text-embedding-3-small ``` ## Examples @@ -127,14 +135,13 @@ pytest tests/integration/ ### RECORD Mode Captures API interactions for later replay: ```bash -LLAMA_STACK_TEST_INFERENCE_MODE=record \ -pytest tests/integration/inference/test_new_feature.py +pytest tests/integration/inference/test_new_feature.py --inference-mode=record ``` ### LIVE Mode Tests make real API calls (but not recorded): ```bash -LLAMA_STACK_TEST_INFERENCE_MODE=live pytest tests/integration/ +pytest tests/integration/ --inference-mode=live ``` By default, the recording directory is `tests/integration/recordings`. You can override this by setting the `LLAMA_STACK_TEST_RECORDING_DIR` environment variable. @@ -155,15 +162,14 @@ cat recordings/responses/abc123.json | jq '.' #### Remote Re-recording (Recommended) Use the automated workflow script for easier re-recording: ```bash -./scripts/github/schedule-record-workflow.sh --test-subdirs "inference,agents" +./scripts/github/schedule-record-workflow.sh --subdirs "inference,agents" ``` See the [main testing guide](../README.md#remote-re-recording-recommended) for full details. #### Local Re-recording ```bash # Re-record specific tests -LLAMA_STACK_TEST_INFERENCE_MODE=record \ -pytest -s -v --stack-config=server:starter tests/integration/inference/test_modified.py +pytest -s -v --stack-config=server:starter tests/integration/inference/test_modified.py --inference-mode=record ``` Note that when re-recording tests, you must use a Stack pointing to a server (i.e., `server:starter`). This subtlety exists because the set of tests run in server are a superset of the set of tests run in the library client. 
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 96260fdb7..4735264c3 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -15,7 +15,7 @@ from dotenv import load_dotenv from llama_stack.log import get_logger -from .suites import SUITE_DEFINITIONS +from .suites import SETUP_DEFINITIONS, SUITE_DEFINITIONS logger = get_logger(__name__, category="tests") @@ -63,19 +63,33 @@ def pytest_configure(config): key, value = env_var.split("=", 1) os.environ[key] = value - suites_raw = config.getoption("--suite") - suites: list[str] = [] - if suites_raw: - suites = [p.strip() for p in str(suites_raw).split(",") if p.strip()] - unknown = [p for p in suites if p not in SUITE_DEFINITIONS] - if unknown: + inference_mode = config.getoption("--inference-mode") + os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = inference_mode + + suite = config.getoption("--suite") + if suite: + if suite not in SUITE_DEFINITIONS: + raise pytest.UsageError(f"Unknown suite: {suite}. Available: {', '.join(sorted(SUITE_DEFINITIONS.keys()))}") + + # Apply setups (global parameterizations): env + defaults + setup = config.getoption("--setup") + if suite and not setup: + setup = SUITE_DEFINITIONS[suite].default_setup + + if setup: + if setup not in SETUP_DEFINITIONS: raise pytest.UsageError( - f"Unknown suite(s): {', '.join(unknown)}. Available: {', '.join(sorted(SUITE_DEFINITIONS.keys()))}" + f"Unknown setup '{setup}'. Available: {', '.join(sorted(SETUP_DEFINITIONS.keys()))}" ) - for suite in suites: - suite_def = SUITE_DEFINITIONS.get(suite, {}) - defaults: dict = suite_def.get("defaults", {}) - for dest, value in defaults.items(): + + setup_obj = SETUP_DEFINITIONS[setup] + logger.info(f"Applying setup '{setup}'{' for suite ' + suite if suite else ''}") + # Apply env first + for k, v in setup_obj.env.items(): + if k not in os.environ: + os.environ[k] = str(v) + # Apply defaults if not provided explicitly + for dest, value in setup_obj.defaults.items(): current = getattr(config.option, dest, None) if not current: setattr(config.option, dest, value) @@ -120,6 +134,13 @@ def pytest_addoption(parser): default=384, help="Output dimensionality of the embedding model to use for testing. Default: 384", ) + + parser.addoption( + "--inference-mode", + help="Inference mode: { record, replay, live } (default: replay)", + choices=["record", "replay", "live"], + default="replay", + ) parser.addoption( "--report", help="Path where the test report should be written, e.g. --report=/path/to/report.md", @@ -127,14 +148,18 @@ def pytest_addoption(parser): available_suites = ", ".join(sorted(SUITE_DEFINITIONS.keys())) suite_help = ( - "Comma-separated integration test suites to narrow collection and prefill defaults. " - "Available: " - f"{available_suites}. " - "Explicit CLI flags (e.g., --text-model) override suite defaults. " - "Examples: --suite=responses or --suite=responses,vision." + f"Single test suite to run (narrows collection). Available: {available_suites}. Example: --suite=responses" ) parser.addoption("--suite", help=suite_help) + # Global setups for any suite + available_setups = ", ".join(sorted(SETUP_DEFINITIONS.keys())) + setup_help = ( + f"Global test setup configuration. Available: {available_setups}. " + "Can be used with any suite. 
Example: --setup=ollama" + ) + parser.addoption("--setup", help=setup_help) + MODEL_SHORT_IDS = { "meta-llama/Llama-3.2-3B-Instruct": "3B", @@ -221,16 +246,12 @@ pytest_plugins = ["tests.integration.fixtures.common"] def pytest_ignore_collect(path: str, config: pytest.Config) -> bool: """Skip collecting paths outside the selected suite roots for speed.""" - suites_raw = config.getoption("--suite") - if not suites_raw: + suite = config.getoption("--suite") + if not suite: return False - names = [p.strip() for p in str(suites_raw).split(",") if p.strip()] - roots: list[str] = [] - for name in names: - suite_def = SUITE_DEFINITIONS.get(name) - if suite_def: - roots.extend(suite_def.get("roots", [])) + sobj = SUITE_DEFINITIONS.get(suite) + roots: list[str] = sobj.get("roots", []) if isinstance(sobj, dict) else getattr(sobj, "roots", []) if not roots: return False diff --git a/tests/integration/suites.py b/tests/integration/suites.py index 602855055..bacd7ef52 100644 --- a/tests/integration/suites.py +++ b/tests/integration/suites.py @@ -8,46 +8,112 @@ # For example: # # ```bash -# pytest tests/integration/ --suite=vision +# pytest tests/integration/ --suite=vision --setup=ollama # ``` # -# Each suite can: -# - restrict collection to specific roots (dirs or files) -# - provide default CLI option values (e.g. text_model, embedding_model, etc.) +""" +Each suite defines what to run (roots). Suites can be run with different global setups defined in setups.py. +Setups provide environment variables and model defaults that can be reused across multiple suites. + +CLI examples: + pytest tests/integration --suite=responses --setup=gpt + pytest tests/integration --suite=vision --setup=ollama + pytest tests/integration --suite=base --setup=vllm +""" from pathlib import Path +from pydantic import BaseModel, Field + this_dir = Path(__file__).parent -default_roots = [ + + +class Suite(BaseModel): + name: str + roots: list[str] + default_setup: str | None = None + + +class Setup(BaseModel): + """A reusable test configuration with environment and CLI defaults.""" + + name: str + description: str + defaults: dict[str, str] = Field(default_factory=dict) + env: dict[str, str] = Field(default_factory=dict) + + +# Global setups - can be used with any suite "technically" but in reality, some setups might work +# only for specific test suites. 
+SETUP_DEFINITIONS: dict[str, Setup] = { + "ollama": Setup( + name="ollama", + description="Local Ollama provider with text + safety models", + env={ + "OLLAMA_URL": "http://0.0.0.0:11434", + "SAFETY_MODEL": "ollama/llama-guard3:1b", + }, + defaults={ + "text_model": "ollama/llama3.2:3b-instruct-fp16", + "embedding_model": "sentence-transformers/all-MiniLM-L6-v2", + "safety_model": "ollama/llama-guard3:1b", + "safety_shield": "llama-guard", + }, + ), + "ollama-vision": Setup( + name="ollama", + description="Local Ollama provider with a vision model", + env={ + "OLLAMA_URL": "http://0.0.0.0:11434", + }, + defaults={ + "vision_model": "ollama/llama3.2-vision:11b", + "embedding_model": "sentence-transformers/all-MiniLM-L6-v2", + }, + ), + "vllm": Setup( + name="vllm", + description="vLLM provider with a text model", + env={ + "VLLM_URL": "http://localhost:8000/v1", + }, + defaults={ + "text_model": "vllm/meta-llama/Llama-3.2-1B-Instruct", + "embedding_model": "sentence-transformers/all-MiniLM-L6-v2", + }, + ), + "gpt": Setup( + name="gpt", + description="OpenAI GPT models for high-quality responses and tool calling", + defaults={ + "text_model": "openai/gpt-4o", + "embedding_model": "sentence-transformers/all-MiniLM-L6-v2", + }, + ), +} + + +base_roots = [ str(p) for p in this_dir.glob("*") if p.is_dir() and p.name not in ("__pycache__", "fixtures", "test_cases", "recordings", "responses", "post_training") ] -SUITE_DEFINITIONS: dict[str, dict] = { - "base": { - "description": "Base suite that includes most tests but runs them with a text Ollama model", - "roots": default_roots, - "defaults": { - "text_model": "ollama/llama3.2:3b-instruct-fp16", - "embedding_model": "sentence-transformers/all-MiniLM-L6-v2", - }, - }, - "responses": { - "description": "Suite that includes only the OpenAI Responses tests; needs a strong tool-calling model", - "roots": ["tests/integration/responses"], - "defaults": { - "text_model": "openai/gpt-4o", - "embedding_model": "sentence-transformers/all-MiniLM-L6-v2", - }, - }, - "vision": { - "description": "Suite that includes only the vision tests", - "roots": ["tests/integration/inference/test_vision_inference.py"], - "defaults": { - "vision_model": "ollama/llama3.2-vision:11b", - "embedding_model": "sentence-transformers/all-MiniLM-L6-v2", - }, - }, +SUITE_DEFINITIONS: dict[str, Suite] = { + "base": Suite( + name="base", + roots=base_roots, + default_setup="ollama", + ), + "responses": Suite( + name="responses", + roots=["tests/integration/responses"], + default_setup="gpt", + ), + "vision": Suite( + name="vision", + roots=["tests/integration/inference/test_vision_inference.py"], + default_setup="ollama-vision", + ), } From 9d3a234bf3772083e148d8168a204b9cb2c200ac Mon Sep 17 00:00:00 2001 From: ehhuang Date: Tue, 9 Sep 2025 15:51:20 -0700 Subject: [PATCH 02/15] chore: remove unused variable (#3389) # What does this PR do? 
## Test Plan --- llama_stack/core/library_client.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/llama_stack/core/library_client.py b/llama_stack/core/library_client.py index 9e7a8006c..ea5a2ac8e 100644 --- a/llama_stack/core/library_client.py +++ b/llama_stack/core/library_client.py @@ -10,7 +10,6 @@ import json import logging # allow-direct-logging import os import sys -from concurrent.futures import ThreadPoolExecutor from enum import Enum from io import BytesIO from pathlib import Path @@ -148,7 +147,6 @@ class LlamaStackAsLibraryClient(LlamaStackClient): self.async_client = AsyncLlamaStackAsLibraryClient( config_path_or_distro_name, custom_provider_registry, provider_data, skip_logger_removal ) - self.pool_executor = ThreadPoolExecutor(max_workers=4) self.provider_data = provider_data self.loop = asyncio.new_event_loop() From dd1f946b3ee4232dc8e13d3836e7f19e65f5e112 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Tue, 9 Sep 2025 18:54:58 -0400 Subject: [PATCH 03/15] feat: include a default inference store during llama stack build (#3373) # What does this PR do? enables completions storage when using `llama stack build --providers` - - GET /v1/chat/completions - GET /v1/chat/completions/{id} todo: llama stack build and distro codegen should use the same code paths ## Test Plan ci --- llama_stack/cli/stack/_build.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llama_stack/cli/stack/_build.py b/llama_stack/cli/stack/_build.py index c6e204773..b14e6fe55 100644 --- a/llama_stack/cli/stack/_build.py +++ b/llama_stack/cli/stack/_build.py @@ -45,6 +45,7 @@ from llama_stack.core.utils.dynamic import instantiate_class_type from llama_stack.core.utils.exec import formulate_run_args, run_command from llama_stack.core.utils.image_types import LlamaStackImageType from llama_stack.providers.datatypes import Api +from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig DISTRIBS_PATH = Path(__file__).parent.parent.parent / "distributions" @@ -294,6 +295,12 @@ def _generate_run_config( if build_config.external_providers_dir else EXTERNAL_PROVIDERS_DIR, ) + if not run_config.inference_store: + run_config.inference_store = SqliteSqlStoreConfig( + **SqliteSqlStoreConfig.sample_run_config( + __distro_dir__=(DISTRIBS_BASE_DIR / image_name).as_posix(), db_name="inference_store.db" + ) + ) # build providers dict provider_registry = get_provider_registry(build_config) for api in apis: From 81ad240faa48d2a2d91e5fbfc3dda21443432a6f Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 9 Sep 2025 23:00:50 -0700 Subject: [PATCH 04/15] fix(k8s): unwedge run.yaml to add files --- .../k8s-benchmark/stack-configmap.yaml | 19 +- .../k8s-benchmark/stack_run_config.yaml | 9 + .../distributions/k8s/stack-configmap.yaml | 182 +++++------------- .../distributions/k8s/stack_run_config.yaml | 9 + 4 files changed, 77 insertions(+), 142 deletions(-) diff --git a/docs/source/distributions/k8s-benchmark/stack-configmap.yaml b/docs/source/distributions/k8s-benchmark/stack-configmap.yaml index edf4ebd75..bf6109b68 100644 --- a/docs/source/distributions/k8s-benchmark/stack-configmap.yaml +++ b/docs/source/distributions/k8s-benchmark/stack-configmap.yaml @@ -6,6 +6,7 @@ data: apis: - agents - inference + - files - safety - telemetry - tool_runtime @@ -19,13 +20,6 @@ data: max_tokens: ${env.VLLM_MAX_TOKENS:=4096} api_token: ${env.VLLM_API_TOKEN:=fake} tls_verify: ${env.VLLM_TLS_VERIFY:=true} - - provider_id: vllm-safety - provider_type: remote::vllm - config: - url: 
${env.VLLM_SAFETY_URL:=http://localhost:8000/v1} - max_tokens: ${env.VLLM_MAX_TOKENS:=4096} - api_token: ${env.VLLM_API_TOKEN:=fake} - tls_verify: ${env.VLLM_TLS_VERIFY:=true} - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} @@ -41,6 +35,14 @@ data: db: ${env.POSTGRES_DB:=llamastack} user: ${env.POSTGRES_USER:=llamastack} password: ${env.POSTGRES_PASSWORD:=llamastack} + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} + metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db safety: - provider_id: llama-guard provider_type: inline::llama-guard @@ -111,9 +113,6 @@ data: - model_id: ${env.INFERENCE_MODEL} provider_id: vllm-inference model_type: llm - - model_id: ${env.SAFETY_MODEL} - provider_id: vllm-safety - model_type: llm shields: - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B} vector_dbs: [] diff --git a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml index 5a810639e..f8ff7811b 100644 --- a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml +++ b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml @@ -3,6 +3,7 @@ image_name: kubernetes-benchmark-demo apis: - agents - inference +- files - safety - telemetry - tool_runtime @@ -31,6 +32,14 @@ providers: db: ${env.POSTGRES_DB:=llamastack} user: ${env.POSTGRES_USER:=llamastack} password: ${env.POSTGRES_PASSWORD:=llamastack} + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} + metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db safety: - provider_id: llama-guard provider_type: inline::llama-guard diff --git a/docs/source/distributions/k8s/stack-configmap.yaml b/docs/source/distributions/k8s/stack-configmap.yaml index 4f95554e3..3dbb0da97 100644 --- a/docs/source/distributions/k8s/stack-configmap.yaml +++ b/docs/source/distributions/k8s/stack-configmap.yaml @@ -1,137 +1,55 @@ apiVersion: v1 data: - stack_run_config.yaml: | - version: '2' - image_name: kubernetes-demo - apis: - - agents - - inference - - safety - - telemetry - - tool_runtime - - vector_io - providers: - inference: - - provider_id: vllm-inference - provider_type: remote::vllm - config: - url: ${env.VLLM_URL:=http://localhost:8000/v1} - max_tokens: ${env.VLLM_MAX_TOKENS:=4096} - api_token: ${env.VLLM_API_TOKEN:=fake} - tls_verify: ${env.VLLM_TLS_VERIFY:=true} - - provider_id: vllm-safety - provider_type: remote::vllm - config: - url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1} - max_tokens: ${env.VLLM_MAX_TOKENS:=4096} - api_token: ${env.VLLM_API_TOKEN:=fake} - tls_verify: ${env.VLLM_TLS_VERIFY:=true} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: ${env.ENABLE_CHROMADB:+chromadb} - provider_type: remote::chromadb - config: - url: ${env.CHROMADB_URL:=} - kvstore: - type: postgres - host: ${env.POSTGRES_HOST:=localhost} - port: ${env.POSTGRES_PORT:=5432} - db: ${env.POSTGRES_DB:=llamastack} - user: ${env.POSTGRES_USER:=llamastack} - password: ${env.POSTGRES_PASSWORD:=llamastack} - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - 
agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: postgres - host: ${env.POSTGRES_HOST:=localhost} - port: ${env.POSTGRES_PORT:=5432} - db: ${env.POSTGRES_DB:=llamastack} - user: ${env.POSTGRES_USER:=llamastack} - password: ${env.POSTGRES_PASSWORD:=llamastack} - responses_store: - type: postgres - host: ${env.POSTGRES_HOST:=localhost} - port: ${env.POSTGRES_PORT:=5432} - db: ${env.POSTGRES_DB:=llamastack} - user: ${env.POSTGRES_USER:=llamastack} - password: ${env.POSTGRES_PASSWORD:=llamastack} - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} - metadata_store: - type: postgres - host: ${env.POSTGRES_HOST:=localhost} - port: ${env.POSTGRES_PORT:=5432} - db: ${env.POSTGRES_DB:=llamastack} - user: ${env.POSTGRES_USER:=llamastack} - password: ${env.POSTGRES_PASSWORD:=llamastack} - table_name: llamastack_kvstore - inference_store: - type: postgres - host: ${env.POSTGRES_HOST:=localhost} - port: ${env.POSTGRES_PORT:=5432} - db: ${env.POSTGRES_DB:=llamastack} - user: ${env.POSTGRES_USER:=llamastack} - password: ${env.POSTGRES_PASSWORD:=llamastack} - models: - - metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding - - metadata: {} - model_id: ${env.INFERENCE_MODEL} - provider_id: vllm-inference - model_type: llm - - metadata: {} - model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B} - provider_id: vllm-safety - model_type: llm - shields: - - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B} - vector_dbs: [] - datasets: [] - scoring_fns: [] - benchmarks: [] - tool_groups: - - toolgroup_id: builtin::websearch - provider_id: tavily-search - - toolgroup_id: builtin::rag - provider_id: rag-runtime - server: - port: 8321 - auth: - provider_config: - type: github_token + stack_run_config.yaml: "version: '2'\nimage_name: kubernetes-demo\napis:\n- agents\n- + inference\n- files\n- safety\n- telemetry\n- tool_runtime\n- vector_io\nproviders:\n + \ inference:\n - provider_id: vllm-inference\n provider_type: remote::vllm\n + \ config:\n url: ${env.VLLM_URL:=http://localhost:8000/v1}\n max_tokens: + ${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n tls_verify: + ${env.VLLM_TLS_VERIFY:=true}\n - provider_id: vllm-safety\n provider_type: + remote::vllm\n config:\n url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}\n + \ max_tokens: ${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n + \ tls_verify: ${env.VLLM_TLS_VERIFY:=true}\n - provider_id: sentence-transformers\n + \ provider_type: inline::sentence-transformers\n config: {}\n vector_io:\n + \ - provider_id: ${env.ENABLE_CHROMADB:+chromadb}\n provider_type: remote::chromadb\n + \ config:\n url: ${env.CHROMADB_URL:=}\n kvstore:\n type: postgres\n + \ host: ${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n + \ db: ${env.POSTGRES_DB:=llamastack}\n 
user: ${env.POSTGRES_USER:=llamastack}\n + \ password: ${env.POSTGRES_PASSWORD:=llamastack}\n files:\n - provider_id: + meta-reference-files\n provider_type: inline::localfs\n config:\n storage_dir: + ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}\n metadata_store:\n + \ type: sqlite\n db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db + \ \n safety:\n - provider_id: llama-guard\n provider_type: inline::llama-guard\n + \ config:\n excluded_categories: []\n agents:\n - provider_id: meta-reference\n + \ provider_type: inline::meta-reference\n config:\n persistence_store:\n + \ type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n port: + ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user: + ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n + \ responses_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n + \ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n + \ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n + \ telemetry:\n - provider_id: meta-reference\n provider_type: inline::meta-reference\n + \ config:\n service_name: \"${env.OTEL_SERVICE_NAME:=\\u200B}\"\n sinks: + ${env.TELEMETRY_SINKS:=console}\n tool_runtime:\n - provider_id: brave-search\n + \ provider_type: remote::brave-search\n config:\n api_key: ${env.BRAVE_SEARCH_API_KEY:+}\n + \ max_results: 3\n - provider_id: tavily-search\n provider_type: remote::tavily-search\n + \ config:\n api_key: ${env.TAVILY_SEARCH_API_KEY:+}\n max_results: + 3\n - provider_id: rag-runtime\n provider_type: inline::rag-runtime\n config: + {}\n - provider_id: model-context-protocol\n provider_type: remote::model-context-protocol\n + \ config: {}\nmetadata_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n + \ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user: + ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n + \ table_name: llamastack_kvstore\ninference_store:\n type: postgres\n host: + ${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n + \ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\nmodels:\n- + metadata:\n embedding_dimension: 384\n model_id: all-MiniLM-L6-v2\n provider_id: + sentence-transformers\n model_type: embedding\n- metadata: {}\n model_id: ${env.INFERENCE_MODEL}\n + \ provider_id: vllm-inference\n model_type: llm\n- metadata: {}\n model_id: + ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\n provider_id: vllm-safety\n + \ model_type: llm\nshields:\n- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\nvector_dbs: + []\ndatasets: []\nscoring_fns: []\nbenchmarks: []\ntool_groups:\n- toolgroup_id: + builtin::websearch\n provider_id: tavily-search\n- toolgroup_id: builtin::rag\n + \ provider_id: rag-runtime\nserver:\n port: 8321\n auth:\n provider_config:\n + \ type: github_token\n" kind: ConfigMap metadata: creationTimestamp: null diff --git a/docs/source/distributions/k8s/stack_run_config.yaml b/docs/source/distributions/k8s/stack_run_config.yaml index a2d65e1a9..b841ab977 100644 --- a/docs/source/distributions/k8s/stack_run_config.yaml +++ b/docs/source/distributions/k8s/stack_run_config.yaml @@ -3,6 +3,7 @@ image_name: kubernetes-demo apis: - agents - inference +- files - safety - telemetry - tool_runtime @@ -38,6 +39,14 @@ providers: db: ${env.POSTGRES_DB:=llamastack} user: 
${env.POSTGRES_USER:=llamastack} password: ${env.POSTGRES_PASSWORD:=llamastack} + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} + metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db safety: - provider_id: llama-guard provider_type: inline::llama-guard From 1c23aeb9372fa3e1286a7d6d8210994000efae6d Mon Sep 17 00:00:00 2001 From: Cesare Pompeiano Date: Wed, 10 Sep 2025 11:19:21 +0200 Subject: [PATCH 05/15] feat: Add vector_db_id to chunk metadata (#3304) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? When running RAG in a multi vector DB setting, it can be difficult to trace where retrieved chunks originate from. This PR adds the `vector_db_id` into each chunk’s metadata, making it easier to understand which database a given chunk came from. This is helpful for debugging and for analyzing retrieval behavior of multiple DBs. Relevant code: ```python for vector_db_id, result in zip(vector_db_ids, results): for chunk, score in zip(result.chunks, result.scores): if not hasattr(chunk, "metadata") or chunk.metadata is None: chunk.metadata = {} chunk.metadata["vector_db_id"] = vector_db_id chunks.append(chunk) scores.append(score) ``` ## Test Plan * Ran Llama Stack in debug mode. * Verified that `vector_db_id` was added to each chunk’s metadata. * Confirmed that the metadata was printed in the console when using the RAG tool. --------- Co-authored-by: are-ces Co-authored-by: Francisco Arceo --- .../inline/tool_runtime/rag/memory.py | 16 +++++- tests/unit/rag/test_rag_query.py | 55 +++++++++++++++++++ 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/llama_stack/providers/inline/tool_runtime/rag/memory.py b/llama_stack/providers/inline/tool_runtime/rag/memory.py index cb526e8ee..aa629cca8 100644 --- a/llama_stack/providers/inline/tool_runtime/rag/memory.py +++ b/llama_stack/providers/inline/tool_runtime/rag/memory.py @@ -167,8 +167,18 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti for vector_db_id in vector_db_ids ] results: list[QueryChunksResponse] = await asyncio.gather(*tasks) - chunks = [c for r in results for c in r.chunks] - scores = [s for r in results for s in r.scores] + + chunks = [] + scores = [] + + for vector_db_id, result in zip(vector_db_ids, results, strict=False): + for chunk, score in zip(result.chunks, result.scores, strict=False): + if not hasattr(chunk, "metadata") or chunk.metadata is None: + chunk.metadata = {} + chunk.metadata["vector_db_id"] = vector_db_id + + chunks.append(chunk) + scores.append(score) if not chunks: return RAGQueryResult(content=None) @@ -203,6 +213,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti metadata_keys_to_exclude_from_context = [ "token_count", "metadata_token_count", + "vector_db_id", ] metadata_for_context = {} for k in chunk_metadata_keys_to_include_from_context: @@ -227,6 +238,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti "document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]], "chunks": [c.content for c in chunks[: len(picked)]], "scores": scores[: len(picked)], + "vector_db_ids": [c.metadata["vector_db_id"] for c in chunks[: len(picked)]], }, ) diff --git a/tests/unit/rag/test_rag_query.py b/tests/unit/rag/test_rag_query.py index d18d90716..7b897bfe0 
100644 --- a/tests/unit/rag/test_rag_query.py +++ b/tests/unit/rag/test_rag_query.py @@ -81,3 +81,58 @@ class TestRagQuery: # Test that invalid mode raises an error with pytest.raises(ValueError): RAGQueryConfig(mode="wrong_mode") + + @pytest.mark.asyncio + async def test_query_adds_vector_db_id_to_chunk_metadata(self): + rag_tool = MemoryToolRuntimeImpl( + config=MagicMock(), + vector_io_api=MagicMock(), + inference_api=MagicMock(), + ) + + vector_db_ids = ["db1", "db2"] + + # Fake chunks from each DB + chunk_metadata1 = ChunkMetadata( + document_id="doc1", + chunk_id="chunk1", + source="test_source1", + metadata_token_count=5, + ) + chunk1 = Chunk( + content="chunk from db1", + metadata={"vector_db_id": "db1", "document_id": "doc1"}, + stored_chunk_id="c1", + chunk_metadata=chunk_metadata1, + ) + + chunk_metadata2 = ChunkMetadata( + document_id="doc2", + chunk_id="chunk2", + source="test_source2", + metadata_token_count=5, + ) + chunk2 = Chunk( + content="chunk from db2", + metadata={"vector_db_id": "db2", "document_id": "doc2"}, + stored_chunk_id="c2", + chunk_metadata=chunk_metadata2, + ) + + rag_tool.vector_io_api.query_chunks = AsyncMock( + side_effect=[ + QueryChunksResponse(chunks=[chunk1], scores=[0.9]), + QueryChunksResponse(chunks=[chunk2], scores=[0.8]), + ] + ) + + result = await rag_tool.query(content="test", vector_db_ids=vector_db_ids) + returned_chunks = result.metadata["chunks"] + returned_scores = result.metadata["scores"] + returned_doc_ids = result.metadata["document_ids"] + returned_vector_db_ids = result.metadata["vector_db_ids"] + + assert returned_chunks == ["chunk from db1", "chunk from db2"] + assert returned_scores == (0.9, 0.8) + assert returned_doc_ids == ["doc1", "doc2"] + assert returned_vector_db_ids == ["db1", "db2"] From 167143131053c8de6ea620a83ebdec41c0b24e50 Mon Sep 17 00:00:00 2001 From: Akram Ben Aissi Date: Wed, 10 Sep 2025 12:55:57 +0200 Subject: [PATCH 06/15] fix: Add missing files_api parameter to MemoryToolRuntimeImpl test (#3394) # What does this PR do? The test_query_adds_vector_db_id_to_chunk_metadata test was failing because MemoryToolRuntimeImpl.__init__() now requires a files_api parameter. Fixes failing unit tests for Python 3.12 and 3.13. ## Test Plan --- tests/unit/rag/test_rag_query.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/rag/test_rag_query.py b/tests/unit/rag/test_rag_query.py index 7b897bfe0..183b4d049 100644 --- a/tests/unit/rag/test_rag_query.py +++ b/tests/unit/rag/test_rag_query.py @@ -88,6 +88,7 @@ class TestRagQuery: config=MagicMock(), vector_io_api=MagicMock(), inference_api=MagicMock(), + files_api=MagicMock(), ) vector_db_ids = ["db1", "db2"] From c836fa29e3b9a587734764cc025551bce68fc349 Mon Sep 17 00:00:00 2001 From: Akram Ben Aissi Date: Wed, 10 Sep 2025 15:27:35 +0200 Subject: [PATCH 07/15] fix: pre-commit issues: non executable shebang file and removal of @pytest.mark.asyncio decorator (#3397) # What does this PR do? 
Fix pre-commit issues: non executable shebang file, @pytest.mark.asyncio decorator ## Test Plan --- scripts/get_setup_env.py | 0 tests/unit/rag/test_rag_query.py | 1 - 2 files changed, 1 deletion(-) mode change 100644 => 100755 scripts/get_setup_env.py diff --git a/scripts/get_setup_env.py b/scripts/get_setup_env.py old mode 100644 new mode 100755 diff --git a/tests/unit/rag/test_rag_query.py b/tests/unit/rag/test_rag_query.py index 183b4d049..a45b66f02 100644 --- a/tests/unit/rag/test_rag_query.py +++ b/tests/unit/rag/test_rag_query.py @@ -82,7 +82,6 @@ class TestRagQuery: with pytest.raises(ValueError): RAGQueryConfig(mode="wrong_mode") - @pytest.mark.asyncio async def test_query_adds_vector_db_id_to_chunk_metadata(self): rag_tool = MemoryToolRuntimeImpl( config=MagicMock(), From 0e27016cf23eca51a0f025897b44109b1b609b71 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Wed, 10 Sep 2025 09:39:29 -0400 Subject: [PATCH 08/15] chore: update the vertexai inference impl to use openai-python for openai-compat functions (#3377) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? update VertexAI inference provider to use openai-python for openai-compat functions ## Test Plan ``` $ VERTEX_AI_PROJECT=... uv run llama stack build --image-type venv --providers inference=remote::vertexai --run ... $ LLAMA_STACK_CONFIG=http://localhost:8321 uv run --group test pytest -v -ra --text-model vertexai/vertex_ai/gemini-2.5-flash tests/integration/inference/test_openai_completion.py ... ``` i don't have an account to test this. `get_api_key` may also need to be updated per https://cloud.google.com/vertex-ai/generative-ai/docs/start/openai --------- Signed-off-by: Sébastien Han Co-authored-by: Sébastien Han --- llama_stack/providers/registry/inference.py | 2 +- .../remote/inference/vertexai/vertexai.py | 33 ++++++++++++++++--- .../inference/test_openai_completion.py | 3 ++ 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index 4176f85a6..541fbb432 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -218,7 +218,7 @@ def available_providers() -> list[ProviderSpec]: api=Api.inference, adapter=AdapterSpec( adapter_type="vertexai", - pip_packages=["litellm", "google-cloud-aiplatform"], + pip_packages=["litellm", "google-cloud-aiplatform", "openai"], module="llama_stack.providers.remote.inference.vertexai", config_class="llama_stack.providers.remote.inference.vertexai.VertexAIConfig", provider_data_validator="llama_stack.providers.remote.inference.vertexai.config.VertexAIProviderDataValidator", diff --git a/llama_stack/providers/remote/inference/vertexai/vertexai.py b/llama_stack/providers/remote/inference/vertexai/vertexai.py index 8807fd0e6..27f953ab9 100644 --- a/llama_stack/providers/remote/inference/vertexai/vertexai.py +++ b/llama_stack/providers/remote/inference/vertexai/vertexai.py @@ -6,16 +6,20 @@ from typing import Any +import google.auth.transport.requests +from google.auth import default + from llama_stack.apis.inference import ChatCompletionRequest from llama_stack.providers.utils.inference.litellm_openai_mixin import ( LiteLLMOpenAIMixin, ) +from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from .config import VertexAIConfig from .models import MODEL_ENTRIES -class VertexAIInferenceAdapter(LiteLLMOpenAIMixin): +class 
VertexAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin): def __init__(self, config: VertexAIConfig) -> None: LiteLLMOpenAIMixin.__init__( self, @@ -27,9 +31,30 @@ class VertexAIInferenceAdapter(LiteLLMOpenAIMixin): self.config = config def get_api_key(self) -> str: - # Vertex AI doesn't use API keys, it uses Application Default Credentials - # Return empty string to let litellm handle authentication via ADC - return "" + """ + Get an access token for Vertex AI using Application Default Credentials. + + Vertex AI uses ADC instead of API keys. This method obtains an access token + from the default credentials and returns it for use with the OpenAI-compatible client. + """ + try: + # Get default credentials - will read from GOOGLE_APPLICATION_CREDENTIALS + credentials, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"]) + credentials.refresh(google.auth.transport.requests.Request()) + return credentials.token + except Exception: + # If we can't get credentials, return empty string to let LiteLLM handle it + # This allows the LiteLLM mixin to work with ADC directly + return "" + + def get_base_url(self) -> str: + """ + Get the Vertex AI OpenAI-compatible API base URL. + + Returns the Vertex AI OpenAI-compatible endpoint URL. + Source: https://cloud.google.com/vertex-ai/generative-ai/docs/start/openai + """ + return f"https://{self.config.location}-aiplatform.googleapis.com/v1/projects/{self.config.project}/locations/{self.config.location}/endpoints/openapi" async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]: # Get base parameters from parent diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py index df1184f1c..f9c837ebd 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -76,6 +76,9 @@ def skip_if_doesnt_support_n(client_with_models, model_id): "remote::gemini", # https://docs.anthropic.com/en/api/openai-sdk#simple-fields "remote::anthropic", + "remote::vertexai", + # Error code: 400 - [{'error': {'code': 400, 'message': 'Unable to submit request because candidateCount must be 1 but + # the entered value was 2. Update the candidateCount value and try again.', 'status': 'INVALID_ARGUMENT'} ): pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support n param.") From c86e45496e32164286eb8920d2e979a45be31ea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Wed, 10 Sep 2025 16:00:46 +0200 Subject: [PATCH 09/15] ci: Re-enable pre-commit to fail (#3399) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If pre-commit fails, the workflow must fail. 
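For reference, after this change the pre-commit step in `.github/workflows/pre-commit.yml` runs without `continue-on-error`, so any failing hook fails the job (only the relevant keys are shown here; the actual change is in the diff below):

```yaml
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
  env:
    SKIP: no-commit-to-branch
    RUFF_OUTPUT_FORMAT: github
```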
--------- Signed-off-by: Sébastien Han --- .github/workflows/pre-commit.yml | 1 - llama_stack/providers/remote/inference/vertexai/vertexai.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 792162262..000208043 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -48,7 +48,6 @@ jobs: working-directory: llama_stack/ui - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 - continue-on-error: true env: SKIP: no-commit-to-branch RUFF_OUTPUT_FORMAT: github diff --git a/llama_stack/providers/remote/inference/vertexai/vertexai.py b/llama_stack/providers/remote/inference/vertexai/vertexai.py index 27f953ab9..8996543e7 100644 --- a/llama_stack/providers/remote/inference/vertexai/vertexai.py +++ b/llama_stack/providers/remote/inference/vertexai/vertexai.py @@ -41,7 +41,7 @@ class VertexAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin): # Get default credentials - will read from GOOGLE_APPLICATION_CREDENTIALS credentials, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"]) credentials.refresh(google.auth.transport.requests.Request()) - return credentials.token + return str(credentials.token) except Exception: # If we can't get credentials, return empty string to let LiteLLM handle it # This allows the LiteLLM mixin to work with ADC directly From 935b8e28de29400a4b42d8b54169341c5244fec7 Mon Sep 17 00:00:00 2001 From: slekkala1 Date: Wed, 10 Sep 2025 08:48:01 -0700 Subject: [PATCH 10/15] fix: Fireworks chat completion broken due to telemetry (#3392) # What does this PR do? Fix fireworks chat completion broken due to telemetry expecting response.usage Closes https://github.com/llamastack/llama-stack/issues/3391 ## Test Plan 1. `uv run --with llama-stack llama stack build --distro starter --image-type venv --run` Try ``` curl -X POST http://0.0.0.0:8321/v1/openai/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct", "messages": [{"role": "user", "content": "Hello!"}] }' ``` ``` {"id":"chatcmpl-ee922a08-0df0-4974-b0d3-b322113e8bc0","choices":[{"message":{"role":"assistant","content":"Hello! 
How can I assist you today?","name":null,"tool_calls":null},"finish_reason":"stop","index":0,"logprobs":null}],"object":"chat.completion","created":1757456375,"model":"fireworks/accounts/fireworks/models/llama-v3p1-8b-instruct"}% ``` Without fix fails as mentioned in https://github.com/llamastack/llama-stack/issues/3391 Co-authored-by: Francisco Arceo --- llama_stack/core/routers/inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 045093fe0..23972deb5 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -423,7 +423,7 @@ class InferenceRouter(Inference): # response_stream = await provider.openai_completion(**params) response = await provider.openai_completion(**params) - if self.telemetry: + if self.telemetry and getattr(response, "usage", None): metrics = self._construct_metrics( prompt_tokens=response.usage.prompt_tokens, completion_tokens=response.usage.completion_tokens, @@ -529,7 +529,7 @@ class InferenceRouter(Inference): if self.store: asyncio.create_task(self.store.store_chat_completion(response, messages)) - if self.telemetry: + if self.telemetry and getattr(response, "usage", None): metrics = self._construct_metrics( prompt_tokens=response.usage.prompt_tokens, completion_tokens=response.usage.completion_tokens, From f6bf36343df7c69c9f26ae5163cbfb6491ca7247 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 10 Sep 2025 11:52:23 -0700 Subject: [PATCH 11/15] chore: logging perf improvments (#3393) # What does this PR do? - Use BackgroundLogger when logging metric events. - Reuse event loop in BackgroundLogger ## Test Plan ``` cd /docs/source/distributions/k8s-benchmark # start mock server python openai-mock-server.py --port 8000 # start stack server LLAMA_STACK_LOGGING="all=WARNING" uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml # run benchmark script uv run python3 benchmark.py --duration 120 --concurrent 50 --base-url=http://localhost:8321/v1/openai/v1 --model=vllm-inference/meta-llama/Llama-3.2-3B-Instruct ``` ### RPS from 57 -> 62 --- llama_stack/core/routers/inference.py | 14 ++++---- .../providers/utils/telemetry/tracing.py | 34 +++++++++++++------ 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 23972deb5..9593dd5b9 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -63,7 +63,7 @@ from llama_stack.models.llama.llama3.chat_format import ChatFormat from llama_stack.models.llama.llama3.tokenizer import Tokenizer from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable from llama_stack.providers.utils.inference.inference_store import InferenceStore -from llama_stack.providers.utils.telemetry.tracing import get_current_span +from llama_stack.providers.utils.telemetry.tracing import enqueue_event, get_current_span logger = get_logger(name=__name__, category="core::routers") @@ -160,7 +160,7 @@ class InferenceRouter(Inference): metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model) if self.telemetry: for metric in metrics: - await self.telemetry.log_event(metric) + enqueue_event(metric) return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics] async def _count_tokens( @@ -431,7 +431,7 @@ class 
InferenceRouter(Inference): model=model_obj, ) for metric in metrics: - await self.telemetry.log_event(metric) + enqueue_event(metric) # these metrics will show up in the client response. response.metrics = ( @@ -537,7 +537,7 @@ class InferenceRouter(Inference): model=model_obj, ) for metric in metrics: - await self.telemetry.log_event(metric) + enqueue_event(metric) # these metrics will show up in the client response. response.metrics = ( metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics @@ -664,7 +664,7 @@ class InferenceRouter(Inference): "completion_tokens", "total_tokens", ]: # Only log completion and total tokens - await self.telemetry.log_event(metric) + enqueue_event(metric) # Return metrics in response async_metrics = [ @@ -710,7 +710,7 @@ class InferenceRouter(Inference): ) for metric in completion_metrics: if metric.metric in ["completion_tokens", "total_tokens"]: # Only log completion and total tokens - await self.telemetry.log_event(metric) + enqueue_event(metric) # Return metrics in response return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics] @@ -806,7 +806,7 @@ class InferenceRouter(Inference): model=model, ) for metric in metrics: - await self.telemetry.log_event(metric) + enqueue_event(metric) yield chunk finally: diff --git a/llama_stack/providers/utils/telemetry/tracing.py b/llama_stack/providers/utils/telemetry/tracing.py index 7694003b5..9969b1055 100644 --- a/llama_stack/providers/utils/telemetry/tracing.py +++ b/llama_stack/providers/utils/telemetry/tracing.py @@ -18,6 +18,7 @@ from functools import wraps from typing import Any from llama_stack.apis.telemetry import ( + Event, LogSeverity, Span, SpanEndPayload, @@ -98,7 +99,7 @@ class BackgroundLogger: def __init__(self, api: Telemetry, capacity: int = 100000): self.api = api self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity) - self.worker_thread = threading.Thread(target=self._process_logs, daemon=True) + self.worker_thread = threading.Thread(target=self._worker, daemon=True) self.worker_thread.start() self._last_queue_full_log_time: float = 0.0 self._dropped_since_last_notice: int = 0 @@ -118,12 +119,16 @@ class BackgroundLogger: self._last_queue_full_log_time = current_time self._dropped_since_last_notice = 0 - def _process_logs(self): + def _worker(self): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + loop.run_until_complete(self._process_logs()) + + async def _process_logs(self): while True: try: event = self.log_queue.get() - # figure out how to use a thread's native loop - asyncio.run(self.api.log_event(event)) + await self.api.log_event(event) except Exception: import traceback @@ -136,6 +141,19 @@ class BackgroundLogger: self.log_queue.join() +def enqueue_event(event: Event) -> None: + """Enqueue a telemetry event to the background logger if available. + + This provides a non-blocking path for routers and other hot paths to + submit telemetry without awaiting the Telemetry API, reducing contention + with the main event loop. 
+ """ + global BACKGROUND_LOGGER + if BACKGROUND_LOGGER is None: + raise RuntimeError("Telemetry API not initialized") + BACKGROUND_LOGGER.log_event(event) + + class TraceContext: spans: list[Span] = [] @@ -256,11 +274,7 @@ class TelemetryHandler(logging.Handler): if record.module in ("asyncio", "selector_events"): return - global CURRENT_TRACE_CONTEXT, BACKGROUND_LOGGER - - if BACKGROUND_LOGGER is None: - raise RuntimeError("Telemetry API not initialized") - + global CURRENT_TRACE_CONTEXT context = CURRENT_TRACE_CONTEXT.get() if context is None: return @@ -269,7 +283,7 @@ class TelemetryHandler(logging.Handler): if span is None: return - BACKGROUND_LOGGER.log_event( + enqueue_event( UnstructuredLogEvent( trace_id=span.trace_id, span_id=span.span_id, From a6b1588dc612df097d4fecce317547515b281ec6 Mon Sep 17 00:00:00 2001 From: Francisco Arceo Date: Wed, 10 Sep 2025 12:53:38 -0600 Subject: [PATCH 12/15] revert: Fireworks chat completion broken due to telemetry (#3402) Reverts llamastack/llama-stack#3392 --- llama_stack/core/routers/inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 9593dd5b9..2ed2d0439 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -423,7 +423,7 @@ class InferenceRouter(Inference): # response_stream = await provider.openai_completion(**params) response = await provider.openai_completion(**params) - if self.telemetry and getattr(response, "usage", None): + if self.telemetry: metrics = self._construct_metrics( prompt_tokens=response.usage.prompt_tokens, completion_tokens=response.usage.completion_tokens, @@ -529,7 +529,7 @@ class InferenceRouter(Inference): if self.store: asyncio.create_task(self.store.store_chat_completion(response, messages)) - if self.telemetry and getattr(response, "usage", None): + if self.telemetry: metrics = self._construct_metrics( prompt_tokens=response.usage.prompt_tokens, completion_tokens=response.usage.completion_tokens, From e6edc1f93425032f35f4198a197ba31b5b11d8ee Mon Sep 17 00:00:00 2001 From: Derek Higgins Date: Wed, 10 Sep 2025 19:54:10 +0100 Subject: [PATCH 13/15] fix: unbound variable error in schedule-record-workflow.sh (#3401) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Initialize INPUTS variable to prevent 'unbound variable' error Fixes: ./scripts/github/schedule-record-workflow.sh: line 246: INPUTS: unbound variable │ --- scripts/github/schedule-record-workflow.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/github/schedule-record-workflow.sh b/scripts/github/schedule-record-workflow.sh index c292e53e6..44b0947b6 100755 --- a/scripts/github/schedule-record-workflow.sh +++ b/scripts/github/schedule-record-workflow.sh @@ -239,8 +239,9 @@ echo "Test pattern: ${TEST_PATTERN:-"(none)"}" echo "" # Prepare inputs for gh workflow run +INPUTS= if [[ -n "$TEST_SUBDIRS" ]]; then - INPUTS="-f subdirs='$TEST_SUBDIRS'" + INPUTS="$INPUTS -f subdirs='$TEST_SUBDIRS'" fi if [[ -n "$TEST_SETUP" ]]; then INPUTS="$INPUTS -f test-setup='$TEST_SETUP'" From e980436a2ed98dd725f76dfcec12235ed1d6cc82 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Wed, 10 Sep 2025 11:57:42 -0700 Subject: [PATCH 14/15] chore: introduce write queue for inference_store (#3383) # What does this PR do? Adds a write worker queue for writes to inference store. This avoids overwhelming request processing with slow inference writes. 
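In outline: the router hands each completed chat completion to the store, which pushes it onto a bounded asyncio queue, and a small pool of background writer tasks drains the queue and performs the actual SQL writes off the hot path. A minimal, illustrative sketch of that pattern (simplified, hypothetical names such as `QueuedWriter`; the real implementation is in the `inference_store.py` diff below):

```python
import asyncio
from typing import Any


class QueuedWriter:
    """Bounded write queue drained by background workers so the request
    path never blocks on slow persistence calls."""

    def __init__(self, max_queue_size: int = 10000, num_writers: int = 4) -> None:
        self._queue: asyncio.Queue[Any] = asyncio.Queue(maxsize=max_queue_size)
        self._num_writers = num_writers
        self._workers: list[asyncio.Task[None]] = []

    async def start(self) -> None:
        # Spawn the background writers; request handlers only ever enqueue.
        self._workers = [asyncio.create_task(self._worker()) for _ in range(self._num_writers)]

    async def submit(self, item: Any) -> None:
        # Non-blocking in the common case; fall back to an awaited put when full.
        try:
            self._queue.put_nowait(item)
        except asyncio.QueueFull:
            await self._queue.put(item)

    async def _worker(self) -> None:
        while True:
            item = await self._queue.get()
            try:
                await self._write(item)
            finally:
                self._queue.task_done()

    async def _write(self, item: Any) -> None:
        await asyncio.sleep(0.01)  # stand-in for the actual SQL write

    async def drain(self) -> None:
        # Wait for all queued writes to finish (useful in tests and at shutdown).
        await self._queue.join()
```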
## Test Plan Benchmark: ``` cd /docs/source/distributions/k8s-benchmark # start mock server python openai-mock-server.py --port 8000 # start stack server LLAMA_STACK_LOGGING="all=WARNING" uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml # run benchmark script uv run python3 benchmark.py --duration 120 --concurrent 50 --base-url=http://localhost:8321/v1/openai/v1 --model=vllm-inference/meta-llama/Llama-3.2-3B-Instruct ``` ## RPS from 21 -> 57 --- .../distributions/k8s-benchmark/benchmark.py | 19 ++-- .../k8s-benchmark/stack_run_config.yaml | 9 ++ llama_stack/core/datatypes.py | 13 ++- llama_stack/core/routers/__init__.py | 5 +- llama_stack/core/routers/inference.py | 5 + .../utils/inference/inference_store.py | 98 +++++++++++++++++-- .../utils/inference/test_inference_store.py | 12 +++ 7 files changed, 139 insertions(+), 22 deletions(-) diff --git a/docs/source/distributions/k8s-benchmark/benchmark.py b/docs/source/distributions/k8s-benchmark/benchmark.py index 3d0d18150..83ba9602a 100644 --- a/docs/source/distributions/k8s-benchmark/benchmark.py +++ b/docs/source/distributions/k8s-benchmark/benchmark.py @@ -58,14 +58,6 @@ class BenchmarkStats: print(f"\n{'='*60}") print(f"BENCHMARK RESULTS") - print(f"{'='*60}") - print(f"Total time: {total_time:.2f}s") - print(f"Concurrent users: {self.concurrent_users}") - print(f"Total requests: {self.total_requests}") - print(f"Successful requests: {self.success_count}") - print(f"Failed requests: {len(self.errors)}") - print(f"Success rate: {success_rate:.1f}%") - print(f"Requests per second: {self.success_count / total_time:.2f}") print(f"\nResponse Time Statistics:") print(f" Mean: {statistics.mean(self.response_times):.3f}s") @@ -106,6 +98,15 @@ class BenchmarkStats: print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}") print(f" Total chunks received: {sum(self.chunks_received)}") + print(f"{'='*60}") + print(f"Total time: {total_time:.2f}s") + print(f"Concurrent users: {self.concurrent_users}") + print(f"Total requests: {self.total_requests}") + print(f"Successful requests: {self.success_count}") + print(f"Failed requests: {len(self.errors)}") + print(f"Success rate: {success_rate:.1f}%") + print(f"Requests per second: {self.success_count / total_time:.2f}") + if self.errors: print(f"\nErrors (showing first 5):") for error in self.errors[:5]: @@ -215,7 +216,7 @@ class LlamaStackBenchmark: await asyncio.sleep(1) # Report every second if time.time() >= last_report_time + 10: # Report every 10 seconds elapsed = time.time() - stats.start_time - print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s") + print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}") last_report_time = time.time() except asyncio.CancelledError: break diff --git a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml index f8ff7811b..5a9e2ae4f 100644 --- a/docs/source/distributions/k8s-benchmark/stack_run_config.yaml +++ b/docs/source/distributions/k8s-benchmark/stack_run_config.yaml @@ -2,6 +2,7 @@ version: '2' image_name: kubernetes-benchmark-demo apis: - agents +- files - inference - files - safety @@ -20,6 +21,14 @@ providers: - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: 
${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} + metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db vector_io: - provider_id: ${env.ENABLE_CHROMADB:+chromadb} provider_type: remote::chromadb diff --git a/llama_stack/core/datatypes.py b/llama_stack/core/datatypes.py index 0f348b067..faaeefd01 100644 --- a/llama_stack/core/datatypes.py +++ b/llama_stack/core/datatypes.py @@ -431,6 +431,12 @@ class ServerConfig(BaseModel): ) +class InferenceStoreConfig(BaseModel): + sql_store_config: SqlStoreConfig + max_write_queue_size: int = Field(default=10000, description="Max queued writes for inference store") + num_writers: int = Field(default=4, description="Number of concurrent background writers") + + class StackRunConfig(BaseModel): version: int = LLAMA_STACK_RUN_CONFIG_VERSION @@ -464,11 +470,12 @@ Configuration for the persistence store used by the distribution registry. If no a default SQLite store will be used.""", ) - inference_store: SqlStoreConfig | None = Field( + inference_store: InferenceStoreConfig | SqlStoreConfig | None = Field( default=None, description=""" -Configuration for the persistence store used by the inference API. If not specified, -a default SQLite store will be used.""", +Configuration for the persistence store used by the inference API. Can be either a +InferenceStoreConfig (with queue tuning parameters) or a SqlStoreConfig (deprecated). +If not specified, a default SQLite store will be used.""", ) # registry of "resources" in the distribution diff --git a/llama_stack/core/routers/__init__.py b/llama_stack/core/routers/__init__.py index 1faace34a..f129f8ede 100644 --- a/llama_stack/core/routers/__init__.py +++ b/llama_stack/core/routers/__init__.py @@ -78,7 +78,10 @@ async def get_auto_router_impl( # TODO: move pass configs to routers instead if api == Api.inference and run_config.inference_store: - inference_store = InferenceStore(run_config.inference_store, policy) + inference_store = InferenceStore( + config=run_config.inference_store, + policy=policy, + ) await inference_store.initialize() api_to_dep_impl["store"] = inference_store diff --git a/llama_stack/core/routers/inference.py b/llama_stack/core/routers/inference.py index 2ed2d0439..762d7073e 100644 --- a/llama_stack/core/routers/inference.py +++ b/llama_stack/core/routers/inference.py @@ -90,6 +90,11 @@ class InferenceRouter(Inference): async def shutdown(self) -> None: logger.debug("InferenceRouter.shutdown") + if self.store: + try: + await self.store.shutdown() + except Exception as e: + logger.warning(f"Error during InferenceStore shutdown: {e}") async def register_model( self, diff --git a/llama_stack/providers/utils/inference/inference_store.py b/llama_stack/providers/utils/inference/inference_store.py index 43006cfd5..8c69b1683 100644 --- a/llama_stack/providers/utils/inference/inference_store.py +++ b/llama_stack/providers/utils/inference/inference_store.py @@ -3,6 +3,9 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+import asyncio +from typing import Any + from llama_stack.apis.inference import ( ListOpenAIChatCompletionResponse, OpenAIChatCompletion, @@ -10,24 +13,43 @@ from llama_stack.apis.inference import ( OpenAIMessageParam, Order, ) -from llama_stack.core.datatypes import AccessRule -from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR +from llama_stack.core.datatypes import AccessRule, InferenceStoreConfig +from llama_stack.log import get_logger from ..sqlstore.api import ColumnDefinition, ColumnType from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore -from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl +from ..sqlstore.sqlstore import SqlStoreConfig, SqlStoreType, sqlstore_impl + +logger = get_logger(name=__name__, category="inference_store") class InferenceStore: - def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]): - if not sql_store_config: - sql_store_config = SqliteSqlStoreConfig( - db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(), + def __init__( + self, + config: InferenceStoreConfig | SqlStoreConfig, + policy: list[AccessRule], + ): + # Handle backward compatibility + if not isinstance(config, InferenceStoreConfig): + # Legacy: SqlStoreConfig passed directly as config + config = InferenceStoreConfig( + sql_store_config=config, ) - self.sql_store_config = sql_store_config + + self.config = config + self.sql_store_config = config.sql_store_config self.sql_store = None self.policy = policy + # Disable write queue for SQLite to avoid concurrency issues + self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite + + # Async write queue and worker control + self._queue: asyncio.Queue[tuple[OpenAIChatCompletion, list[OpenAIMessageParam]]] | None = None + self._worker_tasks: list[asyncio.Task[Any]] = [] + self._max_write_queue_size: int = config.max_write_queue_size + self._num_writers: int = max(1, config.num_writers) + async def initialize(self): """Create the necessary tables if they don't exist.""" self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config)) @@ -42,10 +64,68 @@ class InferenceStore: }, ) + if self.enable_write_queue: + self._queue = asyncio.Queue(maxsize=self._max_write_queue_size) + for _ in range(self._num_writers): + self._worker_tasks.append(asyncio.create_task(self._worker_loop())) + else: + logger.info("Write queue disabled for SQLite to avoid concurrency issues") + + async def shutdown(self) -> None: + if not self._worker_tasks: + return + if self._queue is not None: + await self._queue.join() + for t in self._worker_tasks: + if not t.done(): + t.cancel() + for t in self._worker_tasks: + try: + await t + except asyncio.CancelledError: + pass + self._worker_tasks.clear() + + async def flush(self) -> None: + """Wait for all queued writes to complete. 
Useful for testing.""" + if self.enable_write_queue and self._queue is not None: + await self._queue.join() + async def store_chat_completion( self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam] ) -> None: - if not self.sql_store: + if self.enable_write_queue: + if self._queue is None: + raise ValueError("Inference store is not initialized") + try: + self._queue.put_nowait((chat_completion, input_messages)) + except asyncio.QueueFull: + logger.warning( + f"Write queue full; adding chat completion id={getattr(chat_completion, 'id', '')}" + ) + await self._queue.put((chat_completion, input_messages)) + else: + await self._write_chat_completion(chat_completion, input_messages) + + async def _worker_loop(self) -> None: + assert self._queue is not None + while True: + try: + item = await self._queue.get() + except asyncio.CancelledError: + break + chat_completion, input_messages = item + try: + await self._write_chat_completion(chat_completion, input_messages) + except Exception as e: # noqa: BLE001 + logger.error(f"Error writing chat completion: {e}") + finally: + self._queue.task_done() + + async def _write_chat_completion( + self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam] + ) -> None: + if self.sql_store is None: raise ValueError("Inference store is not initialized") data = chat_completion.model_dump() diff --git a/tests/unit/utils/inference/test_inference_store.py b/tests/unit/utils/inference/test_inference_store.py index 730f54a05..f6d63490a 100644 --- a/tests/unit/utils/inference/test_inference_store.py +++ b/tests/unit/utils/inference/test_inference_store.py @@ -65,6 +65,9 @@ async def test_inference_store_pagination_basic(): input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] await store.store_chat_completion(completion, input_messages) + # Wait for all queued writes to complete + await store.flush() + # Test 1: First page with limit=2, descending order (default) result = await store.list_chat_completions(limit=2, order=Order.desc) assert len(result.data) == 2 @@ -108,6 +111,9 @@ async def test_inference_store_pagination_ascending(): input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] await store.store_chat_completion(completion, input_messages) + # Wait for all queued writes to complete + await store.flush() + # Test ascending order pagination result = await store.list_chat_completions(limit=1, order=Order.asc) assert len(result.data) == 1 @@ -143,6 +149,9 @@ async def test_inference_store_pagination_with_model_filter(): input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] await store.store_chat_completion(completion, input_messages) + # Wait for all queued writes to complete + await store.flush() + # Test pagination with model filter result = await store.list_chat_completions(limit=1, model="model-a", order=Order.desc) assert len(result.data) == 1 @@ -190,6 +199,9 @@ async def test_inference_store_pagination_no_limit(): input_messages = [OpenAIUserMessageParam(role="user", content=f"Test message for {completion_id}")] await store.store_chat_completion(completion, input_messages) + # Wait for all queued writes to complete + await store.flush() + # Test without limit result = await store.list_chat_completions(order=Order.desc) assert len(result.data) == 2 From 7394828c7a84de2c3af0ca37546db17d6a703507 Mon Sep 17 00:00:00 2001 From: Alexey Rybak 
<50731695+reluctantfuturist@users.noreply.github.com> Date: Wed, 10 Sep 2025 12:43:36 -0700 Subject: [PATCH 15/15] docs: horizontal nav bar (#3407) # What does this PR do? * Adds a horizontal nav bar for easy access to the API reference and the Llama Stack Github repo image ## Test Plan * Built the docs and ran the local HTML server to verify changes --- docs/_static/css/my_theme.css | 101 ++++++++++++++++++++++++++++++ docs/_static/js/horizontal_nav.js | 44 +++++++++++++ docs/source/conf.py | 1 + 3 files changed, 146 insertions(+) create mode 100644 docs/_static/js/horizontal_nav.js diff --git a/docs/_static/css/my_theme.css b/docs/_static/css/my_theme.css index d078ec057..7dcd97c9b 100644 --- a/docs/_static/css/my_theme.css +++ b/docs/_static/css/my_theme.css @@ -1,5 +1,106 @@ @import url("theme.css"); +/* Horizontal Navigation Bar */ +.horizontal-nav { + background-color: #ffffff; + border-bottom: 1px solid #e5e5e5; + padding: 0; + position: fixed; + top: 0; + left: 0; + right: 0; + z-index: 1050; + height: 50px; + box-shadow: 0 2px 4px rgba(0,0,0,0.1); +} + +[data-theme="dark"] .horizontal-nav { + background-color: #1a1a1a; + border-bottom: 1px solid #333; +} + +.horizontal-nav .nav-container { + max-width: 1200px; + margin: 0 auto; + display: flex; + align-items: center; + justify-content: space-between; + padding: 0 20px; + height: 100%; +} + +.horizontal-nav .nav-brand { + font-size: 18px; + font-weight: 600; + color: #333; + text-decoration: none; +} + +[data-theme="dark"] .horizontal-nav .nav-brand { + color: #fff; +} + +.horizontal-nav .nav-links { + display: flex; + align-items: center; + gap: 30px; + list-style: none; + margin: 0; + padding: 0; +} + +.horizontal-nav .nav-links a { + color: #666; + text-decoration: none; + font-size: 14px; + font-weight: 500; + padding: 8px 12px; + border-radius: 6px; + transition: all 0.2s ease; +} + +.horizontal-nav .nav-links a:hover, +.horizontal-nav .nav-links a.active { + color: #333; + background-color: #f5f5f5; +} + +.horizontal-nav .nav-links a.active { + font-weight: 600; +} + +[data-theme="dark"] .horizontal-nav .nav-links a { + color: #ccc; +} + +[data-theme="dark"] .horizontal-nav .nav-links a:hover, +[data-theme="dark"] .horizontal-nav .nav-links a.active { + color: #fff; + background-color: #333; +} + +.horizontal-nav .nav-links .github-link { + display: flex; + align-items: center; + gap: 6px; +} + +.horizontal-nav .nav-links .github-icon { + width: 16px; + height: 16px; + fill: currentColor; +} + +/* Adjust main content to account for fixed nav */ +.wy-nav-side { + top: 50px; + height: calc(100vh - 50px); +} + +.wy-nav-content-wrap { + margin-top: 50px; +} + .wy-nav-content { max-width: 90%; } diff --git a/docs/_static/js/horizontal_nav.js b/docs/_static/js/horizontal_nav.js new file mode 100644 index 000000000..c2384f9d5 --- /dev/null +++ b/docs/_static/js/horizontal_nav.js @@ -0,0 +1,44 @@ +// Horizontal Navigation Bar for Llama Stack Documentation +document.addEventListener('DOMContentLoaded', function() { + // Create the horizontal navigation HTML + const navHTML = ` + + `; + + // Insert the navigation at the beginning of the body + document.body.insertAdjacentHTML('afterbegin', navHTML); + + // Update navigation links based on current page + updateActiveNav(); +}); + +function updateActiveNav() { + const currentPath = window.location.pathname; + const navLinks = document.querySelectorAll('.horizontal-nav .nav-links a'); + + navLinks.forEach(link => { + // Remove any existing active classes + link.classList.remove('active'); 
+ + // Add active class based on current path + if (currentPath === '/' && link.getAttribute('href') === '/') { + link.classList.add('active'); + } else if (currentPath.includes('/references/api_reference/') && link.getAttribute('href').includes('api_reference')) { + link.classList.add('active'); + } + }); +} diff --git a/docs/source/conf.py b/docs/source/conf.py index 3f84d1310..0cbddef31 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -131,6 +131,7 @@ html_static_path = ["../_static"] def setup(app): app.add_css_file("css/my_theme.css") app.add_js_file("js/detect_theme.js") + app.add_js_file("js/horizontal_nav.js") app.add_js_file("js/keyboard_shortcuts.js") def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):