Merge branch 'main' into add-mcp-authentication-param

Commit 1a7ba683e3 by Omar Abdelwahab, 2025-11-07 14:26:06 -08:00 (committed by GitHub).
GPG key ID: B5690EEEBB952194 (no known key found for this signature in database).
1075 changed files with 125472 additions and 3083 deletions


@@ -72,7 +72,8 @@ runs:
echo "New recordings detected, committing and pushing"
git add tests/integration/
-git commit -m "Recordings update from CI (suite: ${{ inputs.suite }})"
+git commit -m "Recordings update from CI (setup: ${{ inputs.setup }}, suite: ${{ inputs.suite }})"
git fetch origin ${{ github.ref_name }}
git rebase origin/${{ github.ref_name }}
echo "Rebased successfully"
@@ -88,6 +89,8 @@ runs:
run: |
# Ollama logs (if ollama container exists)
sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log 2>&1 || true
+# vllm logs (if vllm container exists)
+sudo docker logs vllm > vllm-${{ inputs.inference-mode }}.log 2>&1 || true
# Note: distro container logs are now dumped in integration-tests.sh before container is removed
- name: Upload logs


@@ -11,13 +11,14 @@ runs:
--name vllm \
-p 8000:8000 \
--privileged=true \
-quay.io/higginsd/vllm-cpu:65393ee064 \
+quay.io/higginsd/vllm-cpu:65393ee064-qwen3 \
--host 0.0.0.0 \
--port 8000 \
--enable-auto-tool-choice \
---tool-call-parser llama3_json \
---model /root/.cache/Llama-3.2-1B-Instruct \
---served-model-name meta-llama/Llama-3.2-1B-Instruct
+--tool-call-parser hermes \
+--model /root/.cache/Qwen3-0.6B \
+--served-model-name Qwen/Qwen3-0.6B \
+--max-model-len 8192
# Wait for vllm to be ready
echo "Waiting for vllm to be ready..."


@@ -18,6 +18,7 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
| Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
| Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |
| Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec |
+| Stainless SDK Builds | [stainless-builds.yml](stainless-builds.yml) | Build Stainless SDK from OpenAPI spec changes |
| Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action |
| Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module |
| Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms |


@@ -27,7 +27,6 @@ on:
schedule:
# If changing the cron schedule, update the provider in the test-matrix job
- cron: '0 0 * * *' # (test latest client) Daily at 12 AM UTC
-- cron: '1 0 * * 0' # (test vllm) Weekly on Sunday at 1 AM UTC
workflow_dispatch:
inputs:
test-all-client-versions:

.github/workflows/stainless-builds.yml (new file, 110 lines)

@@ -0,0 +1,110 @@
name: Stainless SDK Builds
run-name: Build Stainless SDK from OpenAPI spec changes
# This workflow uses pull_request_target, which allows it to run on pull requests
# from forks with access to secrets. This is safe because the workflow definition
# comes from the base branch (trusted), and the action only reads OpenAPI spec
# files without executing any code from the PR.
on:
pull_request_target:
types:
- opened
- synchronize
- reopened
- closed
paths:
- "client-sdks/stainless/**"
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
cancel-in-progress: true
env:
# Stainless organization name.
STAINLESS_ORG: llamastack
# Stainless project name.
STAINLESS_PROJECT: llama-stack-client
# Path to your OpenAPI spec.
OAS_PATH: ./client-sdks/stainless/openapi.yml
# Path to your Stainless config. Optional; only provide this if you prefer
# to maintain the ground truth Stainless config in your own repo.
CONFIG_PATH: ./client-sdks/stainless/config.yml
# When to fail the job based on build conclusion.
# Options: "never" | "note" | "warning" | "error" | "fatal".
FAIL_ON: error
# In your repo secrets, configure:
# - STAINLESS_API_KEY: a Stainless API key, which you can generate on the
# Stainless organization dashboard
jobs:
preview:
if: github.event.action != 'closed'
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write
steps:
# Checkout the PR's code to access the OpenAPI spec and config files.
# This is necessary to read the spec/config from the PR (including from forks).
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: ${{ github.event.pull_request.head.repo.full_name }}
ref: ${{ github.event.pull_request.head.sha }}
fetch-depth: 2
# This action builds preview SDKs from the OpenAPI spec changes and
# posts/updates a comment on the PR with build results and links to the preview.
- name: Run preview builds
uses: stainless-api/upload-openapi-spec-action/preview@32823b096b4319c53ee948d702d9052873af485f # 1.6.0
with:
stainless_api_key: ${{ secrets.STAINLESS_API_KEY }}
org: ${{ env.STAINLESS_ORG }}
project: ${{ env.STAINLESS_PROJECT }}
oas_path: ${{ env.OAS_PATH }}
config_path: ${{ env.CONFIG_PATH }}
fail_on: ${{ env.FAIL_ON }}
base_sha: ${{ github.event.pull_request.base.sha }}
base_ref: ${{ github.event.pull_request.base.ref }}
head_sha: ${{ github.event.pull_request.head.sha }}
merge:
if: github.event.action == 'closed' && github.event.pull_request.merged == true
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write
steps:
# Checkout the PR's code to access the OpenAPI spec and config files.
# This is necessary to read the spec/config from the PR (including from forks).
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: ${{ github.event.pull_request.head.repo.full_name }}
ref: ${{ github.event.pull_request.head.sha }}
fetch-depth: 2
# Note that this only merges in changes that happened on the last build on
# preview/${{ github.head_ref }}. It's possible that there are OAS/config
# changes that haven't been built, if the preview-sdk job didn't finish
# before this step starts. In theory we want to wait for all builds
# against preview/${{ github.head_ref }} to complete, but assuming that
# the preview-sdk job happens before the PR merge, it should be fine.
- name: Run merge build
uses: stainless-api/upload-openapi-spec-action/merge@32823b096b4319c53ee948d702d9052873af485f # 1.6.0
with:
stainless_api_key: ${{ secrets.STAINLESS_API_KEY }}
org: ${{ env.STAINLESS_ORG }}
project: ${{ env.STAINLESS_PROJECT }}
oas_path: ${{ env.OAS_PATH }}
config_path: ${{ env.CONFIG_PATH }}
fail_on: ${{ env.FAIL_ON }}
base_sha: ${{ github.event.pull_request.base.sha }}
base_ref: ${{ github.event.pull_request.base.ref }}
head_sha: ${{ github.event.pull_request.head.sha }}


@@ -1,8 +1,8 @@
These are the source-of-truth configuration files used to generate the Stainless client SDKs via Stainless.

- `openapi.yml`: this is the OpenAPI specification for the Llama Stack API.
-- `openapi.stainless.yml`: this is the Stainless _configuration_ which instructs Stainless how to generate the client SDKs.
+- `config.yml`: this is the Stainless _configuration_ which instructs Stainless how to generate the client SDKs.

A small side note: notice the `.yml` suffixes since Stainless uses that suffix typically for its configuration files.

These files go hand-in-hand. As of now, only the `openapi.yml` file is automatically generated using the `run_openapi_generator.sh` script.


@@ -340,21 +340,18 @@ resources:
endpoint: get /v1/inspect/routes
paginated: false
moderations:
models:
create_response: ModerationObject
methods:
create: post /v1/moderations
safety:
models:
run_shield_response: RunShieldResponse
methods:
run_shield: post /v1/safety/run-shield
shields:
models:
shield: Shield
@@ -463,10 +460,9 @@ resources:
iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
appendrows: post /v1beta/datasetio/append-rows/{dataset_id}
settings:
license: MIT
-unwrap_response_fields: [ data ]
+unwrap_response_fields: [data]
openapi:
transformations:
@@ -474,7 +470,7 @@
reason: Better return_type using enum
args:
target:
-- '$.components.schemas'
+- "$.components.schemas"
object:
ReturnType:
additionalProperties: false
@@ -499,10 +495,10 @@ openapi:
args:
filter:
only:
-- '$.components.schemas.ScoringFn.properties.return_type'
-- '$.components.schemas.RegisterScoringFunctionRequest.properties.return_type'
+- "$.components.schemas.ScoringFn.properties.return_type"
+- "$.components.schemas.RegisterScoringFunctionRequest.properties.return_type"
value:
-$ref: '#/components/schemas/ReturnType'
+$ref: "#/components/schemas/ReturnType"
- command: oneOfToAnyOf
reason: Prism (mock server) doesn't like one of our requests as it technically matches multiple variants


@@ -6791,6 +6791,8 @@ components:
const: web_search_preview
- type: string
const: web_search_preview_2025_03_11
+- type: string
+const: web_search_2025_08_26
default: web_search
description: Web search tool type variant to use
search_context_size:
@@ -9982,6 +9984,70 @@ components:
- metadata
title: VectorStoreObject
description: OpenAI Vector Store object.
VectorStoreChunkingStrategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
- $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
discriminator:
propertyName: type
mapping:
auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
VectorStoreChunkingStrategyAuto:
type: object
properties:
type:
type: string
const: auto
default: auto
description: >-
Strategy type, always "auto" for automatic chunking
additionalProperties: false
required:
- type
title: VectorStoreChunkingStrategyAuto
description: >-
Automatic chunking strategy for vector store files.
VectorStoreChunkingStrategyStatic:
type: object
properties:
type:
type: string
const: static
default: static
description: >-
Strategy type, always "static" for static chunking
static:
$ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
description: >-
Configuration parameters for the static chunking strategy
additionalProperties: false
required:
- type
- static
title: VectorStoreChunkingStrategyStatic
description: >-
Static chunking strategy with configurable parameters.
VectorStoreChunkingStrategyStaticConfig:
type: object
properties:
chunk_overlap_tokens:
type: integer
default: 400
description: >-
Number of tokens to overlap between adjacent chunks
max_chunk_size_tokens:
type: integer
default: 800
description: >-
Maximum number of tokens per chunk, must be between 100 and 4096
additionalProperties: false
required:
- chunk_overlap_tokens
- max_chunk_size_tokens
title: VectorStoreChunkingStrategyStaticConfig
description: >-
Configuration for static chunking strategy.
"OpenAICreateVectorStoreRequestWithExtraBody": "OpenAICreateVectorStoreRequestWithExtraBody":
type: object type: object
properties: properties:
@@ -10007,15 +10073,7 @@ components:
description: >-
(Optional) Expiration policy for the vector store
chunking_strategy:
-type: object
-additionalProperties:
-oneOf:
-- type: 'null'
-- type: boolean
-- type: number
-- type: string
-- type: array
-- type: object
+$ref: '#/components/schemas/VectorStoreChunkingStrategy'
description: >-
(Optional) Strategy for splitting files into chunks
metadata:
@@ -10091,70 +10149,6 @@ components:
- deleted
title: VectorStoreDeleteResponse
description: Response from deleting a vector store.
VectorStoreChunkingStrategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
- $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
discriminator:
propertyName: type
mapping:
auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
VectorStoreChunkingStrategyAuto:
type: object
properties:
type:
type: string
const: auto
default: auto
description: >-
Strategy type, always "auto" for automatic chunking
additionalProperties: false
required:
- type
title: VectorStoreChunkingStrategyAuto
description: >-
Automatic chunking strategy for vector store files.
VectorStoreChunkingStrategyStatic:
type: object
properties:
type:
type: string
const: static
default: static
description: >-
Strategy type, always "static" for static chunking
static:
$ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
description: >-
Configuration parameters for the static chunking strategy
additionalProperties: false
required:
- type
- static
title: VectorStoreChunkingStrategyStatic
description: >-
Static chunking strategy with configurable parameters.
VectorStoreChunkingStrategyStaticConfig:
type: object
properties:
chunk_overlap_tokens:
type: integer
default: 400
description: >-
Number of tokens to overlap between adjacent chunks
max_chunk_size_tokens:
type: integer
default: 800
description: >-
Maximum number of tokens per chunk, must be between 100 and 4096
additionalProperties: false
required:
- chunk_overlap_tokens
- max_chunk_size_tokens
title: VectorStoreChunkingStrategyStaticConfig
description: >-
Configuration for static chunking strategy.
"OpenAICreateVectorStoreFileBatchRequestWithExtraBody": "OpenAICreateVectorStoreFileBatchRequestWithExtraBody":
type: object type: object
properties: properties:
@@ -10612,7 +10606,9 @@ components:
description: >-
Object type identifier for the search results page
search_query:
-type: string
+type: array
+items:
+type: string
description: >-
The original search query that was executed
data:
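With this schema change, `chunking_strategy` becomes a typed discriminated union instead of a free-form object. A minimal sketch (not part of the diff) of a request using the static strategy through an OpenAI-compatible client pointed at a local Llama Stack server; the base URL, API key, and store name are assumptions for illustration:

```python
from openai import OpenAI

# Assumed local Llama Stack endpoint; adjust for your deployment.
client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

# Explicit static chunking, matching VectorStoreChunkingStrategyStatic above.
vector_store = client.vector_stores.create(
    name="my-docs",
    chunking_strategy={
        "type": "static",
        "static": {
            "chunk_overlap_tokens": 400,   # schema default
            "max_chunk_size_tokens": 800,  # must be between 100 and 4096
        },
    },
)
print(vector_store.id)
```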


@@ -35,9 +35,6 @@ Here are the key topics that will help you build effective AI applications:
- **[Telemetry](./telemetry.mdx)** - Monitor and analyze your agents' performance and behavior
- **[Safety](./safety.mdx)** - Implement guardrails and safety measures to ensure responsible AI behavior
-### 🎮 **Interactive Development**
-- **[Playground](./playground.mdx)** - Interactive environment for testing and developing applications
## Application Patterns
### 🤖 **Conversational Agents**


@@ -1,298 +0,0 @@
---
title: Llama Stack Playground
description: Interactive interface to explore and experiment with Llama Stack capabilities
sidebar_label: Playground
sidebar_position: 10
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Llama Stack Playground
:::note[Experimental Feature]
The Llama Stack Playground is currently experimental and subject to change. We welcome feedback and contributions to help improve it.
:::
The Llama Stack Playground is a simple interface that aims to:
- **Showcase capabilities and concepts** of Llama Stack in an interactive environment
- **Demo end-to-end application code** to help users get started building their own applications
- **Provide a UI** to help users inspect and understand Llama Stack API providers and resources
## Key Features
### Interactive Playground Pages
The playground provides interactive pages for users to explore Llama Stack API capabilities:
#### Chatbot Interface
<video
controls
autoPlay
playsInline
muted
loop
style={{width: '100%'}}
>
<source src="https://github.com/user-attachments/assets/8d2ef802-5812-4a28-96e1-316038c84cbf" type="video/mp4" />
Your browser does not support the video tag.
</video>
<Tabs>
<TabItem value="chat" label="Chat">
**Simple Chat Interface**
- Chat directly with Llama models through an intuitive interface
- Uses the `/chat/completions` streaming API under the hood
- Real-time message streaming for responsive interactions
- Perfect for testing model capabilities and prompt engineering
</TabItem>
<TabItem value="rag" label="RAG Chat">
**Document-Aware Conversations**
- Upload documents to create memory banks
- Chat with a RAG-enabled agent that can query your documents
- Uses Llama Stack's `/agents` API to create and manage RAG sessions
- Ideal for exploring knowledge-enhanced AI applications
</TabItem>
</Tabs>
#### Evaluation Interface
<video
controls
autoPlay
playsInline
muted
loop
style={{width: '100%'}}
>
<source src="https://github.com/user-attachments/assets/6cc1659f-eba4-49ca-a0a5-7c243557b4f5" type="video/mp4" />
Your browser does not support the video tag.
</video>
<Tabs>
<TabItem value="scoring" label="Scoring Evaluations">
**Custom Dataset Evaluation**
- Upload your own evaluation datasets
- Run evaluations using available scoring functions
- Uses Llama Stack's `/scoring` API for flexible evaluation workflows
- Great for testing application performance on custom metrics
</TabItem>
<TabItem value="benchmarks" label="Benchmark Evaluations">
<video
controls
autoPlay
playsInline
muted
loop
style={{width: '100%', marginBottom: '1rem'}}
>
<source src="https://github.com/user-attachments/assets/345845c7-2a2b-4095-960a-9ae40f6a93cf" type="video/mp4" />
Your browser does not support the video tag.
</video>
**Pre-registered Evaluation Tasks**
- Evaluate models or agents on pre-defined tasks
- Uses Llama Stack's `/eval` API for comprehensive evaluation
- Combines datasets and scoring functions for standardized testing
**Setup Requirements:**
Register evaluation datasets and benchmarks first:
```bash
# Register evaluation dataset
llama-stack-client datasets register \
--dataset-id "mmlu" \
--provider-id "huggingface" \
--url "https://huggingface.co/datasets/llamastack/evals" \
--metadata '{"path": "llamastack/evals", "name": "evals__mmlu__details", "split": "train"}' \
--schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string"}, "chat_completion_input": {"type": "string"}}'
# Register benchmark task
llama-stack-client benchmarks register \
--eval-task-id meta-reference-mmlu \
--provider-id meta-reference \
--dataset-id mmlu \
--scoring-functions basic::regex_parser_multiple_choice_answer
```
</TabItem>
</Tabs>
#### Inspection Interface
<video
controls
autoPlay
playsInline
muted
loop
style={{width: '100%'}}
>
<source src="https://github.com/user-attachments/assets/01d52b2d-92af-4e3a-b623-a9b8ba22ba99" type="video/mp4" />
Your browser does not support the video tag.
</video>
<Tabs>
<TabItem value="providers" label="API Providers">
**Provider Management**
- Inspect available Llama Stack API providers
- View provider configurations and capabilities
- Uses the `/providers` API for real-time provider information
- Essential for understanding your deployment's capabilities
</TabItem>
<TabItem value="resources" label="API Resources">
**Resource Exploration**
- Inspect Llama Stack API resources including:
- **Models**: Available language models
- **Datasets**: Registered evaluation datasets
- **Memory Banks**: Vector databases and knowledge stores
- **Benchmarks**: Evaluation tasks and scoring functions
- **Shields**: Safety and content moderation tools
- Uses `/<resources>/list` APIs for comprehensive resource visibility
- For detailed information about resources, see [Core Concepts](/docs/concepts)
</TabItem>
</Tabs>
## Getting Started
### Quick Start Guide
<Tabs>
<TabItem value="setup" label="Setup">
**1. Start the Llama Stack API Server**
```bash
llama stack list-deps together | xargs -L1 uv pip install
llama stack run together
```
**2. Start the Streamlit UI**
```bash
# Launch the playground interface
uv run --with ".[ui]" streamlit run llama_stack.core/ui/app.py
```
</TabItem>
<TabItem value="usage" label="Usage Tips">
**Making the Most of the Playground:**
- **Start with Chat**: Test basic model interactions and prompt engineering
- **Explore RAG**: Upload sample documents to see knowledge-enhanced responses
- **Try Evaluations**: Use the scoring interface to understand evaluation metrics
- **Inspect Resources**: Check what providers and resources are available
- **Experiment with Settings**: Adjust parameters to see how they affect results
</TabItem>
</Tabs>
### Available Distributions
The playground works with any Llama Stack distribution. Popular options include:
<Tabs>
<TabItem value="together" label="Together AI">
```bash
llama stack list-deps together | xargs -L1 uv pip install
llama stack run together
```
**Features:**
- Cloud-hosted models
- Fast inference
- Multiple model options
</TabItem>
<TabItem value="ollama" label="Ollama (Local)">
```bash
llama stack list-deps ollama | xargs -L1 uv pip install
llama stack run ollama
```
**Features:**
- Local model execution
- Privacy-focused
- No internet required
</TabItem>
<TabItem value="meta-reference" label="Meta Reference">
```bash
llama stack list-deps meta-reference | xargs -L1 uv pip install
llama stack run meta-reference
```
**Features:**
- Reference implementation
- All API features available
- Best for development
</TabItem>
</Tabs>
## Use Cases & Examples
### Educational Use Cases
- **Learning Llama Stack**: Hands-on exploration of API capabilities
- **Prompt Engineering**: Interactive testing of different prompting strategies
- **RAG Experimentation**: Understanding how document retrieval affects responses
- **Evaluation Understanding**: See how different metrics evaluate model performance
### Development Use Cases
- **Prototype Testing**: Quick validation of application concepts
- **API Exploration**: Understanding available endpoints and parameters
- **Integration Planning**: Seeing how different components work together
- **Demo Creation**: Showcasing Llama Stack capabilities to stakeholders
### Research Use Cases
- **Model Comparison**: Side-by-side testing of different models
- **Evaluation Design**: Understanding how scoring functions work
- **Safety Testing**: Exploring shield effectiveness with different inputs
- **Performance Analysis**: Measuring model behavior across different scenarios
## Best Practices
### 🚀 **Getting Started**
- Begin with simple chat interactions to understand basic functionality
- Gradually explore more advanced features like RAG and evaluations
- Use the inspection tools to understand your deployment's capabilities
### 🔧 **Development Workflow**
- Use the playground to prototype before writing application code
- Test different parameter settings interactively
- Validate evaluation approaches before implementing them programmatically
### 📊 **Evaluation & Testing**
- Start with simple scoring functions before trying complex evaluations
- Use the playground to understand evaluation results before automation
- Test safety features with various input types
### 🎯 **Production Preparation**
- Use playground insights to inform your production API usage
- Test edge cases and error conditions interactively
- Validate resource configurations before deployment
## Related Resources
- **[Getting Started Guide](../getting_started/quickstart)** - Complete setup and introduction
- **[Core Concepts](/docs/concepts)** - Understanding Llama Stack fundamentals
- **[Agents](./agent)** - Building intelligent agents
- **[RAG (Retrieval Augmented Generation)](./rag)** - Knowledge-enhanced applications
- **[Evaluations](./evals)** - Comprehensive evaluation framework
- **[API Reference](/docs/api/llama-stack-specification)** - Complete API documentation


@@ -11,7 +11,7 @@ If you are planning to use an external service for Inference (even Ollama or TGI
This avoids the overhead of setting up a server.
```bash
# setup
-uv pip install llama-stack
+uv pip install llama-stack llama-stack-client
llama stack list-deps starter | xargs -L1 uv pip install
```
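Since `llama-stack-client` is now installed explicitly (it is no longer a hard dependency of `llama-stack`; see the `pyproject.toml` hunk later in this diff), a hedged sketch of a sanity check that the separately installed client can reach a local server; the port is the usual Llama Stack default and may differ in your setup:

```python
from llama_stack_client import LlamaStackClient

# Assumed default server address; adjust if your server runs elsewhere.
client = LlamaStackClient(base_url="http://localhost:8321")
print([m.identifier for m in client.models.list()])
```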


@@ -163,7 +163,41 @@ docker run \
--port $LLAMA_STACK_PORT
```
-### Via venv
+The container will run the distribution with a SQLite store by default. This store is used for the following components:
- Metadata store: store metadata about the models, providers, etc.
- Inference store: collection of responses from the inference provider
- Agents store: store agent configurations (sessions, turns, etc.)
- Agents Responses store: store responses from the agents
However, you can use PostgreSQL instead by running the `starter::run-with-postgres-store.yaml` configuration:
```bash
docker run \
-it \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-e OPENAI_API_KEY=your_openai_key \
-e FIREWORKS_API_KEY=your_fireworks_key \
-e TOGETHER_API_KEY=your_together_key \
-e POSTGRES_HOST=your_postgres_host \
-e POSTGRES_PORT=your_postgres_port \
-e POSTGRES_DB=your_postgres_db \
-e POSTGRES_USER=your_postgres_user \
-e POSTGRES_PASSWORD=your_postgres_password \
llamastack/distribution-starter \
starter::run-with-postgres-store.yaml
```
Postgres environment variables:
- `POSTGRES_HOST`: Postgres host (default: `localhost`)
- `POSTGRES_PORT`: Postgres port (default: `5432`)
- `POSTGRES_DB`: Postgres database name (default: `llamastack`)
- `POSTGRES_USER`: Postgres username (default: `llamastack`)
- `POSTGRES_PASSWORD`: Postgres password (default: `llamastack`)
### Via Conda or venv
Ensure you have configured the starter distribution using the environment variables explained above.
@@ -171,8 +205,11 @@ Ensure you have configured the starter distribution using the environment variab
# Install dependencies for the starter distribution
uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
-# Run the server
+# Run the server (with SQLite - default)
uv run --with llama-stack llama stack run starter
+# Or run with PostgreSQL
+uv run --with llama-stack llama stack run starter::run-with-postgres-store.yaml
```
## Example Usage


@@ -1,5 +1,5 @@
---
-description: "AWS Bedrock inference provider for accessing various AI models through AWS's managed service."
+description: "AWS Bedrock inference provider using OpenAI compatible endpoint."
sidebar_label: Remote - Bedrock
title: remote::bedrock
---
@@ -8,7 +8,7 @@ title: remote::bedrock
## Description
-AWS Bedrock inference provider for accessing various AI models through AWS's managed service.
+AWS Bedrock inference provider using OpenAI compatible endpoint.
## Configuration
@@ -16,19 +16,12 @@ AWS Bedrock inference provider for accessing various AI models through AWS's man
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `aws_access_key_id` | `str \| None` | No | | The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID |
-| `aws_secret_access_key` | `str \| None` | No | | The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY |
-| `aws_session_token` | `str \| None` | No | | The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN |
-| `region_name` | `str \| None` | No | | The default AWS Region to use, for example, us-west-1 or us-west-2. Default use environment variable: AWS_DEFAULT_REGION |
-| `profile_name` | `str \| None` | No | | The profile name that contains credentials to use. Default use environment variable: AWS_PROFILE |
-| `total_max_attempts` | `int \| None` | No | | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS |
-| `retry_mode` | `str \| None` | No | | A string representing the type of retries Boto3 will perform. Default use environment variable: AWS_RETRY_MODE |
-| `connect_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
-| `read_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to read from a connection. The default is 60 seconds. |
-| `session_ttl` | `int \| None` | No | 3600 | The time in seconds till a session expires. The default is 3600 seconds (1 hour). |
+| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
+| `region_name` | `<class 'str'>` | No | us-east-2 | AWS Region for the Bedrock Runtime endpoint |
## Sample Configuration
```yaml
-{}
+api_key: ${env.AWS_BEDROCK_API_KEY:=}
+region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
```


@@ -16,7 +16,7 @@ Passthrough inference provider for connecting to any external inference service
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No | | API Key for the passthrouth endpoint |
+| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | | The URL for the passthrough endpoint |

## Sample Configuration


@@ -48,11 +48,9 @@ Both OpenAI and Llama Stack support a web-search built-in tool. The [OpenAI doc
> The type of the web search tool. One of `web_search` or `web_search_2025_08_26`.
-In contrast, the [Llama Stack documentation](https://llamastack.github.io/docs/api/create-a-new-open-ai-response) says that the allowed values for `type` for web search are `MOD1`, `MOD2` and `MOD3`.
-Is that correct? If so, what are the meanings of each of them? It might make sense for the allowed values for OpenAI map to some values for Llama Stack so that code written to the OpenAI specification
-also work with Llama Stack.
+Llama Stack now supports both `web_search` and `web_search_2025_08_26` types, matching OpenAI's API. For backward compatibility, Llama Stack also supports `web_search_preview` and `web_search_preview_2025_03_11` types.
-The OpenAI web search tool also has fields for `filters` and `user_location` which are not documented as options for Llama Stack. If feasible, it would be good to support these too.
+The OpenAI web search tool also has fields for `filters` and `user_location` which are not yet implemented in Llama Stack. If feasible, it would be good to support these too.
---
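For illustration, a hedged sketch (not from this diff) of exercising the newly supported type through the OpenAI-compatible Responses endpoint; the base URL, API key, and model id are assumptions:

```python
from openai import OpenAI

# Assumed local Llama Stack endpoint and an illustrative model id.
client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

response = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    input="What changed in the latest Llama Stack release?",
    tools=[{"type": "web_search_2025_08_26"}],  # new type from this change
)
print(response.output_text)
```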


@@ -37,7 +37,7 @@
"outputs": [],
"source": [
"# NBVAL_SKIP\n",
-"!pip install -U llama-stack\n",
+"!pip install -U llama-stack llama-stack-client\n",
"llama stack list-deps fireworks | xargs -L1 uv pip install\n"
]
},


@@ -44,7 +44,7 @@
"outputs": [],
"source": [
"# NBVAL_SKIP\n",
-"!pip install -U llama-stack"
+"!pip install -U llama-stack llama-stack-client\n"
]
},
{


@@ -74,6 +74,7 @@
"source": [
"```bash\n",
"uv sync --extra dev\n",
+"uv pip install -U llama-stack-client\n",
"uv pip install -e .\n",
"source .venv/bin/activate\n",
"```"


@@ -6075,6 +6075,8 @@ components:
const: web_search_preview
- type: string
const: web_search_preview_2025_03_11
+- type: string
+const: web_search_2025_08_26
default: web_search
description: Web search tool type variant to use
search_context_size:
@@ -9266,6 +9268,70 @@ components:
- metadata
title: VectorStoreObject
description: OpenAI Vector Store object.
VectorStoreChunkingStrategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
- $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
discriminator:
propertyName: type
mapping:
auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
VectorStoreChunkingStrategyAuto:
type: object
properties:
type:
type: string
const: auto
default: auto
description: >-
Strategy type, always "auto" for automatic chunking
additionalProperties: false
required:
- type
title: VectorStoreChunkingStrategyAuto
description: >-
Automatic chunking strategy for vector store files.
VectorStoreChunkingStrategyStatic:
type: object
properties:
type:
type: string
const: static
default: static
description: >-
Strategy type, always "static" for static chunking
static:
$ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
description: >-
Configuration parameters for the static chunking strategy
additionalProperties: false
required:
- type
- static
title: VectorStoreChunkingStrategyStatic
description: >-
Static chunking strategy with configurable parameters.
VectorStoreChunkingStrategyStaticConfig:
type: object
properties:
chunk_overlap_tokens:
type: integer
default: 400
description: >-
Number of tokens to overlap between adjacent chunks
max_chunk_size_tokens:
type: integer
default: 800
description: >-
Maximum number of tokens per chunk, must be between 100 and 4096
additionalProperties: false
required:
- chunk_overlap_tokens
- max_chunk_size_tokens
title: VectorStoreChunkingStrategyStaticConfig
description: >-
Configuration for static chunking strategy.
"OpenAICreateVectorStoreRequestWithExtraBody": "OpenAICreateVectorStoreRequestWithExtraBody":
type: object type: object
properties: properties:
@@ -9291,15 +9357,7 @@ components:
description: >-
(Optional) Expiration policy for the vector store
chunking_strategy:
-type: object
-additionalProperties:
-oneOf:
-- type: 'null'
-- type: boolean
-- type: number
-- type: string
-- type: array
-- type: object
+$ref: '#/components/schemas/VectorStoreChunkingStrategy'
description: >-
(Optional) Strategy for splitting files into chunks
metadata:
@@ -9375,70 +9433,6 @@ components:
- deleted
title: VectorStoreDeleteResponse
description: Response from deleting a vector store.
VectorStoreChunkingStrategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
- $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
discriminator:
propertyName: type
mapping:
auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
VectorStoreChunkingStrategyAuto:
type: object
properties:
type:
type: string
const: auto
default: auto
description: >-
Strategy type, always "auto" for automatic chunking
additionalProperties: false
required:
- type
title: VectorStoreChunkingStrategyAuto
description: >-
Automatic chunking strategy for vector store files.
VectorStoreChunkingStrategyStatic:
type: object
properties:
type:
type: string
const: static
default: static
description: >-
Strategy type, always "static" for static chunking
static:
$ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
description: >-
Configuration parameters for the static chunking strategy
additionalProperties: false
required:
- type
- static
title: VectorStoreChunkingStrategyStatic
description: >-
Static chunking strategy with configurable parameters.
VectorStoreChunkingStrategyStaticConfig:
type: object
properties:
chunk_overlap_tokens:
type: integer
default: 400
description: >-
Number of tokens to overlap between adjacent chunks
max_chunk_size_tokens:
type: integer
default: 800
description: >-
Maximum number of tokens per chunk, must be between 100 and 4096
additionalProperties: false
required:
- chunk_overlap_tokens
- max_chunk_size_tokens
title: VectorStoreChunkingStrategyStaticConfig
description: >-
Configuration for static chunking strategy.
"OpenAICreateVectorStoreFileBatchRequestWithExtraBody": "OpenAICreateVectorStoreFileBatchRequestWithExtraBody":
type: object type: object
properties: properties:
@@ -9896,7 +9890,9 @@ components:
description: >-
Object type identifier for the search results page
search_query:
-type: string
+type: array
+items:
+type: string
description: >-
The original search query that was executed
data:


@@ -6791,6 +6791,8 @@ components:
const: web_search_preview
- type: string
const: web_search_preview_2025_03_11
+- type: string
+const: web_search_2025_08_26
default: web_search
description: Web search tool type variant to use
search_context_size:
@@ -9982,6 +9984,70 @@ components:
- metadata
title: VectorStoreObject
description: OpenAI Vector Store object.
VectorStoreChunkingStrategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
- $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
discriminator:
propertyName: type
mapping:
auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
VectorStoreChunkingStrategyAuto:
type: object
properties:
type:
type: string
const: auto
default: auto
description: >-
Strategy type, always "auto" for automatic chunking
additionalProperties: false
required:
- type
title: VectorStoreChunkingStrategyAuto
description: >-
Automatic chunking strategy for vector store files.
VectorStoreChunkingStrategyStatic:
type: object
properties:
type:
type: string
const: static
default: static
description: >-
Strategy type, always "static" for static chunking
static:
$ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
description: >-
Configuration parameters for the static chunking strategy
additionalProperties: false
required:
- type
- static
title: VectorStoreChunkingStrategyStatic
description: >-
Static chunking strategy with configurable parameters.
VectorStoreChunkingStrategyStaticConfig:
type: object
properties:
chunk_overlap_tokens:
type: integer
default: 400
description: >-
Number of tokens to overlap between adjacent chunks
max_chunk_size_tokens:
type: integer
default: 800
description: >-
Maximum number of tokens per chunk, must be between 100 and 4096
additionalProperties: false
required:
- chunk_overlap_tokens
- max_chunk_size_tokens
title: VectorStoreChunkingStrategyStaticConfig
description: >-
Configuration for static chunking strategy.
"OpenAICreateVectorStoreRequestWithExtraBody": "OpenAICreateVectorStoreRequestWithExtraBody":
type: object type: object
properties: properties:
@@ -10007,15 +10073,7 @@ components:
description: >-
(Optional) Expiration policy for the vector store
chunking_strategy:
-type: object
-additionalProperties:
-oneOf:
-- type: 'null'
-- type: boolean
-- type: number
-- type: string
-- type: array
-- type: object
+$ref: '#/components/schemas/VectorStoreChunkingStrategy'
description: >-
(Optional) Strategy for splitting files into chunks
metadata:
@@ -10091,70 +10149,6 @@ components:
- deleted
title: VectorStoreDeleteResponse
description: Response from deleting a vector store.
VectorStoreChunkingStrategy:
oneOf:
- $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
- $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
discriminator:
propertyName: type
mapping:
auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
VectorStoreChunkingStrategyAuto:
type: object
properties:
type:
type: string
const: auto
default: auto
description: >-
Strategy type, always "auto" for automatic chunking
additionalProperties: false
required:
- type
title: VectorStoreChunkingStrategyAuto
description: >-
Automatic chunking strategy for vector store files.
VectorStoreChunkingStrategyStatic:
type: object
properties:
type:
type: string
const: static
default: static
description: >-
Strategy type, always "static" for static chunking
static:
$ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
description: >-
Configuration parameters for the static chunking strategy
additionalProperties: false
required:
- type
- static
title: VectorStoreChunkingStrategyStatic
description: >-
Static chunking strategy with configurable parameters.
VectorStoreChunkingStrategyStaticConfig:
type: object
properties:
chunk_overlap_tokens:
type: integer
default: 400
description: >-
Number of tokens to overlap between adjacent chunks
max_chunk_size_tokens:
type: integer
default: 800
description: >-
Maximum number of tokens per chunk, must be between 100 and 4096
additionalProperties: false
required:
- chunk_overlap_tokens
- max_chunk_size_tokens
title: VectorStoreChunkingStrategyStaticConfig
description: >-
Configuration for static chunking strategy.
"OpenAICreateVectorStoreFileBatchRequestWithExtraBody": "OpenAICreateVectorStoreFileBatchRequestWithExtraBody":
type: object type: object
properties: properties:
@@ -10612,7 +10606,9 @@ components:
description: >-
Object type identifier for the search results page
search_query:
-type: string
+type: array
+items:
+type: string
description: >-
The original search query that was executed
data:


@@ -24,13 +24,13 @@ classifiers = [
    "Topic :: Scientific/Engineering :: Information Analysis",
]
dependencies = [
+    "PyYAML>=6.0",
    "aiohttp",
    "fastapi>=0.115.0,<1.0", # server
    "fire", # for MCP in LLS client
    "httpx",
    "jinja2>=3.1.6",
    "jsonschema",
-    "llama-stack-client>=0.3.0",
    "openai>=2.5.0",
    "prompt-toolkit",
    "python-dotenv",
@@ -52,11 +52,8 @@ dependencies = [
]

[project.optional-dependencies]
-ui = [
-    "streamlit",
-    "pandas",
-    "llama-stack-client>=0.3.0",
-    "streamlit-option-menu",
+client = [
+    "llama-stack-client>=0.3.0", # Optional for library-only usage
]

[dependency-groups]
@@ -104,6 +101,7 @@ type_checking = [
    "lm-format-enforcer",
    "mcp",
    "ollama",
+    "llama-stack-client>=0.3.0",
]
# These are the dependencies required for running unit tests.
unit = [
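Because `llama-stack-client` moves from a hard dependency to the optional `client` extra here, downstream code that imports it may want a guarded import. A hedged sketch (the error message and extra name usage are illustrative):

```python
# Guard the now-optional client import and point users at the extra.
try:
    from llama_stack_client import LlamaStackClient
except ImportError as e:
    raise ImportError(
        "llama-stack-client is not installed; "
        "install it with `pip install 'llama-stack[client]'`"
    ) from e
```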


@@ -231,7 +231,8 @@ if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
# Use a fixed port for the OTEL collector so the server can connect to it
COLLECTOR_PORT=4317
export LLAMA_STACK_TEST_COLLECTOR_PORT="${COLLECTOR_PORT}"
-export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:${COLLECTOR_PORT}"
+# Disabled: https://github.com/llamastack/llama-stack/issues/4089
+#export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:${COLLECTOR_PORT}"
export OTEL_EXPORTER_OTLP_PROTOCOL="http/protobuf"
export OTEL_BSP_SCHEDULE_DELAY="200"
export OTEL_BSP_EXPORT_TIMEOUT="2000"
@@ -337,7 +338,8 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
DOCKER_ENV_VARS=""
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_INFERENCE_MODE=$INFERENCE_MODE"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_STACK_CONFIG_TYPE=server"
-DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:${COLLECTOR_PORT}"
+# Disabled: https://github.com/llamastack/llama-stack/issues/4089
+#DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:${COLLECTOR_PORT}"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_METRIC_EXPORT_INTERVAL=200"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_BSP_SCHEDULE_DELAY=200"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_BSP_EXPORT_TIMEOUT=2000"
@@ -353,6 +355,10 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
[ -n "${OLLAMA_URL:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OLLAMA_URL=$OLLAMA_URL"
[ -n "${SAFETY_MODEL:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e SAFETY_MODEL=$SAFETY_MODEL"
+if [[ "$TEST_SETUP" == "vllm" ]]; then
+  DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e VLLM_URL=http://localhost:8000/v1"
+fi
# Determine the actual image name (may have localhost/ prefix)
IMAGE_NAME=$(docker images --format "{{.Repository}}:{{.Tag}}" | grep "distribution-$DISTRO:dev$" | head -1)
if [[ -z "$IMAGE_NAME" ]]; then
@@ -405,11 +411,6 @@ fi
echo "=== Running Integration Tests ==="
EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag"
-# Additional exclusions for vllm setup
-if [[ "$TEST_SETUP" == "vllm" ]]; then
-  EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls"
-fi
PYTEST_PATTERN="not( $EXCLUDE_TESTS )"
if [[ -n "$TEST_PATTERN" ]]; then
PYTEST_PATTERN="${PYTEST_PATTERN} and $TEST_PATTERN"


@@ -3,8 +3,3 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-from llama_stack.core.library_client import (  # noqa: F401
-    AsyncLlamaStackAsLibraryClient,
-    LlamaStackAsLibraryClient,
-)
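With this re-export removed, callers would import the library client from its defining module instead. A hedged sketch, assuming `llama_stack.core.library_client` remains the canonical location; the constructor argument is illustrative:

```python
# Import directly from the defining module rather than the package root.
from llama_stack.core.library_client import LlamaStackAsLibraryClient

client = LlamaStackAsLibraryClient("starter")  # distro name is illustrative
```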


@@ -403,7 +403,7 @@ class OpenAIResponseText(BaseModel):
# Must match type Literals of OpenAIResponseInputToolWebSearch below
-WebSearchToolTypes = ["web_search", "web_search_preview", "web_search_preview_2025_03_11"]
+WebSearchToolTypes = ["web_search", "web_search_preview", "web_search_preview_2025_03_11", "web_search_2025_08_26"]

@json_schema_type
@@ -415,9 +415,12 @@ class OpenAIResponseInputToolWebSearch(BaseModel):
    """

    # Must match values of WebSearchToolTypes above
-    type: Literal["web_search"] | Literal["web_search_preview"] | Literal["web_search_preview_2025_03_11"] = (
-        "web_search"
-    )
+    type: (
+        Literal["web_search"]
+        | Literal["web_search_preview"]
+        | Literal["web_search_preview_2025_03_11"]
+        | Literal["web_search_2025_08_26"]
+    ) = "web_search"
    # TODO: actually use search_context_size somewhere...
    search_context_size: str | None = Field(default="medium", pattern="^low|medium|high$")
    # TODO: add user_location
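A hedged sketch of the widened literal in use, assuming the names defined in this module:

```python
from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseInputToolWebSearch,
    WebSearchToolTypes,
)

# The new variant validates just like the existing ones.
tool = OpenAIResponseInputToolWebSearch(type="web_search_2025_08_26")
assert tool.type in WebSearchToolTypes
```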


@@ -34,3 +34,44 @@ class PaginatedResponse(BaseModel):
    data: list[dict[str, Any]]
    has_more: bool
    url: str | None = None
# This is a short term solution to allow inference API to return metrics
# The ideal way to do this is to have a way for all response types to include metrics
# and all metric events logged to the telemetry API to be included with the response
# To do this, we will need to augment all response types with a metrics field.
# We have hit a blocker from stainless SDK that prevents us from doing this.
# The blocker is that if we were to augment the response types that have a data field
# in them like so
# class ListModelsResponse(BaseModel):
# metrics: Optional[List[MetricEvent]] = None
# data: List[Models]
# ...
# The client SDK will need to access the data by using a .data field, which is not
# ergonomic. Stainless SDK does support unwrapping the response type, but it
# requires that the response type to only have a single field.
# We will need a way in the client SDK to signal that the metrics are needed
# and if they are needed, the client SDK has to return the full response type
# without unwrapping it.
@json_schema_type
class MetricInResponse(BaseModel):
    """A metric value included in API responses.

    :param metric: The name of the metric
    :param value: The numeric value of the metric
    :param unit: (Optional) The unit of measurement for the metric value
    """

    metric: str
    value: int | float
    unit: str | None = None


class MetricResponseMixin(BaseModel):
    """Mixin class for API responses that can include metrics.

    :param metrics: (Optional) List of metrics associated with the API response
    """

    metrics: list[MetricInResponse] | None = None
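A hedged sketch of how a response type could opt into metrics via the mixin; `ExampleListResponse` is a made-up name for illustration:

```python
from llama_stack.apis.common.responses import MetricInResponse, MetricResponseMixin


class ExampleListResponse(MetricResponseMixin):
    data: list[str]


resp = ExampleListResponse(
    data=["a", "b"],
    metrics=[MetricInResponse(metric="prompt_tokens", value=12, unit="tokens")],
)
print(resp.metrics[0].metric)  # prompt_tokens
```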


@@ -0,0 +1,22 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.


def telemetry_traceable(cls):
    """
    Mark a protocol for automatic tracing when telemetry is enabled.

    This is a metadata-only decorator with no dependencies on core.
    Actual tracing is applied by core routers at runtime if telemetry is enabled.

    Usage:
        @runtime_checkable
        @telemetry_traceable
        class MyProtocol(Protocol):
            ...
    """
    cls.__marked_for_tracing__ = True
    return cls
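A hedged sketch of the consumer side: how a core router might check the marker at runtime. The function name and wrapping logic are illustrative assumptions, not code from this diff:

```python
from llama_stack.apis.common.tracing import telemetry_traceable


def maybe_apply_tracing(protocol_cls, telemetry_enabled: bool):
    # Only protocols explicitly marked via @telemetry_traceable get wrapped.
    if telemetry_enabled and getattr(protocol_cls, "__marked_for_tracing__", False):
        ...  # e.g. wrap protocol methods in tracing spans here
    return protocol_cls
```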


@@ -20,8 +20,8 @@ from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseOutputMessageMCPListTools,
    OpenAIResponseOutputMessageWebSearchToolCall,
)
+from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod

Metadata = dict[str, str]
@@ -157,7 +157,7 @@ class ConversationItemDeletedResource(BaseModel):
@runtime_checkable
-@trace_protocol
+@telemetry_traceable
class Conversations(Protocol):
    """Conversations


@@ -11,8 +11,8 @@ from fastapi import File, Form, Response, UploadFile
from pydantic import BaseModel, Field

from llama_stack.apis.common.responses import Order
+from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@@ -102,7 +102,7 @@ class OpenAIFileDeleteResponse(BaseModel):
@runtime_checkable
-@trace_protocol
+@telemetry_traceable
class Files(Protocol):
    """Files

View file

@ -19,11 +19,10 @@ from pydantic import BaseModel, Field, field_validator
from typing_extensions import TypedDict
from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
-from llama_stack.apis.common.responses import Order
+from llama_stack.apis.common.responses import MetricResponseMixin, Order
+from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.models import Model
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
-from llama_stack.core.telemetry.telemetry import MetricResponseMixin
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.models.llama.datatypes import (
    BuiltinTool,
    StopReason,
@ -1160,7 +1159,7 @@ class OpenAIEmbeddingsRequestWithExtraBody(BaseModel, extra="allow"):
@runtime_checkable
-@trace_protocol
+@telemetry_traceable
class InferenceProvider(Protocol):
    """
    This protocol defines the interface that should be implemented by all inference providers.

View file

@ -9,9 +9,9 @@ from typing import Any, Literal, Protocol, runtime_checkable
from pydantic import BaseModel, ConfigDict, Field, field_validator
+from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@ -105,7 +105,7 @@ class OpenAIListModelsResponse(BaseModel):
@runtime_checkable
-@trace_protocol
+@telemetry_traceable
class Models(Protocol):
    async def list_models(self) -> ListModelsResponse:
        """List all models.

View file

@ -10,8 +10,8 @@ from typing import Protocol, runtime_checkable
from pydantic import BaseModel, Field, field_validator, model_validator
+from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@ -92,7 +92,7 @@ class ListPromptsResponse(BaseModel):
@runtime_checkable
-@trace_protocol
+@telemetry_traceable
class Prompts(Protocol):
    """Prompts

View file

@ -9,10 +9,10 @@ from typing import Any, Protocol, runtime_checkable
from pydantic import BaseModel, Field
+from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.inference import OpenAIMessageParam
from llama_stack.apis.shields import Shield
from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@ -94,7 +94,7 @@ class ShieldStore(Protocol):
@runtime_checkable
-@trace_protocol
+@telemetry_traceable
class Safety(Protocol):
    """Safety

View file

@ -8,9 +8,9 @@ from typing import Any, Literal, Protocol, runtime_checkable
from pydantic import BaseModel
+from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@ -48,7 +48,7 @@ class ListShieldsResponse(BaseModel):
@runtime_checkable
-@trace_protocol
+@telemetry_traceable
class Shields(Protocol):
    @webmethod(route="/shields", method="GET", level=LLAMA_STACK_API_V1)
    async def list_shields(self) -> ListShieldsResponse:

View file

@ -11,9 +11,9 @@ from pydantic import BaseModel
from typing_extensions import runtime_checkable
from llama_stack.apis.common.content_types import URL, InterleavedContent
+from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@ -107,7 +107,7 @@ class ListToolDefsResponse(BaseModel):
@runtime_checkable
-@trace_protocol
+@telemetry_traceable
class ToolGroups(Protocol):
    @webmethod(route="/toolgroups", method="POST", level=LLAMA_STACK_API_V1)
    async def register_tool_group(
@ -189,7 +189,7 @@ class SpecialToolGroup(Enum):
@runtime_checkable
-@trace_protocol
+@telemetry_traceable
class ToolRuntime(Protocol):
    tool_store: ToolStore | None = None

View file

@ -13,10 +13,10 @@ from typing import Annotated, Any, Literal, Protocol, runtime_checkable
from fastapi import Body
from pydantic import BaseModel, Field
+from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.inference import InterleavedContent
from llama_stack.apis.vector_stores import VectorStore
from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
from llama_stack.strong_typing.schema import register_schema
@ -260,7 +260,7 @@ class VectorStoreSearchResponsePage(BaseModel):
    """
    object: str = "vector_store.search_results.page"
-    search_query: str
+    search_query: list[str]
    data: list[VectorStoreSearchResponse]
    has_more: bool = False
    next_page: str | None = None
@ -478,7 +478,7 @@ class OpenAICreateVectorStoreRequestWithExtraBody(BaseModel, extra="allow"):
    name: str | None = None
    file_ids: list[str] | None = None
    expires_after: dict[str, Any] | None = None
-    chunking_strategy: dict[str, Any] | None = None
+    chunking_strategy: VectorStoreChunkingStrategy | None = None
    metadata: dict[str, Any] | None = None
@ -502,7 +502,7 @@ class VectorStoreTable(Protocol):
@runtime_checkable
-@trace_protocol
+@telemetry_traceable
class VectorIO(Protocol):
    vector_store_table: VectorStoreTable | None = None

View file

@ -18,14 +18,21 @@ from typing import Any, TypeVar, Union, get_args, get_origin
import httpx
import yaml
from fastapi import Response as FastAPIResponse
-from llama_stack_client import (
-    NOT_GIVEN,
-    APIResponse,
-    AsyncAPIResponse,
-    AsyncLlamaStackClient,
-    AsyncStream,
-    LlamaStackClient,
-)
+try:
+    from llama_stack_client import (
+        NOT_GIVEN,
+        APIResponse,
+        AsyncAPIResponse,
+        AsyncLlamaStackClient,
+        AsyncStream,
+        LlamaStackClient,
+    )
+except ImportError as e:
+    raise ImportError(
+        "llama-stack-client is not installed. Please install it with `uv pip install llama-stack[client]`."
+    ) from e
from pydantic import BaseModel, TypeAdapter
from rich.console import Console
from termcolor import cprint

View file

@ -397,6 +397,18 @@ async def instantiate_provider(
    impl.__provider_spec__ = provider_spec
    impl.__provider_config__ = config
+    # Apply tracing if telemetry is enabled and any base class has __marked_for_tracing__ marker
+    if run_config.telemetry.enabled:
+        traced_classes = [
+            base for base in reversed(impl.__class__.__mro__) if getattr(base, "__marked_for_tracing__", False)
+        ]
+        if traced_classes:
+            from llama_stack.core.telemetry.trace_protocol import trace_protocol
+            for cls in traced_classes:
+                trace_protocol(cls)
    protocols = api_protocol_map_for_compliance_check(run_config)
    additional_protocols = additional_protocols_map()
    # TODO: check compliance for special tool groups
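
The MRO scan above can be illustrated in isolation; `MarkedBase` and `ProviderImpl` below are hypothetical stand-ins for a marked protocol and a provider implementation:

```python
# A base class carrying the tracing marker, as telemetry_traceable would set it.
class MarkedBase:
    __marked_for_tracing__ = True


class ProviderImpl(MarkedBase):
    pass


impl = ProviderImpl()
# Walk the MRO from most-generic to most-specific, keeping marked classes.
traced = [
    base
    for base in reversed(type(impl).__mro__)
    if getattr(base, "__marked_for_tracing__", False)
]
print(traced)  # [<class '__main__.MarkedBase'>, <class '__main__.ProviderImpl'>]
```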

View file

@ -45,6 +45,7 @@ async def get_routing_table_impl(
        raise ValueError(f"API {api.value} not found in router map")
    impl = api_to_tables[api.value](impls_by_provider_id, dist_registry, policy)
    await impl.initialize()
    return impl
@ -92,5 +93,6 @@ async def get_auto_router_impl(
    api_to_dep_impl["safety_config"] = run_config.safety
    impl = api_to_routers[api.value](routing_table, **api_to_dep_impl)
    await impl.initialize()
    return impl

View file

@ -190,7 +190,7 @@ class InferenceRouter(Inference):
        response = await provider.openai_completion(params)
        response.model = request_model_id
-        if self.telemetry_enabled:
+        if self.telemetry_enabled and response.usage is not None:
            metrics = self._construct_metrics(
                prompt_tokens=response.usage.prompt_tokens,
                completion_tokens=response.usage.completion_tokens,
@ -253,7 +253,7 @@ class InferenceRouter(Inference):
        if self.store:
            asyncio.create_task(self.store.store_chat_completion(response, params.messages))
-        if self.telemetry_enabled:
+        if self.telemetry_enabled and response.usage is not None:
            metrics = self._construct_metrics(
                prompt_tokens=response.usage.prompt_tokens,
                completion_tokens=response.usage.completion_tokens,
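
The added `response.usage is not None` guard matters because some providers omit the usage block entirely; a tiny sketch with stand-in objects shows the pattern:

```python
from dataclasses import dataclass


@dataclass
class Usage:
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


@dataclass
class Response:
    usage: Usage | None = None


def maybe_metrics(telemetry_enabled: bool, response: Response) -> list[str]:
    # Mirrors the guard above: skip metric construction entirely when the
    # provider returned no usage block, instead of raising AttributeError.
    if telemetry_enabled and response.usage is not None:
        return [f"prompt_tokens={response.usage.prompt_tokens}"]
    return []


print(maybe_metrics(True, Response()))                 # []
print(maybe_metrics(True, Response(Usage(3, 5, 8))))   # ['prompt_tokens=3']
```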

View file

@ -20,6 +20,8 @@ from llama_stack.apis.vector_io import (
    SearchRankingOptions,
    VectorIO,
    VectorStoreChunkingStrategy,
+    VectorStoreChunkingStrategyStatic,
+    VectorStoreChunkingStrategyStaticConfig,
    VectorStoreDeleteResponse,
    VectorStoreFileBatchObject,
    VectorStoreFileContentsResponse,
@ -167,6 +169,13 @@ class VectorIORouter(VectorIO):
        if embedding_dimension is not None:
            params.model_extra["embedding_dimension"] = embedding_dimension
+        # Set chunking strategy explicitly if not provided
+        if params.chunking_strategy is None or params.chunking_strategy.type == "auto":
+            # actualize the chunking strategy to static
+            params.chunking_strategy = VectorStoreChunkingStrategyStatic(
+                static=VectorStoreChunkingStrategyStaticConfig()
+            )
        return await provider.openai_create_vector_store(params)
    async def openai_list_vector_stores(
@ -283,6 +292,8 @@ class VectorIORouter(VectorIO):
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
    ) -> VectorStoreFileObject:
        logger.debug(f"VectorIORouter.openai_attach_file_to_vector_store: {vector_store_id}, {file_id}")
+        if chunking_strategy is None or chunking_strategy.type == "auto":
+            chunking_strategy = VectorStoreChunkingStrategyStatic(static=VectorStoreChunkingStrategyStaticConfig())
        provider = await self.routing_table.get_provider_impl(vector_store_id)
        return await provider.openai_attach_file_to_vector_store(
            vector_store_id=vector_store_id,
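
A minimal sketch of the auto-to-static defaulting, using stand-in models rather than the real `VectorStoreChunkingStrategyStatic` types; the 800/400 token defaults are assumptions here, the actual defaults live in `llama_stack.apis.vector_io`:

```python
from pydantic import BaseModel


class StaticConfig(BaseModel):
    # Assumed defaults; the real field values may differ.
    max_chunk_size_tokens: int = 800
    chunk_overlap_tokens: int = 400


class StaticStrategy(BaseModel):
    type: str = "static"
    static: StaticConfig


def resolve_chunking(strategy):
    # Mirrors the router logic above: a missing or "auto" strategy is
    # actualized to an explicit static strategy before reaching the provider.
    if strategy is None or strategy.type == "auto":
        return StaticStrategy(static=StaticConfig())
    return strategy


print(resolve_chunking(None).model_dump())
```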

View file

@ -163,47 +163,6 @@ class MetricEvent(EventCommon):
    unit: str
-@json_schema_type
-class MetricInResponse(BaseModel):
-    """A metric value included in API responses.
-
-    :param metric: The name of the metric
-    :param value: The numeric value of the metric
-    :param unit: (Optional) The unit of measurement for the metric value
-    """
-
-    metric: str
-    value: int | float
-    unit: str | None = None
-
-# This is a short term solution to allow inference API to return metrics
-# The ideal way to do this is to have a way for all response types to include metrics
-# and all metric events logged to the telemetry API to be included with the response
-# To do this, we will need to augment all response types with a metrics field.
-# We have hit a blocker from stainless SDK that prevents us from doing this.
-# The blocker is that if we were to augment the response types that have a data field
-# in them like so
-# class ListModelsResponse(BaseModel):
-#     metrics: Optional[List[MetricEvent]] = None
-#     data: List[Models]
-#     ...
-# The client SDK will need to access the data by using a .data field, which is not
-# ergonomic. Stainless SDK does support unwrapping the response type, but it
-# requires that the response type to only have a single field.
-# We will need a way in the client SDK to signal that the metrics are needed
-# and if they are needed, the client SDK has to return the full response type
-# without unwrapping it.
-class MetricResponseMixin(BaseModel):
-    """Mixin class for API responses that can include metrics.
-
-    :param metrics: (Optional) List of metrics associated with the API response
-    """
-
-    metrics: list[MetricInResponse] | None = None
@json_schema_type
class StructuredLogType(Enum):
    """The type of structured log event payload.

View file

@ -129,6 +129,15 @@ def trace_protocol[T: type[Any]](cls: T) -> T:
        else:
            return sync_wrapper
+    # Wrap methods on the class itself (for classes applied at runtime)
+    # Skip if already wrapped (indicated by __wrapped__ attribute)
+    for name, method in vars(cls).items():
+        if inspect.isfunction(method) and not name.startswith("_"):
+            if not hasattr(method, "__wrapped__"):
+                wrapped = trace_method(method)
+                setattr(cls, name, wrapped)  # noqa: B010
+    # Also set up __init_subclass__ for future subclasses
    original_init_subclass = cast(Callable[..., Any] | None, getattr(cls, "__init_subclass__", None))
    def __init_subclass__(cls_child: type[Any], **kwargs: Any) -> None:  # noqa: N807
View file

@ -1,11 +0,0 @@
# More info on playground configuration can be found here:
# https://llama-stack.readthedocs.io/en/latest/playground
FROM python:3.12-slim
WORKDIR /app
COPY . /app/
RUN /usr/local/bin/python -m pip install --upgrade pip && \
/usr/local/bin/pip3 install -r requirements.txt
EXPOSE 8501
ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]

View file

@ -1,50 +0,0 @@
# (Experimental) LLama Stack UI
## Docker Setup
:warning: This is a work in progress.
## Developer Setup
1. Start up Llama Stack API server. More details [here](https://llamastack.github.io/latest/getting_started/index.htmll).
```
llama stack list-deps together | xargs -L1 uv pip install
llama stack run together
```
2. (Optional) Register datasets and eval tasks as resources. If you want to run pre-configured evaluation flows (e.g. Evaluations (Generation + Scoring) Page).
```bash
llama-stack-client datasets register \
--dataset-id "mmlu" \
--provider-id "huggingface" \
--url "https://huggingface.co/datasets/llamastack/evals" \
--metadata '{"path": "llamastack/evals", "name": "evals__mmlu__details", "split": "train"}' \
--schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string", "chat_completion_input": {"type": "string"}}}'
```
```bash
llama-stack-client benchmarks register \
--eval-task-id meta-reference-mmlu \
--provider-id meta-reference \
--dataset-id mmlu \
--scoring-functions basic::regex_parser_multiple_choice_answer
```
3. Start Streamlit UI
```bash
uv run --with ".[ui]" streamlit run llama_stack.core/ui/app.py
```
## Environment Variables
| Environment Variable | Description | Default Value |
|----------------------------|------------------------------------|---------------------------|
| LLAMA_STACK_ENDPOINT | The endpoint for the Llama Stack | http://localhost:8321 |
| FIREWORKS_API_KEY | API key for Fireworks provider | (empty string) |
| TOGETHER_API_KEY | API key for Together provider | (empty string) |
| SAMBANOVA_API_KEY | API key for SambaNova provider | (empty string) |
| OPENAI_API_KEY | API key for OpenAI provider | (empty string) |

View file

@ -1,55 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st
def main():
# Evaluation pages
application_evaluation_page = st.Page(
"page/evaluations/app_eval.py",
title="Evaluations (Scoring)",
icon="📊",
default=False,
)
native_evaluation_page = st.Page(
"page/evaluations/native_eval.py",
title="Evaluations (Generation + Scoring)",
icon="📊",
default=False,
)
# Playground pages
chat_page = st.Page("page/playground/chat.py", title="Chat", icon="💬", default=True)
rag_page = st.Page("page/playground/rag.py", title="RAG", icon="💬", default=False)
tool_page = st.Page("page/playground/tools.py", title="Tools", icon="🛠", default=False)
# Distribution pages
resources_page = st.Page("page/distribution/resources.py", title="Resources", icon="🔍", default=False)
provider_page = st.Page(
"page/distribution/providers.py",
title="API Providers",
icon="🔍",
default=False,
)
pg = st.navigation(
{
"Playground": [
chat_page,
rag_page,
tool_page,
application_evaluation_page,
native_evaluation_page,
],
"Inspect": [provider_page, resources_page],
},
expanded=False,
)
pg.run()
if __name__ == "__main__":
main()

View file

@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -1,32 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
from llama_stack_client import LlamaStackClient
class LlamaStackApi:
def __init__(self):
self.client = LlamaStackClient(
base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:8321"),
provider_data={
"fireworks_api_key": os.environ.get("FIREWORKS_API_KEY", ""),
"together_api_key": os.environ.get("TOGETHER_API_KEY", ""),
"sambanova_api_key": os.environ.get("SAMBANOVA_API_KEY", ""),
"openai_api_key": os.environ.get("OPENAI_API_KEY", ""),
"tavily_search_api_key": os.environ.get("TAVILY_SEARCH_API_KEY", ""),
},
)
def run_scoring(self, row, scoring_function_ids: list[str], scoring_params: dict | None):
"""Run scoring on a single row"""
if not scoring_params:
scoring_params = dict.fromkeys(scoring_function_ids)
return self.client.scoring.score(input_rows=[row], scoring_functions=scoring_params)
llama_stack_api = LlamaStackApi()

View file

@ -1,42 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
import os
import pandas as pd
import streamlit as st
def process_dataset(file):
if file is None:
return "No file uploaded", None
try:
# Determine file type and read accordingly
file_ext = os.path.splitext(file.name)[1].lower()
if file_ext == ".csv":
df = pd.read_csv(file)
elif file_ext in [".xlsx", ".xls"]:
df = pd.read_excel(file)
else:
return "Unsupported file format. Please upload a CSV or Excel file.", None
return df
except Exception as e:
st.error(f"Error processing file: {str(e)}")
return None
def data_url_from_file(file) -> str:
file_content = file.getvalue()
base64_content = base64.b64encode(file_content).decode("utf-8")
mime_type = file.type
data_url = f"data:{mime_type};base64,{base64_content}"
return data_url

View file

@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -1,18 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st
from llama_stack.core.ui.modules.api import llama_stack_api
def datasets():
st.header("Datasets")
datasets_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.datasets.list()}
if len(datasets_info) > 0:
selected_dataset = st.selectbox("Select a dataset", list(datasets_info.keys()))
st.json(datasets_info[selected_dataset], expanded=True)

View file

@ -1,20 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st
from llama_stack.core.ui.modules.api import llama_stack_api
def benchmarks():
# Benchmarks Section
st.header("Benchmarks")
benchmarks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.benchmarks.list()}
if len(benchmarks_info) > 0:
selected_benchmark = st.selectbox("Select an eval task", list(benchmarks_info.keys()), key="benchmark_inspect")
st.json(benchmarks_info[selected_benchmark], expanded=True)

View file

@ -1,18 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st
from llama_stack.core.ui.modules.api import llama_stack_api
def models():
# Models Section
st.header("Models")
models_info = {m.id: m.model_dump() for m in llama_stack_api.client.models.list()}
selected_model = st.selectbox("Select a model", list(models_info.keys()))
st.json(models_info[selected_model])

View file

@ -1,27 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st
from llama_stack.core.ui.modules.api import llama_stack_api
def providers():
st.header("🔍 API Providers")
apis_providers_lst = llama_stack_api.client.providers.list()
api_to_providers = {}
for api_provider in apis_providers_lst:
if api_provider.api in api_to_providers:
api_to_providers[api_provider.api].append(api_provider)
else:
api_to_providers[api_provider.api] = [api_provider]
for api in api_to_providers.keys():
st.markdown(f"###### {api}")
st.dataframe([x.to_dict() for x in api_to_providers[api]], width=500)
providers()

View file

@ -1,48 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from streamlit_option_menu import option_menu
from llama_stack.core.ui.page.distribution.datasets import datasets
from llama_stack.core.ui.page.distribution.eval_tasks import benchmarks
from llama_stack.core.ui.page.distribution.models import models
from llama_stack.core.ui.page.distribution.scoring_functions import scoring_functions
from llama_stack.core.ui.page.distribution.shields import shields
def resources_page():
options = [
"Models",
"Shields",
"Scoring Functions",
"Datasets",
"Benchmarks",
]
icons = ["magic", "shield", "file-bar-graph", "database", "list-task"]
selected_resource = option_menu(
None,
options,
icons=icons,
orientation="horizontal",
styles={
"nav-link": {
"font-size": "12px",
},
},
)
if selected_resource == "Benchmarks":
benchmarks()
elif selected_resource == "Datasets":
datasets()
elif selected_resource == "Models":
models()
elif selected_resource == "Scoring Functions":
scoring_functions()
elif selected_resource == "Shields":
shields()
resources_page()

View file

@ -1,18 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st
from llama_stack.core.ui.modules.api import llama_stack_api
def scoring_functions():
st.header("Scoring Functions")
scoring_functions_info = {s.identifier: s.to_dict() for s in llama_stack_api.client.scoring_functions.list()}
selected_scoring_function = st.selectbox("Select a scoring function", list(scoring_functions_info.keys()))
st.json(scoring_functions_info[selected_scoring_function], expanded=True)

View file

@ -1,19 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st
from llama_stack.core.ui.modules.api import llama_stack_api
def shields():
# Shields Section
st.header("Shields")
shields_info = {s.identifier: s.to_dict() for s in llama_stack_api.client.shields.list()}
selected_shield = st.selectbox("Select a shield", list(shields_info.keys()))
st.json(shields_info[selected_shield])

View file

@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -1,143 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import pandas as pd
import streamlit as st
from llama_stack.core.ui.modules.api import llama_stack_api
from llama_stack.core.ui.modules.utils import process_dataset
def application_evaluation_page():
st.set_page_config(page_title="Evaluations (Scoring)", page_icon="🦙")
st.title("📊 Evaluations (Scoring)")
# File uploader
uploaded_file = st.file_uploader("Upload Dataset", type=["csv", "xlsx", "xls"])
if uploaded_file is None:
st.error("No file uploaded")
return
# Process uploaded file
df = process_dataset(uploaded_file)
if df is None:
st.error("Error processing file")
return
# Display dataset information
st.success("Dataset loaded successfully!")
# Display dataframe preview
st.subheader("Dataset Preview")
st.dataframe(df)
# Select Scoring Functions to Run Evaluation On
st.subheader("Select Scoring Functions")
scoring_functions = llama_stack_api.client.scoring_functions.list()
scoring_functions = {sf.identifier: sf for sf in scoring_functions}
scoring_functions_names = list(scoring_functions.keys())
selected_scoring_functions = st.multiselect(
"Choose one or more scoring functions",
options=scoring_functions_names,
help="Choose one or more scoring functions.",
)
available_models = llama_stack_api.client.models.list()
available_models = [m.identifier for m in available_models]
scoring_params = {}
if selected_scoring_functions:
st.write("Selected:")
for scoring_fn_id in selected_scoring_functions:
scoring_fn = scoring_functions[scoring_fn_id]
st.write(f"- **{scoring_fn_id}**: {scoring_fn.description}")
new_params = None
if scoring_fn.params:
new_params = {}
for param_name, param_value in scoring_fn.params.to_dict().items():
if param_name == "type":
new_params[param_name] = param_value
continue
if param_name == "judge_model":
value = st.selectbox(
f"Select **{param_name}** for {scoring_fn_id}",
options=available_models,
index=0,
key=f"{scoring_fn_id}_{param_name}",
)
new_params[param_name] = value
else:
value = st.text_area(
f"Enter value for **{param_name}** in {scoring_fn_id} in valid JSON format",
value=json.dumps(param_value, indent=2),
height=80,
)
try:
new_params[param_name] = json.loads(value)
except json.JSONDecodeError:
st.error(f"Invalid JSON for **{param_name}** in {scoring_fn_id}")
st.json(new_params)
scoring_params[scoring_fn_id] = new_params
# Add run evaluation button & slider
total_rows = len(df)
num_rows = st.slider("Number of rows to evaluate", 1, total_rows, total_rows)
if st.button("Run Evaluation"):
progress_text = "Running evaluation..."
progress_bar = st.progress(0, text=progress_text)
rows = df.to_dict(orient="records")
if num_rows < total_rows:
rows = rows[:num_rows]
# Create separate containers for progress text and results
progress_text_container = st.empty()
results_container = st.empty()
output_res = {}
for i, r in enumerate(rows):
# Update progress
progress = i / len(rows)
progress_bar.progress(progress, text=progress_text)
# Run evaluation for current row
score_res = llama_stack_api.run_scoring(
r,
scoring_function_ids=selected_scoring_functions,
scoring_params=scoring_params,
)
for k in r.keys():
if k not in output_res:
output_res[k] = []
output_res[k].append(r[k])
for fn_id in selected_scoring_functions:
if fn_id not in output_res:
output_res[fn_id] = []
output_res[fn_id].append(score_res.results[fn_id].score_rows[0])
# Display current row results using separate containers
progress_text_container.write(f"Expand to see current processed result ({i + 1} / {len(rows)})")
results_container.json(
score_res.to_json(),
expanded=2,
)
progress_bar.progress(1.0, text="Evaluation complete!")
# Display results in dataframe
if output_res:
output_df = pd.DataFrame(output_res)
st.subheader("Evaluation Results")
st.dataframe(output_df)
application_evaluation_page()

View file

@ -1,253 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
import pandas as pd
import streamlit as st
from llama_stack.core.ui.modules.api import llama_stack_api
def select_benchmark_1():
# Select Benchmarks
st.subheader("1. Choose An Eval Task")
benchmarks = llama_stack_api.client.benchmarks.list()
benchmarks = {et.identifier: et for et in benchmarks}
benchmarks_names = list(benchmarks.keys())
selected_benchmark = st.selectbox(
"Choose an eval task.",
options=benchmarks_names,
help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
)
with st.expander("View Eval Task"):
st.json(benchmarks[selected_benchmark], expanded=True)
st.session_state["selected_benchmark"] = selected_benchmark
st.session_state["benchmarks"] = benchmarks
if st.button("Confirm", key="confirm_1"):
st.session_state["selected_benchmark_1_next"] = True
def define_eval_candidate_2():
if not st.session_state.get("selected_benchmark_1_next", None):
return
st.subheader("2. Define Eval Candidate")
st.info(
"""
Define the configurations for the evaluation candidate model or agent used for generation.
Select "model" if you want to run generation with inference API, or "agent" if you want to run generation with agent API through specifying AgentConfig.
"""
)
with st.expander("Define Eval Candidate", expanded=True):
# Define Eval Candidate
candidate_type = st.radio("Candidate Type", ["model", "agent"])
available_models = llama_stack_api.client.models.list()
available_models = [model.identifier for model in available_models]
selected_model = st.selectbox(
"Choose a model",
available_models,
index=0,
)
# Sampling Parameters
st.markdown("##### Sampling Parameters")
temperature = st.slider(
"Temperature",
min_value=0.0,
max_value=1.0,
value=0.0,
step=0.1,
help="Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable",
)
top_p = st.slider(
"Top P",
min_value=0.0,
max_value=1.0,
value=0.95,
step=0.1,
)
max_tokens = st.slider(
"Max Tokens",
min_value=0,
max_value=4096,
value=512,
step=1,
help="The maximum number of tokens to generate",
)
repetition_penalty = st.slider(
"Repetition Penalty",
min_value=1.0,
max_value=2.0,
value=1.0,
step=0.1,
help="Controls the likelihood for generating the same word or phrase multiple times in the same sentence or paragraph. 1 implies no penalty, 2 will strongly discourage model to repeat words or phrases.",
)
if candidate_type == "model":
if temperature > 0.0:
strategy = {
"type": "top_p",
"temperature": temperature,
"top_p": top_p,
}
else:
strategy = {"type": "greedy"}
eval_candidate = {
"type": "model",
"model": selected_model,
"sampling_params": {
"strategy": strategy,
"max_tokens": max_tokens,
"repetition_penalty": repetition_penalty,
},
}
elif candidate_type == "agent":
system_prompt = st.text_area(
"System Prompt",
value="You are a helpful AI assistant.",
help="Initial instructions given to the AI to set its behavior and context",
)
tools_json = st.text_area(
"Tools Configuration (JSON)",
value=json.dumps(
[
{
"type": "brave_search",
"engine": "brave",
"api_key": "ENTER_BRAVE_API_KEY_HERE",
}
]
),
help="Enter tool configurations in JSON format. Each tool should have a name, description, and parameters.",
height=200,
)
try:
tools = json.loads(tools_json)
except json.JSONDecodeError:
st.error("Invalid JSON format for tools configuration")
tools = []
eval_candidate = {
"type": "agent",
"config": {
"model": selected_model,
"instructions": system_prompt,
"tools": tools,
"tool_choice": "auto",
"tool_prompt_format": "json",
"input_shields": [],
"output_shields": [],
"enable_session_persistence": False,
},
}
st.session_state["eval_candidate"] = eval_candidate
if st.button("Confirm", key="confirm_2"):
st.session_state["selected_eval_candidate_2_next"] = True
def run_evaluation_3():
if not st.session_state.get("selected_eval_candidate_2_next", None):
return
st.subheader("3. Run Evaluation")
# Add info box to explain configurations being used
st.info(
"""
Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button.
"""
)
selected_benchmark = st.session_state["selected_benchmark"]
benchmarks = st.session_state["benchmarks"]
eval_candidate = st.session_state["eval_candidate"]
dataset_id = benchmarks[selected_benchmark].dataset_id
rows = llama_stack_api.client.datasets.iterrows(
dataset_id=dataset_id,
)
total_rows = len(rows.data)
# Add number of examples control
num_rows = st.number_input(
"Number of Examples to Evaluate",
min_value=1,
max_value=total_rows,
value=5,
help="Number of examples from the dataset to evaluate. ",
)
benchmark_config = {
"type": "benchmark",
"eval_candidate": eval_candidate,
"scoring_params": {},
}
with st.expander("View Evaluation Task", expanded=True):
st.json(benchmarks[selected_benchmark], expanded=True)
with st.expander("View Evaluation Task Configuration", expanded=True):
st.json(benchmark_config, expanded=True)
# Add run button and handle evaluation
if st.button("Run Evaluation"):
progress_text = "Running evaluation..."
progress_bar = st.progress(0, text=progress_text)
rows = rows.data
if num_rows < total_rows:
rows = rows[:num_rows]
# Create separate containers for progress text and results
progress_text_container = st.empty()
results_container = st.empty()
output_res = {}
for i, r in enumerate(rows):
# Update progress
progress = i / len(rows)
progress_bar.progress(progress, text=progress_text)
# Run evaluation for current row
eval_res = llama_stack_api.client.eval.evaluate_rows(
benchmark_id=selected_benchmark,
input_rows=[r],
scoring_functions=benchmarks[selected_benchmark].scoring_functions,
benchmark_config=benchmark_config,
)
for k in r.keys():
if k not in output_res:
output_res[k] = []
output_res[k].append(r[k])
for k in eval_res.generations[0].keys():
if k not in output_res:
output_res[k] = []
output_res[k].append(eval_res.generations[0][k])
for scoring_fn in benchmarks[selected_benchmark].scoring_functions:
if scoring_fn not in output_res:
output_res[scoring_fn] = []
output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])
progress_text_container.write(f"Expand to see current processed result ({i + 1} / {len(rows)})")
results_container.json(eval_res, expanded=2)
progress_bar.progress(1.0, text="Evaluation complete!")
# Display results in dataframe
if output_res:
output_df = pd.DataFrame(output_res)
st.subheader("Evaluation Results")
st.dataframe(output_df)
def native_evaluation_page():
st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
st.title("📊 Evaluations (Generation + Scoring)")
select_benchmark_1()
define_eval_candidate_2()
run_evaluation_3()
native_evaluation_page()

View file

@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@ -1,134 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st
from llama_stack.core.ui.modules.api import llama_stack_api
# Sidebar configurations
with st.sidebar:
st.header("Configuration")
available_models = llama_stack_api.client.models.list()
available_models = [
model.id
for model in available_models
if model.custom_metadata and model.custom_metadata.get("model_type") == "llm"
]
selected_model = st.selectbox(
"Choose a model",
available_models,
index=0,
)
temperature = st.slider(
"Temperature",
min_value=0.0,
max_value=1.0,
value=0.0,
step=0.1,
help="Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable",
)
top_p = st.slider(
"Top P",
min_value=0.0,
max_value=1.0,
value=0.95,
step=0.1,
)
max_tokens = st.slider(
"Max Tokens",
min_value=0,
max_value=4096,
value=512,
step=1,
help="The maximum number of tokens to generate",
)
repetition_penalty = st.slider(
"Repetition Penalty",
min_value=1.0,
max_value=2.0,
value=1.0,
step=0.1,
help="Controls the likelihood for generating the same word or phrase multiple times in the same sentence or paragraph. 1 implies no penalty, 2 will strongly discourage model to repeat words or phrases.",
)
stream = st.checkbox("Stream", value=True)
system_prompt = st.text_area(
"System Prompt",
value="You are a helpful AI assistant.",
help="Initial instructions given to the AI to set its behavior and context",
)
# Add clear chat button to sidebar
if st.button("Clear Chat", use_container_width=True):
st.session_state.messages = []
st.rerun()
# Main chat interface
st.title("🦙 Chat")
# Initialize chat history
if "messages" not in st.session_state:
st.session_state.messages = []
# Display chat messages
for message in st.session_state.messages:
with st.chat_message(message["role"]):
st.markdown(message["content"])
# Chat input
if prompt := st.chat_input("Example: What is Llama Stack?"):
# Add user message to chat history
st.session_state.messages.append({"role": "user", "content": prompt})
# Display user message
with st.chat_message("user"):
st.markdown(prompt)
# Display assistant response
with st.chat_message("assistant"):
message_placeholder = st.empty()
full_response = ""
if temperature > 0.0:
strategy = {
"type": "top_p",
"temperature": temperature,
"top_p": top_p,
}
else:
strategy = {"type": "greedy"}
response = llama_stack_api.client.inference.chat_completion(
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": prompt},
],
model_id=selected_model,
stream=stream,
sampling_params={
"strategy": strategy,
"max_tokens": max_tokens,
"repetition_penalty": repetition_penalty,
},
)
if stream:
for chunk in response:
if chunk.event.event_type == "progress":
full_response += chunk.event.delta.text
message_placeholder.markdown(full_response + "▌")
message_placeholder.markdown(full_response)
else:
full_response = response.completion_message.content
message_placeholder.markdown(full_response)
st.session_state.messages.append({"role": "assistant", "content": full_response})

View file

@ -1,352 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import enum
import json
import uuid
import streamlit as st
from llama_stack_client import Agent
from llama_stack_client.lib.agents.react.agent import ReActAgent
from llama_stack_client.lib.agents.react.tool_parser import ReActOutput
from llama_stack.core.ui.modules.api import llama_stack_api
class AgentType(enum.Enum):
REGULAR = "Regular"
REACT = "ReAct"
def tool_chat_page():
st.title("🛠 Tools")
client = llama_stack_api.client
models = client.models.list()
model_list = [model.identifier for model in models if model.api_model_type == "llm"]
tool_groups = client.toolgroups.list()
tool_groups_list = [tool_group.identifier for tool_group in tool_groups]
mcp_tools_list = [tool for tool in tool_groups_list if tool.startswith("mcp::")]
builtin_tools_list = [tool for tool in tool_groups_list if not tool.startswith("mcp::")]
selected_vector_stores = []
def reset_agent():
st.session_state.clear()
st.cache_resource.clear()
with st.sidebar:
st.title("Configuration")
st.subheader("Model")
model = st.selectbox(label="Model", options=model_list, on_change=reset_agent, label_visibility="collapsed")
st.subheader("Available ToolGroups")
toolgroup_selection = st.pills(
label="Built-in tools",
options=builtin_tools_list,
selection_mode="multi",
on_change=reset_agent,
format_func=lambda tool: "".join(tool.split("::")[1:]),
help="List of built-in tools from your llama stack server.",
)
if "builtin::rag" in toolgroup_selection:
vector_stores = llama_stack_api.client.vector_stores.list() or []
if not vector_stores:
st.info("No vector databases available for selection.")
vector_stores = [vector_store.identifier for vector_store in vector_stores]
selected_vector_stores = st.multiselect(
label="Select Document Collections to use in RAG queries",
options=vector_stores,
on_change=reset_agent,
)
mcp_selection = st.pills(
label="MCP Servers",
options=mcp_tools_list,
selection_mode="multi",
on_change=reset_agent,
format_func=lambda tool: "".join(tool.split("::")[1:]),
help="List of MCP servers registered to your llama stack server.",
)
toolgroup_selection.extend(mcp_selection)
grouped_tools = {}
total_tools = 0
for toolgroup_id in toolgroup_selection:
tools = client.tools.list(toolgroup_id=toolgroup_id)
grouped_tools[toolgroup_id] = [tool.name for tool in tools]
total_tools += len(tools)
st.markdown(f"Active Tools: 🛠 {total_tools}")
for group_id, tools in grouped_tools.items():
with st.expander(f"🔧 Tools from `{group_id}`"):
for idx, tool in enumerate(tools, start=1):
st.markdown(f"{idx}. `{tool.split(':')[-1]}`")
st.subheader("Agent Configurations")
st.subheader("Agent Type")
agent_type = st.radio(
label="Select Agent Type",
options=["Regular", "ReAct"],
on_change=reset_agent,
)
if agent_type == "ReAct":
agent_type = AgentType.REACT
else:
agent_type = AgentType.REGULAR
max_tokens = st.slider(
"Max Tokens",
min_value=0,
max_value=4096,
value=512,
step=64,
help="The maximum number of tokens to generate",
on_change=reset_agent,
)
for i, tool_name in enumerate(toolgroup_selection):
if tool_name == "builtin::rag":
tool_dict = dict(
name="builtin::rag",
args={
"vector_store_ids": list(selected_vector_stores),
},
)
toolgroup_selection[i] = tool_dict
@st.cache_resource
def create_agent():
if "agent_type" in st.session_state and st.session_state.agent_type == AgentType.REACT:
return ReActAgent(
client=client,
model=model,
tools=toolgroup_selection,
response_format={
"type": "json_schema",
"json_schema": ReActOutput.model_json_schema(),
},
sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens},
)
else:
return Agent(
client,
model=model,
instructions="You are a helpful assistant. When you use a tool always respond with a summary of the result.",
tools=toolgroup_selection,
sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens},
)
st.session_state.agent_type = agent_type
agent = create_agent()
if "agent_session_id" not in st.session_state:
st.session_state["agent_session_id"] = agent.create_session(session_name=f"tool_demo_{uuid.uuid4()}")
session_id = st.session_state["agent_session_id"]
if "messages" not in st.session_state:
st.session_state["messages"] = [{"role": "assistant", "content": "How can I help you?"}]
for msg in st.session_state.messages:
with st.chat_message(msg["role"]):
st.markdown(msg["content"])
if prompt := st.chat_input(placeholder=""):
with st.chat_message("user"):
st.markdown(prompt)
st.session_state.messages.append({"role": "user", "content": prompt})
turn_response = agent.create_turn(
session_id=session_id,
messages=[{"role": "user", "content": prompt}],
stream=True,
)
def response_generator(turn_response):
if st.session_state.get("agent_type") == AgentType.REACT:
return _handle_react_response(turn_response)
else:
return _handle_regular_response(turn_response)
def _handle_react_response(turn_response):
current_step_content = ""
final_answer = None
tool_results = []
for response in turn_response:
if not hasattr(response.event, "payload"):
yield (
"\n\n🚨 :red[_Llama Stack server Error:_]\n"
"The response received is missing an expected `payload` attribute.\n"
"This could indicate a malformed response or an internal issue within the server.\n\n"
f"Error details: {response}"
)
return
payload = response.event.payload
if payload.event_type == "step_progress" and hasattr(payload.delta, "text"):
current_step_content += payload.delta.text
continue
if payload.event_type == "step_complete":
step_details = payload.step_details
if step_details.step_type == "inference":
yield from _process_inference_step(current_step_content, tool_results, final_answer)
current_step_content = ""
elif step_details.step_type == "tool_execution":
tool_results = _process_tool_execution(step_details, tool_results)
current_step_content = ""
else:
current_step_content = ""
if not final_answer and tool_results:
yield from _format_tool_results_summary(tool_results)
def _process_inference_step(current_step_content, tool_results, final_answer):
try:
react_output_data = json.loads(current_step_content)
thought = react_output_data.get("thought")
action = react_output_data.get("action")
answer = react_output_data.get("answer")
if answer and answer != "null" and answer is not None:
final_answer = answer
if thought:
with st.expander("🤔 Thinking...", expanded=False):
st.markdown(f":grey[__{thought}__]")
if action and isinstance(action, dict):
tool_name = action.get("tool_name")
tool_params = action.get("tool_params")
with st.expander(f'🛠 Action: Using tool "{tool_name}"', expanded=False):
st.json(tool_params)
if answer and answer != "null" and answer is not None:
yield f"\n\n✅ **Final Answer:**\n{answer}"
except json.JSONDecodeError:
yield f"\n\nFailed to parse ReAct step content:\n```json\n{current_step_content}\n```"
except Exception as e:
yield f"\n\nFailed to process ReAct step: {e}\n```json\n{current_step_content}\n```"
return final_answer
def _process_tool_execution(step_details, tool_results):
try:
if hasattr(step_details, "tool_responses") and step_details.tool_responses:
for tool_response in step_details.tool_responses:
tool_name = tool_response.tool_name
content = tool_response.content
tool_results.append((tool_name, content))
with st.expander(f'⚙️ Observation (Result from "{tool_name}")', expanded=False):
try:
parsed_content = json.loads(content)
st.json(parsed_content)
except json.JSONDecodeError:
st.code(content, language=None)
else:
with st.expander("⚙️ Observation", expanded=False):
st.markdown(":grey[_Tool execution step completed, but no response data found._]")
except Exception as e:
with st.expander("⚙️ Error in Tool Execution", expanded=False):
st.markdown(f":red[_Error processing tool execution: {str(e)}_]")
return tool_results
def _format_tool_results_summary(tool_results):
yield "\n\n**Here's what I found:**\n"
for tool_name, content in tool_results:
try:
parsed_content = json.loads(content)
if tool_name == "web_search" and "top_k" in parsed_content:
yield from _format_web_search_results(parsed_content)
elif "results" in parsed_content and isinstance(parsed_content["results"], list):
yield from _format_results_list(parsed_content["results"])
elif isinstance(parsed_content, dict) and len(parsed_content) > 0:
yield from _format_dict_results(parsed_content)
elif isinstance(parsed_content, list) and len(parsed_content) > 0:
yield from _format_list_results(parsed_content)
except json.JSONDecodeError:
yield f"\n**{tool_name}** was used but returned complex data. Check the observation for details.\n"
except (TypeError, AttributeError, KeyError, IndexError) as e:
print(f"Error processing {tool_name} result: {type(e).__name__}: {e}")
def _format_web_search_results(parsed_content):
for i, result in enumerate(parsed_content["top_k"], 1):
if i <= 3:
title = result.get("title", "Untitled")
url = result.get("url", "")
content_text = result.get("content", "").strip()
yield f"\n- **{title}**\n {content_text}\n [Source]({url})\n"
def _format_results_list(results):
for i, result in enumerate(results, 1):
if i <= 3:
if isinstance(result, dict):
name = result.get("name", result.get("title", "Result " + str(i)))
description = result.get("description", result.get("content", result.get("summary", "")))
yield f"\n- **{name}**\n {description}\n"
else:
yield f"\n- {result}\n"
def _format_dict_results(parsed_content):
yield "\n```\n"
for key, value in list(parsed_content.items())[:5]:
if isinstance(value, str) and len(value) < 100:
yield f"{key}: {value}\n"
else:
yield f"{key}: [Complex data]\n"
yield "```\n"
def _format_list_results(parsed_content):
yield "\n"
for _, item in enumerate(parsed_content[:3], 1):
if isinstance(item, str):
yield f"- {item}\n"
elif isinstance(item, dict) and "text" in item:
yield f"- {item['text']}\n"
elif isinstance(item, dict) and len(item) > 0:
first_value = next(iter(item.values()))
if isinstance(first_value, str) and len(first_value) < 100:
yield f"- {first_value}\n"
def _handle_regular_response(turn_response):
for response in turn_response:
if hasattr(response.event, "payload"):
print(response.event.payload)
if response.event.payload.event_type == "step_progress":
if hasattr(response.event.payload.delta, "text"):
yield response.event.payload.delta.text
if response.event.payload.event_type == "step_complete":
if response.event.payload.step_details.step_type == "tool_execution":
if response.event.payload.step_details.tool_calls:
tool_name = str(response.event.payload.step_details.tool_calls[0].tool_name)
yield f'\n\n🛠 :grey[_Using "{tool_name}" tool:_]\n\n'
else:
yield "No tool_calls present in step_details"
else:
yield f"Error occurred in the Llama Stack Cluster: {response}"
with st.chat_message("assistant"):
response_content = st.write_stream(response_generator(turn_response))
st.session_state.messages.append({"role": "assistant", "content": response_content})
tool_chat_page()

View file

@ -1,5 +0,0 @@
llama-stack>=0.2.1
llama-stack-client>=0.2.1
pandas
streamlit
streamlit-option-menu

View file

@ -52,7 +52,17 @@ def resolve_config_or_distro(
        logger.debug(f"Using distribution: {distro_config}")
        return distro_config

-    # Strategy 3: Try as built distribution name
+    # Strategy 3: Try as a distro::config reference
+    # eg: starter::run-with-postgres-store.yaml
+    # Use :: rather than a slash to avoid confusion with a filesystem path
+    if "::" in config_or_distro:
+        distro_name, config_name = config_or_distro.split("::")
+        distro_config = _get_distro_config_path(distro_name, config_name)
+        if distro_config.exists():
+            logger.info(f"Using distribution: {distro_config}")
+            return distro_config
+
+    # Strategy 4: Try as built distribution name
    distrib_config = DISTRIBS_BASE_DIR / f"llamastack-{config_or_distro}" / f"{config_or_distro}-{mode}.yaml"
    if distrib_config.exists():
        logger.debug(f"Using built distribution: {distrib_config}")
@ -63,13 +73,15 @@ def resolve_config_or_distro(
        logger.debug(f"Using built distribution: {distrib_config}")
        return distrib_config

-    # Strategy 4: Failed - provide helpful error
+    # Strategy 5: Failed - provide a helpful error
    raise ValueError(_format_resolution_error(config_or_distro, mode))


-def _get_distro_config_path(distro_name: str, mode: Mode) -> Path:
+def _get_distro_config_path(distro_name: str, mode: str) -> Path:
    """Get the config file path for a distro."""
-    return DISTRO_DIR / distro_name / f"{mode}.yaml"
+    if not mode.endswith(".yaml"):
+        mode = f"{mode}.yaml"
+    return DISTRO_DIR / distro_name / mode


def _format_resolution_error(config_or_distro: str, mode: Mode) -> str:
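
For orientation, the new strategy resolves a combined name of the form `distro::config` (e.g. `starter::run-with-postgres-store`, matching the `run-with-postgres-store.yaml` run config added later in this commit). A minimal sketch of that lookup, with `DISTRO_DIR` standing in as an assumed location for the packaged distributions:

```python
# Minimal sketch of the "distro::config" lookup above; DISTRO_DIR is an
# assumption standing in for wherever the packaged distros live.
from pathlib import Path

DISTRO_DIR = Path("llama_stack/distributions")  # assumed location


def resolve_distro_config(config_or_distro: str) -> Path | None:
    if "::" not in config_or_distro:
        return None
    distro_name, config_name = config_or_distro.split("::")
    if not config_name.endswith(".yaml"):
        config_name += ".yaml"  # mirrors _get_distro_config_path above
    candidate = DISTRO_DIR / distro_name / config_name
    return candidate if candidate.exists() else None


# "starter::run-with-postgres-store" -> llama_stack/distributions/starter/run-with-postgres-store.yaml
print(resolve_distro_config("starter::run-with-postgres-store"))
```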

View file

@ -84,6 +84,15 @@ def run_command(command: list[str]) -> int:
            text=True,
            check=False,
        )
+
+        # Print stdout and stderr if command failed
+        if result.returncode != 0:
+            log.error(f"Command {' '.join(command)} failed with returncode {result.returncode}")
+            if result.stdout:
+                log.error(f"STDOUT: {result.stdout}")
+            if result.stderr:
+                log.error(f"STDERR: {result.stderr}")
+
        return result.returncode
    except subprocess.SubprocessError as e:
        log.error(f"Subprocess error: {e}")

View file

@ -56,4 +56,5 @@ image_type: venv
additional_pip_packages:
- aiosqlite
- asyncpg
+- psycopg2-binary
- sqlalchemy[asyncio]

View file

@ -13,5 +13,6 @@ from ..starter.starter import get_distribution_template as get_starter_distribution_template

def get_distribution_template() -> DistributionTemplate:
    template = get_starter_distribution_template(name="ci-tests")
    template.description = "CI tests for Llama Stack"
+    template.run_configs.pop("run-with-postgres-store.yaml", None)
    return template

View file

@ -46,6 +46,9 @@ providers:
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
+    config:
+      api_key: ${env.AWS_BEDROCK_API_KEY:=}
+      region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:

View file

@ -1,7 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .postgres_demo import get_distribution_template # noqa: F401

View file

@ -1,23 +0,0 @@
version: 2
distribution_spec:
description: Quick start template for running Llama Stack with several popular providers
providers:
inference:
- provider_type: remote::vllm
- provider_type: inline::sentence-transformers
vector_io:
- provider_type: remote::chromadb
safety:
- provider_type: inline::llama-guard
agents:
- provider_type: inline::meta-reference
tool_runtime:
- provider_type: remote::brave-search
- provider_type: remote::tavily-search
- provider_type: inline::rag-runtime
- provider_type: remote::model-context-protocol
image_type: venv
additional_pip_packages:
- asyncpg
- psycopg2-binary
- sqlalchemy[asyncio]

View file

@ -1,125 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.apis.models import ModelType
from llama_stack.core.datatypes import (
BuildProvider,
ModelInput,
Provider,
ShieldInput,
ToolGroupInput,
)
from llama_stack.distributions.template import (
DistributionTemplate,
RunConfigSettings,
)
from llama_stack.providers.inline.inference.sentence_transformers import SentenceTransformersInferenceConfig
from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig
from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig
from llama_stack.providers.utils.kvstore.config import PostgresKVStoreConfig
from llama_stack.providers.utils.sqlstore.sqlstore import PostgresSqlStoreConfig
def get_distribution_template() -> DistributionTemplate:
inference_providers = [
Provider(
provider_id="vllm-inference",
provider_type="remote::vllm",
config=VLLMInferenceAdapterConfig.sample_run_config(
url="${env.VLLM_URL:=http://localhost:8000/v1}",
),
),
]
providers = {
"inference": [
BuildProvider(provider_type="remote::vllm"),
BuildProvider(provider_type="inline::sentence-transformers"),
],
"vector_io": [BuildProvider(provider_type="remote::chromadb")],
"safety": [BuildProvider(provider_type="inline::llama-guard")],
"agents": [BuildProvider(provider_type="inline::meta-reference")],
"tool_runtime": [
BuildProvider(provider_type="remote::brave-search"),
BuildProvider(provider_type="remote::tavily-search"),
BuildProvider(provider_type="inline::rag-runtime"),
BuildProvider(provider_type="remote::model-context-protocol"),
],
}
name = "postgres-demo"
vector_io_providers = [
Provider(
provider_id="${env.ENABLE_CHROMADB:+chromadb}",
provider_type="remote::chromadb",
config=ChromaVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}",
url="${env.CHROMADB_URL:=}",
),
),
]
default_tool_groups = [
ToolGroupInput(
toolgroup_id="builtin::websearch",
provider_id="tavily-search",
),
ToolGroupInput(
toolgroup_id="builtin::rag",
provider_id="rag-runtime",
),
]
default_models = [
ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="vllm-inference",
)
]
embedding_provider = Provider(
provider_id="sentence-transformers",
provider_type="inline::sentence-transformers",
config=SentenceTransformersInferenceConfig.sample_run_config(),
)
embedding_model = ModelInput(
model_id="nomic-embed-text-v1.5",
provider_id=embedding_provider.provider_id,
model_type=ModelType.embedding,
metadata={
"embedding_dimension": 768,
},
)
return DistributionTemplate(
name=name,
distro_type="self_hosted",
description="Quick start template for running Llama Stack with several popular providers",
container_image=None,
template_path=None,
providers=providers,
available_models_by_provider={},
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": inference_providers + [embedding_provider],
"vector_io": vector_io_providers,
},
default_models=default_models + [embedding_model],
default_tool_groups=default_tool_groups,
default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
storage_backends={
"kv_default": PostgresKVStoreConfig.sample_run_config(
table_name="llamastack_kvstore",
),
"sql_default": PostgresSqlStoreConfig.sample_run_config(),
},
),
},
run_config_env_vars={
"LLAMA_STACK_PORT": (
"8321",
"Port for the Llama Stack distribution server",
),
},
)

View file

@ -57,4 +57,5 @@ image_type: venv
additional_pip_packages:
- aiosqlite
- asyncpg
+- psycopg2-binary
- sqlalchemy[asyncio]

View file

@ -0,0 +1,284 @@
version: 2
image_name: starter-gpu
apis:
- agents
- batches
- datasetio
- eval
- files
- inference
- post_training
- safety
- scoring
- tool_runtime
- vector_io
providers:
inference:
- provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
provider_type: remote::cerebras
config:
base_url: https://api.cerebras.ai
api_key: ${env.CEREBRAS_API_KEY:=}
- provider_id: ${env.OLLAMA_URL:+ollama}
provider_type: remote::ollama
config:
url: ${env.OLLAMA_URL:=http://localhost:11434}
- provider_id: ${env.VLLM_URL:+vllm}
provider_type: remote::vllm
config:
url: ${env.VLLM_URL:=}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: ${env.TGI_URL:+tgi}
provider_type: remote::tgi
config:
url: ${env.TGI_URL:=}
- provider_id: fireworks
provider_type: remote::fireworks
config:
url: https://api.fireworks.ai/inference/v1
api_key: ${env.FIREWORKS_API_KEY:=}
- provider_id: together
provider_type: remote::together
config:
url: https://api.together.xyz/v1
api_key: ${env.TOGETHER_API_KEY:=}
- provider_id: bedrock
provider_type: remote::bedrock
config:
api_key: ${env.AWS_BEDROCK_API_KEY:=}
region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
- provider_id: ${env.NVIDIA_API_KEY:+nvidia}
provider_type: remote::nvidia
config:
url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
- provider_id: openai
provider_type: remote::openai
config:
api_key: ${env.OPENAI_API_KEY:=}
base_url: ${env.OPENAI_BASE_URL:=https://api.openai.com/v1}
- provider_id: anthropic
provider_type: remote::anthropic
config:
api_key: ${env.ANTHROPIC_API_KEY:=}
- provider_id: gemini
provider_type: remote::gemini
config:
api_key: ${env.GEMINI_API_KEY:=}
- provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}
provider_type: remote::vertexai
config:
project: ${env.VERTEX_AI_PROJECT:=}
location: ${env.VERTEX_AI_LOCATION:=us-central1}
- provider_id: groq
provider_type: remote::groq
config:
url: https://api.groq.com
api_key: ${env.GROQ_API_KEY:=}
- provider_id: sambanova
provider_type: remote::sambanova
config:
url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure
config:
api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
vector_io:
- provider_id: faiss
provider_type: inline::faiss
config:
persistence:
namespace: vector_io::faiss
backend: kv_default
- provider_id: sqlite-vec
provider_type: inline::sqlite-vec
config:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec.db
persistence:
namespace: vector_io::sqlite_vec
backend: kv_default
- provider_id: ${env.MILVUS_URL:+milvus}
provider_type: inline::milvus
config:
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter-gpu}/milvus.db
persistence:
namespace: vector_io::milvus
backend: kv_default
- provider_id: ${env.CHROMADB_URL:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:=}
persistence:
namespace: vector_io::chroma_remote
backend: kv_default
- provider_id: ${env.PGVECTOR_DB:+pgvector}
provider_type: remote::pgvector
config:
host: ${env.PGVECTOR_HOST:=localhost}
port: ${env.PGVECTOR_PORT:=5432}
db: ${env.PGVECTOR_DB:=}
user: ${env.PGVECTOR_USER:=}
password: ${env.PGVECTOR_PASSWORD:=}
persistence:
namespace: vector_io::pgvector
backend: kv_default
- provider_id: ${env.QDRANT_URL:+qdrant}
provider_type: remote::qdrant
config:
api_key: ${env.QDRANT_API_KEY:=}
persistence:
namespace: vector_io::qdrant_remote
backend: kv_default
- provider_id: ${env.WEAVIATE_CLUSTER_URL:+weaviate}
provider_type: remote::weaviate
config:
weaviate_api_key: null
weaviate_cluster_url: ${env.WEAVIATE_CLUSTER_URL:=localhost:8080}
persistence:
namespace: vector_io::weaviate
backend: kv_default
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter-gpu/files}
metadata_store:
table_name: files_metadata
backend: sql_default
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
- provider_id: code-scanner
provider_type: inline::code-scanner
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
post_training:
- provider_id: huggingface-gpu
provider_type: inline::huggingface-gpu
config:
checkpoint_format: huggingface
distributed_backend: null
device: cpu
dpo_output_dir: ~/.llama/distributions/starter-gpu/dpo_output
eval:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
kvstore:
namespace: eval
backend: kv_default
datasetio:
- provider_id: huggingface
provider_type: remote::huggingface
config:
kvstore:
namespace: datasetio::huggingface
backend: kv_default
- provider_id: localfs
provider_type: inline::localfs
config:
kvstore:
namespace: datasetio::localfs
backend: kv_default
scoring:
- provider_id: basic
provider_type: inline::basic
- provider_id: llm-as-judge
provider_type: inline::llm-as-judge
- provider_id: braintrust
provider_type: inline::braintrust
config:
openai_api_key: ${env.OPENAI_API_KEY:=}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:=}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:=}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
batches:
- provider_id: reference
provider_type: inline::reference
config:
kvstore:
namespace: batches
backend: kv_postgres
storage:
backends:
kv_postgres:
type: kv_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
sql_postgres:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
stores:
metadata:
namespace: registry
backend: kv_postgres
inference:
table_name: inference_store
backend: sql_postgres
max_write_queue_size: 10000
num_writers: 4
conversations:
table_name: openai_conversations
backend: sql_postgres
prompts:
namespace: prompts
backend: kv_postgres
registered_resources:
models: []
shields: []
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups: []
server:
port: 8321
telemetry:
enabled: true
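
A note on the substitution syntax used throughout these generated configs: `${env.VAR:=default}` expands to the variable's value, or to the default when it is unset, while `${env.VAR:+value}` expands to `value` only when the variable is set (which is how optional providers such as `${env.VLLM_URL:+vllm}` are toggled on and off). A rough Python rendering of those assumed semantics:

```python
import os
import re

# Rough sketch of the ${env.VAR:=default} / ${env.VAR:+alt} forms used in
# these run configs. Semantics are inferred from the configs themselves,
# not taken from the actual implementation; nesting is not handled.
_PATTERN = re.compile(r"\$\{env\.(\w+)(:=|:\+)([^}]*)\}")


def expand(value: str) -> str:
    def repl(match: re.Match) -> str:
        var, op, operand = match.groups()
        env_val = os.getenv(var)
        if op == ":=":
            return env_val if env_val else operand  # value, or default when unset
        return operand if env_val else ""           # ":+": alt only when set

    return _PATTERN.sub(repl, value)


print(expand("${env.POSTGRES_HOST:=localhost}"))  # -> "localhost" when unset
print(expand("${env.VLLM_URL:+vllm}"))            # -> "" when VLLM_URL is unset
```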

View file

@ -46,6 +46,9 @@ providers:
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
+    config:
+      api_key: ${env.AWS_BEDROCK_API_KEY:=}
+      region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:

View file

@ -57,4 +57,5 @@ image_type: venv
additional_pip_packages:
- aiosqlite
- asyncpg
+- psycopg2-binary
- sqlalchemy[asyncio]

View file

@ -0,0 +1,281 @@
version: 2
image_name: starter
apis:
- agents
- batches
- datasetio
- eval
- files
- inference
- post_training
- safety
- scoring
- tool_runtime
- vector_io
providers:
inference:
- provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
provider_type: remote::cerebras
config:
base_url: https://api.cerebras.ai
api_key: ${env.CEREBRAS_API_KEY:=}
- provider_id: ${env.OLLAMA_URL:+ollama}
provider_type: remote::ollama
config:
url: ${env.OLLAMA_URL:=http://localhost:11434}
- provider_id: ${env.VLLM_URL:+vllm}
provider_type: remote::vllm
config:
url: ${env.VLLM_URL:=}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: ${env.TGI_URL:+tgi}
provider_type: remote::tgi
config:
url: ${env.TGI_URL:=}
- provider_id: fireworks
provider_type: remote::fireworks
config:
url: https://api.fireworks.ai/inference/v1
api_key: ${env.FIREWORKS_API_KEY:=}
- provider_id: together
provider_type: remote::together
config:
url: https://api.together.xyz/v1
api_key: ${env.TOGETHER_API_KEY:=}
- provider_id: bedrock
provider_type: remote::bedrock
config:
api_key: ${env.AWS_BEDROCK_API_KEY:=}
region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
- provider_id: ${env.NVIDIA_API_KEY:+nvidia}
provider_type: remote::nvidia
config:
url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
- provider_id: openai
provider_type: remote::openai
config:
api_key: ${env.OPENAI_API_KEY:=}
base_url: ${env.OPENAI_BASE_URL:=https://api.openai.com/v1}
- provider_id: anthropic
provider_type: remote::anthropic
config:
api_key: ${env.ANTHROPIC_API_KEY:=}
- provider_id: gemini
provider_type: remote::gemini
config:
api_key: ${env.GEMINI_API_KEY:=}
- provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}
provider_type: remote::vertexai
config:
project: ${env.VERTEX_AI_PROJECT:=}
location: ${env.VERTEX_AI_LOCATION:=us-central1}
- provider_id: groq
provider_type: remote::groq
config:
url: https://api.groq.com
api_key: ${env.GROQ_API_KEY:=}
- provider_id: sambanova
provider_type: remote::sambanova
config:
url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure
config:
api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
vector_io:
- provider_id: faiss
provider_type: inline::faiss
config:
persistence:
namespace: vector_io::faiss
backend: kv_default
- provider_id: sqlite-vec
provider_type: inline::sqlite-vec
config:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db
persistence:
namespace: vector_io::sqlite_vec
backend: kv_default
- provider_id: ${env.MILVUS_URL:+milvus}
provider_type: inline::milvus
config:
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db
persistence:
namespace: vector_io::milvus
backend: kv_default
- provider_id: ${env.CHROMADB_URL:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:=}
persistence:
namespace: vector_io::chroma_remote
backend: kv_default
- provider_id: ${env.PGVECTOR_DB:+pgvector}
provider_type: remote::pgvector
config:
host: ${env.PGVECTOR_HOST:=localhost}
port: ${env.PGVECTOR_PORT:=5432}
db: ${env.PGVECTOR_DB:=}
user: ${env.PGVECTOR_USER:=}
password: ${env.PGVECTOR_PASSWORD:=}
persistence:
namespace: vector_io::pgvector
backend: kv_default
- provider_id: ${env.QDRANT_URL:+qdrant}
provider_type: remote::qdrant
config:
api_key: ${env.QDRANT_API_KEY:=}
persistence:
namespace: vector_io::qdrant_remote
backend: kv_default
- provider_id: ${env.WEAVIATE_CLUSTER_URL:+weaviate}
provider_type: remote::weaviate
config:
weaviate_api_key: null
weaviate_cluster_url: ${env.WEAVIATE_CLUSTER_URL:=localhost:8080}
persistence:
namespace: vector_io::weaviate
backend: kv_default
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
metadata_store:
table_name: files_metadata
backend: sql_default
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
- provider_id: code-scanner
provider_type: inline::code-scanner
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
post_training:
- provider_id: torchtune-cpu
provider_type: inline::torchtune-cpu
config:
checkpoint_format: meta
eval:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
kvstore:
namespace: eval
backend: kv_default
datasetio:
- provider_id: huggingface
provider_type: remote::huggingface
config:
kvstore:
namespace: datasetio::huggingface
backend: kv_default
- provider_id: localfs
provider_type: inline::localfs
config:
kvstore:
namespace: datasetio::localfs
backend: kv_default
scoring:
- provider_id: basic
provider_type: inline::basic
- provider_id: llm-as-judge
provider_type: inline::llm-as-judge
- provider_id: braintrust
provider_type: inline::braintrust
config:
openai_api_key: ${env.OPENAI_API_KEY:=}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:=}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:=}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
batches:
- provider_id: reference
provider_type: inline::reference
config:
kvstore:
namespace: batches
backend: kv_postgres
storage:
backends:
kv_postgres:
type: kv_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
sql_postgres:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
stores:
metadata:
namespace: registry
backend: kv_postgres
inference:
table_name: inference_store
backend: sql_postgres
max_write_queue_size: 10000
num_writers: 4
conversations:
table_name: openai_conversations
backend: sql_postgres
prompts:
namespace: prompts
backend: kv_postgres
registered_resources:
models: []
shields: []
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups: []
server:
port: 8321
telemetry:
enabled: true

View file

@ -46,6 +46,9 @@ providers:
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
+    config:
+      api_key: ${env.AWS_BEDROCK_API_KEY:=}
+      region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:

View file

@ -17,6 +17,11 @@ from llama_stack.core.datatypes import (
    ToolGroupInput,
    VectorStoresConfig,
)
+from llama_stack.core.storage.datatypes import (
+    InferenceStoreReference,
+    KVStoreReference,
+    SqlStoreReference,
+)
from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings
from llama_stack.providers.datatypes import RemoteProviderSpec
@ -36,6 +41,7 @@ from llama_stack.providers.remote.vector_io.pgvector.config import (
)
from llama_stack.providers.remote.vector_io.qdrant.config import QdrantVectorIOConfig
from llama_stack.providers.remote.vector_io.weaviate.config import WeaviateVectorIOConfig
+from llama_stack.providers.utils.kvstore.config import PostgresKVStoreConfig
from llama_stack.providers.utils.sqlstore.sqlstore import PostgresSqlStoreConfig
@ -181,6 +187,62 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
            provider_shield_id="${env.CODE_SCANNER_MODEL:=}",
        ),
    ]
postgres_config = PostgresSqlStoreConfig.sample_run_config()
default_overrides = {
"inference": remote_inference_providers + [embedding_provider],
"vector_io": [
Provider(
provider_id="faiss",
provider_type="inline::faiss",
config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
),
Provider(
provider_id="sqlite-vec",
provider_type="inline::sqlite-vec",
config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
),
Provider(
provider_id="${env.MILVUS_URL:+milvus}",
provider_type="inline::milvus",
config=MilvusVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
),
Provider(
provider_id="${env.CHROMADB_URL:+chromadb}",
provider_type="remote::chromadb",
config=ChromaVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}/",
url="${env.CHROMADB_URL:=}",
),
),
Provider(
provider_id="${env.PGVECTOR_DB:+pgvector}",
provider_type="remote::pgvector",
config=PGVectorVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}",
db="${env.PGVECTOR_DB:=}",
user="${env.PGVECTOR_USER:=}",
password="${env.PGVECTOR_PASSWORD:=}",
),
),
Provider(
provider_id="${env.QDRANT_URL:+qdrant}",
provider_type="remote::qdrant",
config=QdrantVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}",
url="${env.QDRANT_URL:=}",
),
),
Provider(
provider_id="${env.WEAVIATE_CLUSTER_URL:+weaviate}",
provider_type="remote::weaviate",
config=WeaviateVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}",
cluster_url="${env.WEAVIATE_CLUSTER_URL:=}",
),
),
],
"files": [files_provider],
}
    return DistributionTemplate(
        name=name,
@ -189,64 +251,10 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
        container_image=None,
        template_path=None,
        providers=providers,
-        additional_pip_packages=PostgresSqlStoreConfig.pip_packages(),
+        additional_pip_packages=list(set(PostgresSqlStoreConfig.pip_packages() + PostgresKVStoreConfig.pip_packages())),
        run_configs={
            "run.yaml": RunConfigSettings(
-                provider_overrides={
+                provider_overrides=default_overrides,
"inference": remote_inference_providers + [embedding_provider],
"vector_io": [
Provider(
provider_id="faiss",
provider_type="inline::faiss",
config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
),
Provider(
provider_id="sqlite-vec",
provider_type="inline::sqlite-vec",
config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
),
Provider(
provider_id="${env.MILVUS_URL:+milvus}",
provider_type="inline::milvus",
config=MilvusVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
),
Provider(
provider_id="${env.CHROMADB_URL:+chromadb}",
provider_type="remote::chromadb",
config=ChromaVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}/",
url="${env.CHROMADB_URL:=}",
),
),
Provider(
provider_id="${env.PGVECTOR_DB:+pgvector}",
provider_type="remote::pgvector",
config=PGVectorVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}",
db="${env.PGVECTOR_DB:=}",
user="${env.PGVECTOR_USER:=}",
password="${env.PGVECTOR_PASSWORD:=}",
),
),
Provider(
provider_id="${env.QDRANT_URL:+qdrant}",
provider_type="remote::qdrant",
config=QdrantVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}",
url="${env.QDRANT_URL:=}",
),
),
Provider(
provider_id="${env.WEAVIATE_CLUSTER_URL:+weaviate}",
provider_type="remote::weaviate",
config=WeaviateVectorIOConfig.sample_run_config(
f"~/.llama/distributions/{name}",
cluster_url="${env.WEAVIATE_CLUSTER_URL:=}",
),
),
],
"files": [files_provider],
},
                default_models=[],
                default_tool_groups=default_tool_groups,
                default_shields=default_shields,
@ -261,6 +269,55 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
                default_shield_id="llama-guard",
            ),
        ),
"run-with-postgres-store.yaml": RunConfigSettings(
provider_overrides={
**default_overrides,
"agents": [
Provider(
provider_id="meta-reference",
provider_type="inline::meta-reference",
config=dict(
persistence_store=postgres_config,
responses_store=postgres_config,
),
)
],
"batches": [
Provider(
provider_id="reference",
provider_type="inline::reference",
config=dict(
kvstore=KVStoreReference(
backend="kv_postgres",
namespace="batches",
).model_dump(exclude_none=True),
),
)
],
},
storage_backends={
"kv_postgres": PostgresKVStoreConfig.sample_run_config(),
"sql_postgres": postgres_config,
},
storage_stores={
"metadata": KVStoreReference(
backend="kv_postgres",
namespace="registry",
).model_dump(exclude_none=True),
"inference": InferenceStoreReference(
backend="sql_postgres",
table_name="inference_store",
).model_dump(exclude_none=True),
"conversations": SqlStoreReference(
backend="sql_postgres",
table_name="openai_conversations",
).model_dump(exclude_none=True),
"prompts": KVStoreReference(
backend="kv_postgres",
namespace="prompts",
).model_dump(exclude_none=True),
},
),
        },
        run_config_env_vars={
            "LLAMA_STACK_PORT": (

View file

@ -146,7 +146,7 @@ class MetaReferenceInferenceImpl(
    def check_model(self, request) -> None:
        if self.model_id is None or self.llama_model is None:
            raise RuntimeError(
-                "No avaible model yet, please register your requested model or add your model in the resouces first"
+                "No available model yet, please register your requested model or add your model in the resources first"
            )
        elif request.model != self.model_id:
            raise RuntimeError(f"Model mismatch: request model: {request.model} != loaded model: {self.model_id}")

View file

@ -91,7 +91,7 @@ class TorchtuneCheckpointer:
        if checkpoint_format == "meta" or checkpoint_format is None:
            self._save_meta_format_checkpoint(model_file_path, state_dict, adapter_only)
        elif checkpoint_format == "huggingface":
-            # Note: for saving hugging face format checkpoints, we only suppport saving adapter weights now
+            # Note: for saving hugging face format checkpoints, we only support saving adapter weights now
            self._save_hf_format_checkpoint(model_file_path, state_dict)
        else:
            raise ValueError(f"Unsupported checkpoint format: {format}")

View file

@ -25,7 +25,7 @@ def llama_stack_instruct_to_torchtune_instruct(
    )

    input_messages = json.loads(sample[ColumnName.chat_completion_input.value])
-    assert len(input_messages) == 1, "llama stack intruct dataset format only supports 1 user message"
+    assert len(input_messages) == 1, "llama stack instruct dataset format only supports 1 user message"
    input_message = input_messages[0]

    assert "content" in input_message, "content not found in input message"

View file

@ -138,10 +138,11 @@ def available_providers() -> list[ProviderSpec]:
        api=Api.inference,
        adapter_type="bedrock",
        provider_type="remote::bedrock",
-        pip_packages=["boto3"],
+        pip_packages=[],
        module="llama_stack.providers.remote.inference.bedrock",
        config_class="llama_stack.providers.remote.inference.bedrock.BedrockConfig",
-        description="AWS Bedrock inference provider for accessing various AI models through AWS's managed service.",
+        provider_data_validator="llama_stack.providers.remote.inference.bedrock.config.BedrockProviderDataValidator",
+        description="AWS Bedrock inference provider using OpenAI compatible endpoint.",
    ),
    RemoteProviderSpec(
        api=Api.inference,

View file

@ -20,6 +20,7 @@ This provider enables dataset management using NVIDIA's NeMo Customizer service.

Build the NVIDIA environment:

```bash
+uv pip install llama-stack-client
uv run llama stack list-deps nvidia | xargs -L1 uv pip install
```

View file

@ -11,7 +11,7 @@ async def get_adapter_impl(config: BedrockConfig, _deps):
    assert isinstance(config, BedrockConfig), f"Unexpected config type: {type(config)}"

-    impl = BedrockInferenceAdapter(config)
+    impl = BedrockInferenceAdapter(config=config)

    await impl.initialize()

View file

@ -4,139 +4,124 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

-import json
-from collections.abc import AsyncIterator
+from collections.abc import AsyncIterator, Iterable

-from botocore.client import BaseClient
+from openai import AuthenticationError

from llama_stack.apis.inference import (
-    ChatCompletionRequest,
-    Inference,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
    OpenAIChatCompletionRequestWithExtraBody,
+    OpenAICompletion,
    OpenAICompletionRequestWithExtraBody,
    OpenAIEmbeddingsRequestWithExtraBody,
    OpenAIEmbeddingsResponse,
)
-from llama_stack.apis.inference.inference import (
-    OpenAIChatCompletion,
-    OpenAIChatCompletionChunk,
-    OpenAICompletion,
-)
-from llama_stack.providers.remote.inference.bedrock.config import BedrockConfig
-from llama_stack.providers.utils.bedrock.client import create_bedrock_client
-from llama_stack.providers.utils.inference.model_registry import (
-    ModelRegistryHelper,
-)
-from llama_stack.providers.utils.inference.openai_compat import (
-    get_sampling_strategy_options,
-)
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    chat_completion_request_to_prompt,
-)
-
-from .models import MODEL_ENTRIES
-
-REGION_PREFIX_MAP = {
-    "us": "us.",
-    "eu": "eu.",
-    "ap": "ap.",
-}
-
-
-def _get_region_prefix(region: str | None) -> str:
-    # AWS requires region prefixes for inference profiles
-    if region is None:
-        return "us."  # default to US when we don't know
-
-    # Handle case insensitive region matching
-    region_lower = region.lower()
-    for prefix in REGION_PREFIX_MAP:
-        if region_lower.startswith(f"{prefix}-"):
-            return REGION_PREFIX_MAP[prefix]
-
-    # Fallback to US for anything we don't recognize
-    return "us."
-
-
-def _to_inference_profile_id(model_id: str, region: str = None) -> str:
-    # Return ARNs unchanged
-    if model_id.startswith("arn:"):
-        return model_id
-
-    # Return inference profile IDs that already have regional prefixes
-    if any(model_id.startswith(p) for p in REGION_PREFIX_MAP.values()):
-        return model_id
-
-    # Default to US East when no region is provided
-    if region is None:
-        region = "us-east-1"
-
-    return _get_region_prefix(region) + model_id
-
-
-class BedrockInferenceAdapter(
-    ModelRegistryHelper,
-    Inference,
-):
-    def __init__(self, config: BedrockConfig) -> None:
-        ModelRegistryHelper.__init__(self, model_entries=MODEL_ENTRIES)
-        self._config = config
-        self._client = None
-
-    @property
-    def client(self) -> BaseClient:
-        if self._client is None:
-            self._client = create_bedrock_client(self._config)
-        return self._client
-
-    async def initialize(self) -> None:
-        pass
-
-    async def shutdown(self) -> None:
-        if self._client is not None:
-            self._client.close()
-
-    async def _get_params_for_chat_completion(self, request: ChatCompletionRequest) -> dict:
-        bedrock_model = request.model
-        sampling_params = request.sampling_params
-        options = get_sampling_strategy_options(sampling_params)
-
-        if sampling_params.max_tokens:
-            options["max_gen_len"] = sampling_params.max_tokens
-        if sampling_params.repetition_penalty > 0:
-            options["repetition_penalty"] = sampling_params.repetition_penalty
-
-        prompt = await chat_completion_request_to_prompt(request, self.get_llama_model(request.model))
-
-        # Convert foundation model ID to inference profile ID
-        region_name = self.client.meta.region_name
-        inference_profile_id = _to_inference_profile_id(bedrock_model, region_name)
-
-        return {
-            "modelId": inference_profile_id,
-            "body": json.dumps(
-                {
-                    "prompt": prompt,
-                    **options,
-                }
-            ),
-        }
+from llama_stack.core.telemetry.tracing import get_current_span
+from llama_stack.log import get_logger
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
+
+from .config import BedrockConfig
+
+logger = get_logger(name=__name__, category="inference::bedrock")
+
+
+class BedrockInferenceAdapter(OpenAIMixin):
+    """
+    Adapter for AWS Bedrock's OpenAI-compatible API endpoints.
+
+    Supports Llama models across regions and GPT-OSS models (us-west-2 only).
+
+    Note: Bedrock's OpenAI-compatible endpoint does not support /v1/models
+    for dynamic model discovery. Models must be pre-registered in the config.
+    """
+
+    config: BedrockConfig
+    provider_data_api_key_field: str = "aws_bedrock_api_key"
+
+    def get_base_url(self) -> str:
+        """Get base URL for OpenAI client."""
+        return f"https://bedrock-runtime.{self.config.region_name}.amazonaws.com/openai/v1"
+
+    async def list_provider_model_ids(self) -> Iterable[str]:
+        """
+        Bedrock's OpenAI-compatible endpoint does not support the /v1/models endpoint.
+        Returns empty list since models must be pre-registered in the config.
+        """
+        return []
+
+    async def check_model_availability(self, model: str) -> bool:
+        """
+        Bedrock doesn't support dynamic model listing via /v1/models.
+        Always return True to accept all models registered in the config.
+        """
+        return True

    async def openai_embeddings(
        self,
        params: OpenAIEmbeddingsRequestWithExtraBody,
    ) -> OpenAIEmbeddingsResponse:
-        raise NotImplementedError()
+        """Bedrock's OpenAI-compatible API does not support the /v1/embeddings endpoint."""
+        raise NotImplementedError(
+            "Bedrock's OpenAI-compatible API does not support /v1/embeddings endpoint. "
+            "See https://docs.aws.amazon.com/bedrock/latest/userguide/inference-chat-completions.html"
+        )

    async def openai_completion(
        self,
        params: OpenAICompletionRequestWithExtraBody,
    ) -> OpenAICompletion:
-        raise NotImplementedError("OpenAI completion not supported by the Bedrock provider")
+        """Bedrock's OpenAI-compatible API does not support the /v1/completions endpoint."""
+        raise NotImplementedError(
+            "Bedrock's OpenAI-compatible API does not support /v1/completions endpoint. "
+            "Only /v1/chat/completions is supported. "
+            "See https://docs.aws.amazon.com/bedrock/latest/userguide/inference-chat-completions.html"
+        )

    async def openai_chat_completion(
        self,
        params: OpenAIChatCompletionRequestWithExtraBody,
    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        raise NotImplementedError("OpenAI chat completion not supported by the Bedrock provider")
+        """Override to enable streaming usage metrics and handle authentication errors."""
+        # Enable streaming usage metrics when telemetry is active
+        if params.stream and get_current_span() is not None:
+            if params.stream_options is None:
+                params.stream_options = {"include_usage": True}
+            elif "include_usage" not in params.stream_options:
+                params.stream_options = {**params.stream_options, "include_usage": True}
+
+        try:
+            logger.debug(f"Calling Bedrock OpenAI API with model={params.model}, stream={params.stream}")
+            result = await super().openai_chat_completion(params=params)
+            logger.debug(f"Bedrock API returned: {type(result).__name__ if result is not None else 'None'}")
+            if result is None:
+                logger.error(f"Bedrock OpenAI client returned None for model={params.model}, stream={params.stream}")
+                raise RuntimeError(
+                    f"Bedrock API returned no response for model '{params.model}'. "
+                    "This may indicate the model is not supported or a network/API issue occurred."
+                )
+            return result
+        except AuthenticationError as e:
+            error_msg = str(e)
+            # Check if this is a token expiration error
+            if "expired" in error_msg.lower() or "Bearer Token has expired" in error_msg:
+                logger.error(f"AWS Bedrock authentication token expired: {error_msg}")
+                raise ValueError(
+                    "AWS Bedrock authentication failed: Bearer token has expired. "
+                    "The AWS_BEDROCK_API_KEY environment variable contains an expired pre-signed URL. "
+                    "Please refresh your token by generating a new pre-signed URL with AWS credentials. "
+                    "Refer to AWS Bedrock documentation for details on OpenAI-compatible endpoints."
+                ) from e
+            else:
+                logger.error(f"AWS Bedrock authentication failed: {error_msg}")
+                raise ValueError(
+                    f"AWS Bedrock authentication failed: {error_msg}. "
+                    "Please verify your API key is correct in the provider config or x-llamastack-provider-data header. "
+                    "The API key should be a valid AWS pre-signed URL for Bedrock's OpenAI-compatible endpoint."
+                ) from e
+        except Exception as e:
+            logger.error(f"Unexpected error calling Bedrock API: {type(e).__name__}: {e}", exc_info=True)
+            raise
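
Since the adapter simply points an OpenAI client at Bedrock's OpenAI-compatible runtime endpoint, a quick out-of-band smoke test can mirror the URL that `get_base_url()` builds. In the sketch below the region, model id, and key are placeholders, not values taken from this commit:

```python
# Hypothetical smoke test against the endpoint format get_base_url() builds;
# the model id and key below are placeholders.
from openai import OpenAI

client = OpenAI(
    base_url="https://bedrock-runtime.us-east-2.amazonaws.com/openai/v1",
    api_key="<AWS_BEDROCK_API_KEY>",  # Bedrock API key / pre-signed token
)

resp = client.chat.completions.create(
    model="<bedrock-model-id>",  # placeholder; Bedrock does not expose /v1/models
    messages=[{"role": "user", "content": "Hello from the smoke test"}],
)
print(resp.choices[0].message.content)
```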

View file

@ -4,8 +4,29 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

-from llama_stack.providers.utils.bedrock.config import BedrockBaseConfig
+import os
+
+from pydantic import BaseModel, Field
+
+from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig


-class BedrockConfig(BedrockBaseConfig):
-    pass
+class BedrockProviderDataValidator(BaseModel):
+    aws_bedrock_api_key: str | None = Field(
+        default=None,
+        description="API key for Amazon Bedrock",
+    )
+
+
+class BedrockConfig(RemoteInferenceProviderConfig):
+    region_name: str = Field(
+        default_factory=lambda: os.getenv("AWS_DEFAULT_REGION", "us-east-2"),
+        description="AWS Region for the Bedrock Runtime endpoint",
+    )
+
+    @classmethod
+    def sample_run_config(cls, **kwargs):
+        return {
+            "api_key": "${env.AWS_BEDROCK_API_KEY:=}",
+            "region_name": "${env.AWS_DEFAULT_REGION:=us-east-2}",
+        }
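
Because the adapter declares `provider_data_api_key_field = "aws_bedrock_api_key"`, the key can also travel per request instead of living in the run config. A sketch of that call, where the server URL and route are assumptions rather than part of this diff:

```python
import json

import httpx

# Hypothetical per-request credential via the provider-data header the
# adapter's error message refers to; server URL and route are assumed.
headers = {
    "x-llamastack-provider-data": json.dumps({"aws_bedrock_api_key": "<key>"}),
}
payload = {
    "model": "<bedrock-model-id>",  # placeholder
    "messages": [{"role": "user", "content": "hi"}],
}
resp = httpx.post("http://localhost:8321/v1/chat/completions", headers=headers, json=payload)
print(resp.status_code)
```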

View file

@ -1,29 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.models.llama.sku_types import CoreModelId
from llama_stack.providers.utils.inference.model_registry import (
build_hf_repo_model_entry,
)
SAFETY_MODELS_ENTRIES = []
# https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html
MODEL_ENTRIES = [
build_hf_repo_model_entry(
"meta.llama3-1-8b-instruct-v1:0",
CoreModelId.llama3_1_8b_instruct.value,
),
build_hf_repo_model_entry(
"meta.llama3-1-70b-instruct-v1:0",
CoreModelId.llama3_1_70b_instruct.value,
),
build_hf_repo_model_entry(
"meta.llama3-1-405b-instruct-v1:0",
CoreModelId.llama3_1_405b_instruct.value,
),
] + SAFETY_MODELS_ENTRIES

View file

@ -18,6 +18,7 @@ This provider enables running inference using NVIDIA NIM.

Build the NVIDIA environment:

```bash
+uv pip install llama-stack-client
uv run llama stack list-deps nvidia | xargs -L1 uv pip install
```
@ -199,4 +200,4 @@ rerank_response = client.alpha.inference.rerank(
for i, result in enumerate(rerank_response):
    print(f"{i+1}. [Index: {result.index}, " f"Score: {(result.relevance_score):.3f}]")
```

View file

@ -10,8 +10,8 @@ from .config import PassthroughImplConfig

class PassthroughProviderDataValidator(BaseModel):
-    url: str
-    api_key: str
+    passthrough_url: str
+    passthrough_api_key: str


async def get_adapter_impl(config: PassthroughImplConfig, _deps):

View file

@ -6,7 +6,7 @@

from typing import Any

-from pydantic import Field, SecretStr
+from pydantic import Field

from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack.schema_utils import json_schema_type
@ -19,11 +19,6 @@ class PassthroughImplConfig(RemoteInferenceProviderConfig):
        description="The URL for the passthrough endpoint",
    )

-    api_key: SecretStr | None = Field(
-        default=None,
-        description="API Key for the passthrouth endpoint",
-    )
-
    @classmethod
    def sample_run_config(
        cls, url: str = "${env.PASSTHROUGH_URL}", api_key: str = "${env.PASSTHROUGH_API_KEY}", **kwargs

View file

@ -5,9 +5,8 @@
# the root directory of this source tree.

from collections.abc import AsyncIterator
-from typing import Any

-from llama_stack_client import AsyncLlamaStackClient
+from openai import AsyncOpenAI

from llama_stack.apis.inference import (
    Inference,
@ -20,103 +19,117 @@ from llama_stack.apis.inference import (
    OpenAIEmbeddingsResponse,
)
from llama_stack.apis.models import Model
-from llama_stack.core.library_client import convert_pydantic_to_json_value
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+from llama_stack.core.request_headers import NeedsRequestProviderData

from .config import PassthroughImplConfig


-class PassthroughInferenceAdapter(Inference):
+class PassthroughInferenceAdapter(NeedsRequestProviderData, Inference):
    def __init__(self, config: PassthroughImplConfig) -> None:
-        ModelRegistryHelper.__init__(self)
        self.config = config

-    async def initialize(self) -> None:
-        pass
-
-    async def shutdown(self) -> None:
-        pass
-
    async def unregister_model(self, model_id: str) -> None:
        pass

    async def register_model(self, model: Model) -> Model:
        return model

-    def _get_client(self) -> AsyncLlamaStackClient:
-        passthrough_url = None
-        passthrough_api_key = None
-        provider_data = None
-
-        if self.config.url is not None:
-            passthrough_url = self.config.url
-        else:
-            provider_data = self.get_request_provider_data()
-            if provider_data is None or not provider_data.passthrough_url:
-                raise ValueError(
-                    'Pass url of the passthrough endpoint in the header X-LlamaStack-Provider-Data as { "passthrough_url": <your passthrough url>}'
-                )
-            passthrough_url = provider_data.passthrough_url
-
-        if self.config.api_key is not None:
-            passthrough_api_key = self.config.api_key.get_secret_value()
-        else:
-            provider_data = self.get_request_provider_data()
-            if provider_data is None or not provider_data.passthrough_api_key:
-                raise ValueError(
-                    'Pass API Key for the passthrough endpoint in the header X-LlamaStack-Provider-Data as { "passthrough_api_key": <your api key>}'
-                )
-            passthrough_api_key = provider_data.passthrough_api_key
-
-        return AsyncLlamaStackClient(
-            base_url=passthrough_url,
-            api_key=passthrough_api_key,
-            provider_data=provider_data,
-        )
-
-    async def openai_embeddings(
-        self,
-        params: OpenAIEmbeddingsRequestWithExtraBody,
-    ) -> OpenAIEmbeddingsResponse:
-        raise NotImplementedError()
+    async def list_models(self) -> list[Model]:
+        """List models by calling the downstream /v1/models endpoint."""
+        client = self._get_openai_client()
+        response = await client.models.list()
+
+        # Convert from OpenAI format to Llama Stack Model format
+        models = []
+        for model_data in response.data:
+            downstream_model_id = model_data.id
+            custom_metadata = getattr(model_data, "custom_metadata", {}) or {}
+
+            # Prefix identifier with provider ID for local registry
+            local_identifier = f"{self.__provider_id__}/{downstream_model_id}"
+
+            model = Model(
+                identifier=local_identifier,
+                provider_id=self.__provider_id__,
+                provider_resource_id=downstream_model_id,
+                model_type=custom_metadata.get("model_type", "llm"),
+                metadata=custom_metadata,
+            )
+            models.append(model)
+
+        return models
+
+    async def should_refresh_models(self) -> bool:
+        """Passthrough should refresh models since they come from downstream dynamically."""
+        return self.config.refresh_models
+
+    def _get_openai_client(self) -> AsyncOpenAI:
+        """Get an AsyncOpenAI client configured for the downstream server."""
+        base_url = self._get_passthrough_url()
+        api_key = self._get_passthrough_api_key()
+        return AsyncOpenAI(
+            base_url=f"{base_url.rstrip('/')}/v1",
+            api_key=api_key,
+        )
+
+    def _get_passthrough_url(self) -> str:
+        """Get the passthrough URL from config or provider data."""
+        if self.config.url is not None:
+            return self.config.url
+
+        provider_data = self.get_request_provider_data()
+        if provider_data is None:
+            raise ValueError(
+                'Pass url of the passthrough endpoint in the header X-LlamaStack-Provider-Data as { "passthrough_url": <your passthrough url>}'
+            )
+        return provider_data.passthrough_url
+
+    def _get_passthrough_api_key(self) -> str:
+        """Get the passthrough API key from config or provider data."""
+        if self.config.auth_credential is not None:
+            return self.config.auth_credential.get_secret_value()
+
+        provider_data = self.get_request_provider_data()
+        if provider_data is None:
+            raise ValueError(
+                'Pass API Key for the passthrough endpoint in the header X-LlamaStack-Provider-Data as { "passthrough_api_key": <your api key>}'
+            )
+        return provider_data.passthrough_api_key

    async def openai_completion(
        self,
        params: OpenAICompletionRequestWithExtraBody,
    ) -> OpenAICompletion:
-        client = self._get_client()
-        model_obj = await self.model_store.get_model(params.model)
-
-        params = params.model_copy()
-        params.model = model_obj.provider_resource_id
-
+        """Forward completion request to downstream using OpenAI client."""
+        client = self._get_openai_client()
        request_params = params.model_dump(exclude_none=True)
-
-        return await client.inference.openai_completion(**request_params)
+        response = await client.completions.create(**request_params)
+        return response  # type: ignore

    async def openai_chat_completion(
        self,
        params: OpenAIChatCompletionRequestWithExtraBody,
    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        client = self._get_client()
-        model_obj = await self.model_store.get_model(params.model)
-
-        params = params.model_copy()
-        params.model = model_obj.provider_resource_id
-
+        """Forward chat completion request to downstream using OpenAI client."""
+        client = self._get_openai_client()
        request_params = params.model_dump(exclude_none=True)
-
-        return await client.inference.openai_chat_completion(**request_params)
-
-    def cast_value_to_json_dict(self, request_params: dict[str, Any]) -> dict[str, Any]:
-        json_params = {}
-        for key, value in request_params.items():
-            json_input = convert_pydantic_to_json_value(value)
-            if isinstance(json_input, dict):
-                json_input = {k: v for k, v in json_input.items() if v is not None}
-            elif isinstance(json_input, list):
-                json_input = [x for x in json_input if x is not None]
-                new_input = []
-                for x in json_input:
-                    if isinstance(x, dict):
-                        x = {k: v for k, v in x.items() if v is not None}
-                    new_input.append(x)
-                json_input = new_input
-            json_params[key] = json_input
-
-        return json_params
+        response = await client.chat.completions.create(**request_params)
+        return response  # type: ignore
+
+    async def openai_embeddings(
+        self,
+        params: OpenAIEmbeddingsRequestWithExtraBody,
+    ) -> OpenAIEmbeddingsResponse:
+        """Forward embeddings request to downstream using OpenAI client."""
+        client = self._get_openai_client()
+        request_params = params.model_dump(exclude_none=True)
+        response = await client.embeddings.create(**request_params)
+        return response  # type: ignore
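
The same provider-data mechanism drives the rewritten passthrough adapter: when `url` and `auth_credential` are absent from the provider config, each request must carry both fields named in the error strings above. A sketch, with the server URL, route, and model id as placeholders; the header casing follows the error strings:

```python
import json

import httpx

# Hypothetical request supplying the downstream coordinates per call; only
# the header name and JSON keys come from the code above, the rest is assumed.
provider_data = {
    "passthrough_url": "http://downstream-stack:8080",
    "passthrough_api_key": "<key>",
}
payload = {
    # list_models() registers downstream ids as "<provider_id>/<model_id>"
    "model": "passthrough/<downstream-model-id>",
    "messages": [{"role": "user", "content": "hi"}],
}
resp = httpx.post(
    "http://localhost:8321/v1/chat/completions",  # assumed server route
    headers={"X-LlamaStack-Provider-Data": json.dumps(provider_data)},
    json=payload,
)
print(resp.status_code)
```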

View file

@ -22,6 +22,7 @@ This provider enables fine-tuning of LLMs using NVIDIA's NeMo Customizer service
Build the NVIDIA environment: Build the NVIDIA environment:
```bash ```bash
uv pip install llama-stack-client
uv run llama stack list-deps nvidia | xargs -L1 uv pip install uv run llama stack list-deps nvidia | xargs -L1 uv pip install
``` ```

View file

@ -19,6 +19,7 @@ This provider enables safety checks and guardrails for LLM interactions using NV
Build the NVIDIA environment: Build the NVIDIA environment:
```bash ```bash
uv pip install llama-stack-client
uv run llama stack list-deps nvidia | xargs -L1 uv pip install uv run llama stack list-deps nvidia | xargs -L1 uv pip install
``` ```

View file

@ -26,6 +26,7 @@ from llama_stack.apis.vector_io import (
    VectorStoreChunkingStrategy,
    VectorStoreChunkingStrategyAuto,
    VectorStoreChunkingStrategyStatic,
+    VectorStoreChunkingStrategyStaticConfig,
    VectorStoreContent,
    VectorStoreDeleteResponse,
    VectorStoreFileBatchObject,
@ -414,6 +415,10 @@ class OpenAIVectorStoreMixin(ABC):
            in_progress=0,
            total=0,
        )
+
+        if not params.chunking_strategy or params.chunking_strategy.type == "auto":
+            chunking_strategy = VectorStoreChunkingStrategyStatic(static=VectorStoreChunkingStrategyStaticConfig())
+        else:
+            chunking_strategy = params.chunking_strategy
+
        store_info: dict[str, Any] = {
            "id": vector_store_id,
            "object": "vector_store",
@ -426,7 +431,7 @@ class OpenAIVectorStoreMixin(ABC):
            "expires_at": None,
            "last_active_at": created_at,
            "file_ids": [],
-            "chunking_strategy": params.chunking_strategy,
+            "chunking_strategy": chunking_strategy.model_dump(),
        }

        # Add provider information to metadata if provided
@ -637,7 +642,7 @@ class OpenAIVectorStoreMixin(ABC):
                    break

        return VectorStoreSearchResponsePage(
-            search_query=search_query,
+            search_query=query if isinstance(query, list) else [query],
            data=data,
            has_more=False,  # For simplicity, we don't implement pagination here
            next_page=None,
@ -647,7 +652,7 @@ class OpenAIVectorStoreMixin(ABC):
        logger.error(f"Error searching vector store {vector_store_id}: {e}")
        # Return empty results on error
        return VectorStoreSearchResponsePage(
-            search_query=search_query,
+            search_query=query if isinstance(query, list) else [query],
            data=[],
            has_more=False,
            next_page=None,
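
For reference, the normalization introduced above can be read in isolation: a missing or `auto` chunking strategy is persisted as an explicit static one, so every stored vector-store record carries a concrete, serializable configuration. A standalone sketch with stubbed types (the field names and token defaults are assumptions, not taken from this diff):

```python
from dataclasses import dataclass, field


# Stub types standing in for the real VectorStoreChunkingStrategy* classes.
@dataclass
class StaticConfig:
    max_chunk_size_tokens: int = 800  # assumed default
    chunk_overlap_tokens: int = 400   # assumed default


@dataclass
class StaticStrategy:
    static: StaticConfig = field(default_factory=StaticConfig)
    type: str = "static"


@dataclass
class AutoStrategy:
    type: str = "auto"


def normalize_chunking_strategy(strategy):
    # Mirror of the new logic: fall back to an explicit static strategy.
    if strategy is None or strategy.type == "auto":
        return StaticStrategy()
    return strategy


assert normalize_chunking_strategy(None).type == "static"
assert normalize_chunking_strategy(AutoStrategy()).type == "static"
```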

View file

@ -0,0 +1,59 @@
{
"test_id": "tests/integration/agents/test_openai_responses.py::test_output_safety_guardrails_unsafe_content[openai_client-txt=ollama/llama3.2:3b-instruct-fp16-False]",
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama-guard3:1b",
"messages": [
{
"role": "user",
"content": "Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes.\nS2: Non-Violent Crimes.\nS3: Sex Crimes.\nS4: Child Exploitation.\nS5: Defamation.\nS6: Specialized Advice.\nS7: Privacy.\nS8: Intellectual Property.\nS9: Indiscriminate Weapons.\nS10: Hate.\nS11: Self-Harm.\nS12: Sexual Content.\nS13: Elections.\nS14: Code Interpreter Abuse.\n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\nUser: Violent content refers to materials, such as films, television shows, video games, or literature, that depict or glorify violence, aggression, or\n\n<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST User message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories."
}
],
"stream": false,
"temperature": 0.0
},
"endpoint": "/v1/chat/completions",
"model": "llama-guard3:1b"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-007a9180a7aa",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "safe",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 0,
"model": "llama-guard3:1b",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 2,
"prompt_tokens": 414,
"total_tokens": 416,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
},
"id_normalization_mapping": {}
}
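
The recording above exercises Llama Guard as an ordinary chat completion: the policy categories are inlined in a single user message, `temperature` is pinned to 0.0, and the model answers "safe" or "unsafe" on the first line. (The doubled `/v1/v1` in the recorded URL is likely the `/v1/chat/completions` endpoint path appended to a base URL that already ends in `/v1`.) A hedged sketch of replaying this call with the `openai` client against Ollama's OpenAI-compatible endpoint; the abbreviated prompt and the throwaway API key are placeholders:

```python
# Sketch: replay the recorded guard call with the openai client pointed at
# Ollama. The prompt is abbreviated; the fixture carries the full text.
from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:11434/v1", api_key="not-needed")

resp = client.chat.completions.create(
    model="llama-guard3:1b",
    messages=[{"role": "user", "content": "...full Llama Guard prompt as recorded above..."}],
    stream=False,
    temperature=0.0,
)

# The guard replies "safe", or "unsafe" plus a comma-separated category list.
lines = resp.choices[0].message.content.strip().splitlines()
is_safe = lines[0] == "safe"
violated = lines[1].split(",") if not is_safe and len(lines) > 1 else []
print(is_safe, violated)
```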

View file

@ -0,0 +1,233 @@
{
"test_id": "tests/integration/agents/test_openai_responses.py::test_list_response_input_items[openai_client-txt=ollama/llama3.2:3b-instruct-fp16]",
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
],
"stream": true
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-00bf38cb0b6e",
"choices": [
{
"delta": {
"content": "The",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-00bf38cb0b6e",
"choices": [
{
"delta": {
"content": " capital",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-00bf38cb0b6e",
"choices": [
{
"delta": {
"content": " of",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-00bf38cb0b6e",
"choices": [
{
"delta": {
"content": " France",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-00bf38cb0b6e",
"choices": [
{
"delta": {
"content": " is",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-00bf38cb0b6e",
"choices": [
{
"delta": {
"content": " Paris",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-00bf38cb0b6e",
"choices": [
{
"delta": {
"content": ".",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-00bf38cb0b6e",
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
}
],
"is_streaming": true
}
}
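
The streaming fixture above stores one chunk per delta ("The", " capital", ..., "."), with a final empty-content chunk carrying `finish_reason: "stop"`. A client reconstructs the reply by concatenating the deltas; a minimal sketch, again assuming Ollama's OpenAI-compatible endpoint:

```python
# Sketch: consume a streamed completion like the fixture above and reassemble
# the delta fragments into a single string.
from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:11434/v1", api_key="not-needed")

stream = client.chat.completions.create(
    model="llama3.2:3b-instruct-fp16",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    stream=True,
)

parts = []
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:  # the final chunk has finish_reason="stop" and empty content
        parts.append(delta)

print("".join(parts))  # e.g. "The capital of France is Paris."
```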

View file

@ -0,0 +1,59 @@
{
"test_id": "tests/integration/agents/test_openai_responses.py::test_output_safety_guardrails_safe_content[openai_client-txt=ollama/llama3.2:3b-instruct-fp16-True]",
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama-guard3:1b",
"messages": [
{
"role": "user",
"content": "Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes.\nS2: Non-Violent Crimes.\nS3: Sex Crimes.\nS4: Child Exploitation.\nS5: Defamation.\nS6: Specialized Advice.\nS7: Privacy.\nS8: Intellectual Property.\nS9: Indiscriminate Weapons.\nS10: Hate.\nS11: Self-Harm.\nS12: Sexual Content.\nS13: Elections.\nS14: Code Interpreter Abuse.\n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\nUser: I don't have a personal name, but I'm an AI designed to assist and communicate with users in a helpful and informative way. You can think of me as a conversational robot or a digital assistant. If you'd like, I can also generate a nickname\n\n<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST User message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories."
}
],
"stream": false,
"temperature": 0.0
},
"endpoint": "/v1/chat/completions",
"model": "llama-guard3:1b"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-01175978d117",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "safe",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 0,
"model": "llama-guard3:1b",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 2,
"prompt_tokens": 437,
"total_tokens": 439,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
},
"id_normalization_mapping": {}
}

View file

@ -0,0 +1,59 @@
{
"test_id": "tests/integration/agents/test_openai_responses.py::test_output_safety_guardrails_unsafe_content[openai_client-txt=ollama/llama3.2:3b-instruct-fp16-False]",
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama-guard3:1b",
"messages": [
{
"role": "user",
"content": "Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes.\nS2: Non-Violent Crimes.\nS3: Sex Crimes.\nS4: Child Exploitation.\nS5: Defamation.\nS6: Specialized Advice.\nS7: Privacy.\nS8: Intellectual Property.\nS9: Indiscriminate Weapons.\nS10: Hate.\nS11: Self-Harm.\nS12: Sexual Content.\nS13: Elections.\nS14: Code Interpreter Abuse.\n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\nUser: Violent content refers to media, such as films, television shows, video games, and literature, that depict graphic violence, gore, or intensity of conflict. This type of content often includes scenes of violence\n\n<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST User message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories."
}
],
"stream": false,
"temperature": 0.0
},
"endpoint": "/v1/chat/completions",
"model": "llama-guard3:1b"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-01bf932b8a65",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "safe",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 0,
"model": "llama-guard3:1b",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 2,
"prompt_tokens": 425,
"total_tokens": 427,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
},
"id_normalization_mapping": {}
}

Some files were not shown because too many files have changed in this diff.