Merge branch 'main' into fix-vector

2025-12-03 01:48:05 +00:00 · 2025-11-08 23:43:14 -05:00 · 2025-11-08 23:43:14 -05:00 · 33950adaf2
commit 33950adaf2
parent ef16d4cadb 8f4c431370
1730 changed files with 121537 additions and 382737 deletions
--- a/.coveragerc
+++ b/.coveragerc
@ -5,7 +5,7 @@ omit =
    */llama_stack/templates/*
    .venv/*
    */llama_stack/cli/scripts/*
-    */llama_stack/ui/*
+    */llama_stack_ui/*
    */llama_stack/distribution/ui/*
    */llama_stack/strong_typing/*
    */llama_stack/env.py
--- a/.github/actions/run-and-record-tests/action.yml
+++ b/.github/actions/run-and-record-tests/action.yml
@ -72,7 +72,8 @@ runs:
          echo "New recordings detected, committing and pushing"
          git add tests/integration/

-          git commit -m "Recordings update from CI (suite: ${{ inputs.suite }})"
+          git commit -m "Recordings update from CI (setup: ${{ inputs.setup }}, suite: ${{ inputs.suite }})"
+
          git fetch origin ${{ github.ref_name }}
          git rebase origin/${{ github.ref_name }}
          echo "Rebased successfully"
@ -88,6 +89,8 @@ runs:
      run: |
        # Ollama logs (if ollama container exists)
        sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log 2>&1 || true
+        # vllm logs (if vllm container exists)
+        sudo docker logs vllm > vllm-${{ inputs.inference-mode }}.log 2>&1 || true
        # Note: distro container logs are now dumped in integration-tests.sh before container is removed

    - name: Upload logs
--- a/.github/actions/setup-vllm/action.yml
+++ b/.github/actions/setup-vllm/action.yml
@ -11,13 +11,14 @@ runs:
          --name vllm \
          -p 8000:8000 \
          --privileged=true \
-          quay.io/higginsd/vllm-cpu:65393ee064 \
+          quay.io/higginsd/vllm-cpu:65393ee064-qwen3 \
          --host 0.0.0.0 \
          --port 8000 \
          --enable-auto-tool-choice \
-          --tool-call-parser llama3_json \
-          --model /root/.cache/Llama-3.2-1B-Instruct \
-          --served-model-name meta-llama/Llama-3.2-1B-Instruct
+          --tool-call-parser hermes \
+          --model /root/.cache/Qwen3-0.6B \
+          --served-model-name Qwen/Qwen3-0.6B \
+          --max-model-len 8192

          # Wait for vllm to be ready
          echo "Waiting for vllm to be ready..."
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -22,7 +22,7 @@ updates:
      prefix: chore(python-deps)

  - package-ecosystem: npm
-    directory: "/llama_stack/ui"
+    directory: "/llama_stack_ui"
    schedule:
      interval: "weekly"
      day: "saturday"
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@ -18,6 +18,7 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
 | Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
 | Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |
 | Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec |
+| Stainless SDK Builds | [stainless-builds.yml](stainless-builds.yml) | Build Stainless SDK from OpenAPI spec changes |
 | Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action |
 | Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module |
 | Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms |
--- a/.github/workflows/integration-auth-tests.yml
+++ b/.github/workflows/integration-auth-tests.yml
@ -14,7 +14,7 @@ on:
    paths:
      - 'distributions/**'
      - 'src/llama_stack/**'
-      - '!src/llama_stack/ui/**'
+      - '!src/llama_stack_ui/**'
      - 'tests/integration/**'
      - 'uv.lock'
      - 'pyproject.toml'
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@ -14,7 +14,7 @@ on:
    types: [opened, synchronize, reopened]
    paths:
      - 'src/llama_stack/**'
-      - '!src/llama_stack/ui/**'
+      - '!src/llama_stack_ui/**'
      - 'tests/**'
      - 'uv.lock'
      - 'pyproject.toml'
@ -23,10 +23,10 @@ on:
      - '.github/actions/setup-test-environment/action.yml'
      - '.github/actions/run-and-record-tests/action.yml'
      - 'scripts/integration-tests.sh'
+      - 'scripts/generate_ci_matrix.py'
  schedule:
    # If changing the cron schedule, update the provider in the test-matrix job
    - cron: '0 0 * * *'  # (test latest client) Daily at 12 AM UTC
-    - cron: '1 0 * * 0'  # (test vllm) Weekly on Sunday at 1 AM UTC
  workflow_dispatch:
    inputs:
      test-all-client-versions:
@ -44,8 +44,27 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  generate-matrix:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+
+      - name: Generate test matrix
+        id: set-matrix
+        run: |
+          # Generate matrix from CI_MATRIX in tests/integration/suites.py
+          # Supports schedule-based and manual input overrides
+          MATRIX=$(PYTHONPATH=. python3 scripts/generate_ci_matrix.py \
+            --schedule "${{ github.event.schedule }}" \
+            --test-setup "${{ github.event.inputs.test-setup }}")
+          echo "matrix=$MATRIX" >> $GITHUB_OUTPUT
+          echo "Generated matrix: $MATRIX"

  run-replay-mode-tests:
+    needs: generate-matrix
    runs-on: ubuntu-latest
    name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.config.setup, matrix.python-version, matrix.client-version, matrix.config.suite) }}

@ -56,18 +75,9 @@ jobs:
        # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
        python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
        client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
-        # Define (setup, suite) pairs - they are always matched and cannot be independent
-        # Weekly schedule (Sun 1 AM): vllm+base
-        # Input test-setup=ollama-vision: ollama-vision+vision
-        # Default (including test-setup=ollama): ollama+base, ollama-vision+vision, gpt+responses
-        config: >-
-          ${{
-            github.event.schedule == '1 0 * * 0'
-              && fromJSON('[{"setup": "vllm", "suite": "base"}]')
-            || github.event.inputs.test-setup == 'ollama-vision'
-              && fromJSON('[{"setup": "ollama-vision", "suite": "vision"}]')
-            || fromJSON('[{"setup": "ollama", "suite": "base"}, {"setup": "ollama-vision", "suite": "vision"}, {"setup": "gpt", "suite": "responses"}]')
-          }}
+        # Test configurations: Generated from CI_MATRIX in tests/integration/suites.py
+        # See scripts/generate_ci_matrix.py for generation logic
+        config: ${{ fromJSON(needs.generate-matrix.outputs.matrix).include }}

    steps:
      - name: Checkout repository
--- a/.github/workflows/integration-vector-io-tests.yml
+++ b/.github/workflows/integration-vector-io-tests.yml
@ -13,7 +13,7 @@ on:
      - 'release-[0-9]+.[0-9]+.x'
    paths:
      - 'src/llama_stack/**'
-      - '!src/llama_stack/ui/**'
+      - '!src/llama_stack_ui/**'
      - 'tests/integration/vector_io/**'
      - 'uv.lock'
      - 'pyproject.toml'
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -43,14 +43,14 @@ jobs:
        with:
          node-version: '20'
          cache: 'npm'
-          cache-dependency-path: 'src/llama_stack/ui/'
+          cache-dependency-path: 'src/llama_stack_ui/'

      - name: Set up uv
        uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2

      - name: Install npm dependencies
        run: npm ci
-        working-directory: src/llama_stack/ui
+        working-directory: src/llama_stack_ui

      - name: Install pre-commit
        run: python -m pip install pre-commit
@ -165,3 +165,14 @@ jobs:
            echo "::error::Full mypy failed. Reproduce locally with 'uv run pre-commit run mypy-full --hook-stage manual --all-files'."
          fi
          exit $status
+
+      - name: Check if any unused recordings
+        run: |
+          set -e
+          PYTHONPATH=$PWD uv run ./scripts/cleanup_recordings.py --delete
+          changes=$(git status --short tests/integration | grep 'recordings' || true)
+          if [ -n "$changes" ]; then
+            echo "::error::Unused integration recordings detected. Run 'PYTHONPATH=$(pwd) uv run ./scripts/cleanup_recordings.py --delete' locally and commit the deletions."
+            echo "$changes"
+            exit 1
+          fi
--- a/.github/workflows/python-build-test.yml
+++ b/.github/workflows/python-build-test.yml
@ -10,7 +10,7 @@ on:
    branches:
      - main
    paths-ignore:
-        - 'src/llama_stack/ui/**'
+        - 'src/llama_stack_ui/**'

 jobs:
  build:
--- a/.github/workflows/stainless-builds.yml
+++ b/.github/workflows/stainless-builds.yml
@ -0,0 +1,110 @@
+name: Stainless SDK Builds
+run-name: Build Stainless SDK from OpenAPI spec changes
+
+# This workflow uses pull_request_target, which allows it to run on pull requests
+# from forks with access to secrets. This is safe because the workflow definition
+# comes from the base branch (trusted), and the action only reads OpenAPI spec
+# files without executing any code from the PR.
+
+on:
+  pull_request_target:
+    types:
+      - opened
+      - synchronize
+      - reopened
+      - closed
+    paths:
+      - "client-sdks/stainless/**"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
+  cancel-in-progress: true
+
+env:
+  # Stainless organization name.
+  STAINLESS_ORG: llamastack
+
+  # Stainless project name.
+  STAINLESS_PROJECT: llama-stack-client
+
+  # Path to your OpenAPI spec.
+  OAS_PATH: ./client-sdks/stainless/openapi.yml
+
+  # Path to your Stainless config. Optional; only provide this if you prefer
+  # to maintain the ground truth Stainless config in your own repo.
+  CONFIG_PATH: ./client-sdks/stainless/config.yml
+
+  # When to fail the job based on build conclusion.
+  # Options: "never" | "note" | "warning" | "error" | "fatal".
+  FAIL_ON: error
+
+  # In your repo secrets, configure:
+  # - STAINLESS_API_KEY: a Stainless API key, which you can generate on the
+  #   Stainless organization dashboard
+
+jobs:
+  preview:
+    if: github.event.action != 'closed'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: write
+    steps:
+      # Checkout the PR's code to access the OpenAPI spec and config files.
+      # This is necessary to read the spec/config from the PR (including from forks).
+      - name: Checkout repository
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          ref: ${{ github.event.pull_request.head.sha }}
+          fetch-depth: 2
+
+      # This action builds preview SDKs from the OpenAPI spec changes and
+      # posts/updates a comment on the PR with build results and links to the preview.
+      - name: Run preview builds
+        uses: stainless-api/upload-openapi-spec-action/preview@32823b096b4319c53ee948d702d9052873af485f # 1.6.0
+        with:
+          stainless_api_key: ${{ secrets.STAINLESS_API_KEY }}
+          org: ${{ env.STAINLESS_ORG }}
+          project: ${{ env.STAINLESS_PROJECT }}
+          oas_path: ${{ env.OAS_PATH }}
+          config_path: ${{ env.CONFIG_PATH }}
+          fail_on: ${{ env.FAIL_ON }}
+          base_sha: ${{ github.event.pull_request.base.sha }}
+          base_ref: ${{ github.event.pull_request.base.ref }}
+          head_sha: ${{ github.event.pull_request.head.sha }}
+
+  merge:
+    if: github.event.action == 'closed' && github.event.pull_request.merged == true
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: write
+    steps:
+      # Checkout the PR's code to access the OpenAPI spec and config files.
+      # This is necessary to read the spec/config from the PR (including from forks).
+      - name: Checkout repository
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          ref: ${{ github.event.pull_request.head.sha }}
+          fetch-depth: 2
+
+      # Note that this only merges in changes that happened on the last build on
+      # preview/${{ github.head_ref }}. It's possible that there are OAS/config
+      # changes that haven't been built, if the preview job didn't finish
+      # before this step starts. In theory we want to wait for all builds
+      # against preview/${{ github.head_ref }} to complete, but assuming that
+      # the preview job happens before the PR merge, it should be fine.
+      - name: Run merge build
+        uses: stainless-api/upload-openapi-spec-action/merge@32823b096b4319c53ee948d702d9052873af485f # 1.6.0
+        with:
+          stainless_api_key: ${{ secrets.STAINLESS_API_KEY }}
+          org: ${{ env.STAINLESS_ORG }}
+          project: ${{ env.STAINLESS_PROJECT }}
+          oas_path: ${{ env.OAS_PATH }}
+          config_path: ${{ env.CONFIG_PATH }}
+          fail_on: ${{ env.FAIL_ON }}
+          base_sha: ${{ github.event.pull_request.base.sha }}
+          base_ref: ${{ github.event.pull_request.base.ref }}
+          head_sha: ${{ github.event.pull_request.head.sha }}
--- a/.github/workflows/test-external.yml
+++ b/.github/workflows/test-external.yml
@ -9,7 +9,7 @@ on:
    branches: [ main ]
    paths:
      - 'src/llama_stack/**'
-      - '!src/llama_stack/ui/**'
+      - '!src/llama_stack_ui/**'
      - 'tests/integration/**'
      - 'uv.lock'
      - 'pyproject.toml'
--- a/.github/workflows/ui-unit-tests.yml
+++ b/.github/workflows/ui-unit-tests.yml
@ -8,7 +8,7 @@ on:
  pull_request:
    branches: [ main ]
    paths:
-      - 'src/llama_stack/ui/**'
+      - 'src/llama_stack_ui/**'
      - '.github/workflows/ui-unit-tests.yml' # This workflow
  workflow_dispatch:

@ -33,22 +33,22 @@ jobs:
        with:
          node-version: ${{ matrix.node-version }}
          cache: 'npm'
-          cache-dependency-path: 'src/llama_stack/ui/package-lock.json'
+          cache-dependency-path: 'src/llama_stack_ui/package-lock.json'

      - name: Install dependencies
-        working-directory: src/llama_stack/ui
+        working-directory: src/llama_stack_ui
        run: npm ci

      - name: Run linting
-        working-directory: src/llama_stack/ui
+        working-directory: src/llama_stack_ui
        run: npm run lint

      - name: Run format check
-        working-directory: src/llama_stack/ui
+        working-directory: src/llama_stack_ui
        run: npm run format:check

      - name: Run unit tests
-        working-directory: src/llama_stack/ui
+        working-directory: src/llama_stack_ui
        env:
          CI: true

--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@ -13,7 +13,7 @@ on:
      - 'release-[0-9]+.[0-9]+.x'
    paths:
      - 'src/llama_stack/**'
-      - '!src/llama_stack/ui/**'
+      - '!src/llama_stack_ui/**'
      - 'tests/unit/**'
      - 'uv.lock'
      - 'pyproject.toml'
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -161,7 +161,7 @@ repos:
        name: Format & Lint UI
        entry: bash ./scripts/run-ui-linter.sh
        language: system
-        files: ^src/llama_stack/ui/.*\.(ts|tsx)$
+        files: ^src/llama_stack_ui/.*\.(ts|tsx)$
        pass_filenames: false
        require_serial: true

--- a/client-sdks/stainless/README.md
+++ b/client-sdks/stainless/README.md
@ -1,8 +1,8 @@
 These are the source-of-truth configuration files used to generate the Stainless client SDKs via Stainless.

 - `openapi.yml`: this is the OpenAPI specification for the Llama Stack API.
- `openapi.stainless.yml`: this is the Stainless _configuration_ which instructs Stainless how to generate the client SDKs.
+- `config.yml`: this is the Stainless _configuration_ which instructs Stainless how to generate the client SDKs.

 A small side note: notice the `.yml` suffixes since Stainless uses that suffix typically for its configuration files.

-These files go hand-in-hand. As of now, only the `openapi.yml` file is automatically generated using the `run_openapi_generator.sh` script.
+These files go hand-in-hand. As of now, only the `openapi.yml` file is automatically generated using the `run_openapi_generator.sh` script.
--- a/client-sdks/stainless/config.yml
+++ b/client-sdks/stainless/config.yml
@ -0,0 +1,521 @@
+# yaml-language-server: $schema=https://app.stainlessapi.com/config-internal.schema.json
+
+organization:
+  # Name of your organization or company, used to determine the name of the client
+  # and headings.
+  name: llama-stack-client
+  docs: https://llama-stack.readthedocs.io/en/latest/
+  contact: llamastack@meta.com
+security:
+  - {}
+  - BearerAuth: []
+security_schemes:
+  BearerAuth:
+    type: http
+    scheme: bearer
+# `targets` define the output targets and their customization options, such as
+# whether to emit the Node SDK and what its package name should be.
+targets:
+  node:
+    package_name: llama-stack-client
+    production_repo: llamastack/llama-stack-client-typescript
+    publish:
+      npm: false
+  python:
+    package_name: llama_stack_client
+    production_repo: llamastack/llama-stack-client-python
+    options:
+      use_uv: true
+    publish:
+      pypi: true
+    project_name: llama_stack_client
+  kotlin:
+    reverse_domain: com.llama_stack_client.api
+    production_repo: null
+    publish:
+      maven: false
+  go:
+    package_name: llama-stack-client
+    production_repo: llamastack/llama-stack-client-go
+    options:
+      enable_v2: true
+      back_compat_use_shared_package: false
+
+# `client_settings` define settings for the API client, such as extra constructor
+# arguments (used for authentication), retry behavior, idempotency, etc.
+client_settings:
+  default_env_prefix: LLAMA_STACK_CLIENT
+  opts:
+    api_key:
+      type: string
+      read_env: LLAMA_STACK_CLIENT_API_KEY
+      auth: { security_scheme: BearerAuth }
+      nullable: true
+
+# `environments` are a map of the name of the environment (e.g. "sandbox",
+# "production") to the corresponding url to use.
+environments:
+  production: http://any-hosted-llama-stack.com
+
+# `pagination` defines [pagination schemes] which provides a template to match
+# endpoints and generate next-page and auto-pagination helpers in the SDKs.
+pagination:
+  - name: datasets_iterrows
+    type: offset
+    request:
+      dataset_id:
+        type: string
+      start_index:
+        type: integer
+        x-stainless-pagination-property:
+          purpose: offset_count_param
+      limit:
+        type: integer
+    response:
+      data:
+        type: array
+        items:
+          type: object
+      next_index:
+        type: integer
+        x-stainless-pagination-property:
+          purpose: offset_count_start_field
+  - name: openai_cursor_page
+    type: cursor
+    request:
+      limit:
+        type: integer
+      after:
+        type: string
+        x-stainless-pagination-property:
+          purpose: next_cursor_param
+    response:
+      data:
+        type: array
+        items: {}
+      has_more:
+        type: boolean
+      last_id:
+        type: string
+        x-stainless-pagination-property:
+          purpose: next_cursor_field
+# `resources` define the structure and organization for your API, such as how
+# methods and models are grouped together and accessed. See the [configuration
+# guide] for more information.
+#
+# [configuration guide]:
+#   https://app.stainlessapi.com/docs/guides/configure#resources
+resources:
+  $shared:
+    models:
+      interleaved_content_item: InterleavedContentItem
+      interleaved_content: InterleavedContent
+      param_type: ParamType
+      safety_violation: SafetyViolation
+      sampling_params: SamplingParams
+      scoring_result: ScoringResult
+      system_message: SystemMessage
+      query_result: RAGQueryResult
+      document: RAGDocument
+      query_config: RAGQueryConfig
+  toolgroups:
+    models:
+      tool_group: ToolGroup
+      list_tool_groups_response: ListToolGroupsResponse
+    methods:
+      register: post /v1/toolgroups
+      get: get /v1/toolgroups/{toolgroup_id}
+      list: get /v1/toolgroups
+      unregister: delete /v1/toolgroups/{toolgroup_id}
+  tools:
+    methods:
+      get: get /v1/tools/{tool_name}
+      list:
+        endpoint: get /v1/tools
+        paginated: false
+
+  tool_runtime:
+    models:
+      tool_def: ToolDef
+      tool_invocation_result: ToolInvocationResult
+    methods:
+      list_tools:
+        endpoint: get /v1/tool-runtime/list-tools
+        paginated: false
+      invoke_tool: post /v1/tool-runtime/invoke
+    subresources:
+      rag_tool:
+        methods:
+          insert: post /v1/tool-runtime/rag-tool/insert
+          query: post /v1/tool-runtime/rag-tool/query
+
+  responses:
+    models:
+      response_object_stream: OpenAIResponseObjectStream
+      response_object: OpenAIResponseObject
+    methods:
+      create:
+        type: http
+        endpoint: post /v1/responses
+        streaming:
+          stream_event_model: responses.response_object_stream
+          param_discriminator: stream
+      retrieve: get /v1/responses/{response_id}
+      list:
+        type: http
+        endpoint: get /v1/responses
+      delete:
+        type: http
+        endpoint: delete /v1/responses/{response_id}
+    subresources:
+      input_items:
+        methods:
+          list:
+            type: http
+            endpoint: get /v1/responses/{response_id}/input_items
+
+  prompts:
+    models:
+      prompt: Prompt
+      list_prompts_response: ListPromptsResponse
+    methods:
+      create: post /v1/prompts
+      list:
+        endpoint: get /v1/prompts
+        paginated: false
+      retrieve: get /v1/prompts/{prompt_id}
+      update: post /v1/prompts/{prompt_id}
+      delete: delete /v1/prompts/{prompt_id}
+      set_default_version: post /v1/prompts/{prompt_id}/set-default-version
+    subresources:
+      versions:
+        methods:
+          list:
+            endpoint: get /v1/prompts/{prompt_id}/versions
+            paginated: false
+
+  conversations:
+    models:
+      conversation_object: Conversation
+    methods:
+      create:
+        type: http
+        endpoint: post /v1/conversations
+      retrieve: get /v1/conversations/{conversation_id}
+      update:
+        type: http
+        endpoint: post /v1/conversations/{conversation_id}
+      delete:
+        type: http
+        endpoint: delete /v1/conversations/{conversation_id}
+    subresources:
+      items:
+        methods:
+          get:
+            type: http
+            endpoint: get /v1/conversations/{conversation_id}/items/{item_id}
+          list:
+            type: http
+            endpoint: get /v1/conversations/{conversation_id}/items
+          create:
+            type: http
+            endpoint: post /v1/conversations/{conversation_id}/items
+
+  inspect:
+    models:
+      healthInfo: HealthInfo
+      providerInfo: ProviderInfo
+      routeInfo: RouteInfo
+      versionInfo: VersionInfo
+    methods:
+      health: get /v1/health
+      version: get /v1/version
+
+  embeddings:
+    models:
+      create_embeddings_response: OpenAIEmbeddingsResponse
+    methods:
+      create: post /v1/embeddings
+
+  chat:
+    models:
+      chat_completion_chunk: OpenAIChatCompletionChunk
+    subresources:
+      completions:
+        methods:
+          create:
+            type: http
+            endpoint: post /v1/chat/completions
+            streaming:
+              stream_event_model: chat.chat_completion_chunk
+              param_discriminator: stream
+          list:
+            type: http
+            endpoint: get /v1/chat/completions
+          retrieve:
+            type: http
+            endpoint: get /v1/chat/completions/{completion_id}
+  completions:
+    methods:
+      create:
+        type: http
+        endpoint: post /v1/completions
+        streaming:
+          param_discriminator: stream
+
+  vector_io:
+    models:
+      queryChunksResponse: QueryChunksResponse
+    methods:
+      insert: post /v1/vector-io/insert
+      query: post /v1/vector-io/query
+
+  vector_stores:
+    models:
+      vector_store: VectorStoreObject
+      list_vector_stores_response: VectorStoreListResponse
+      vector_store_delete_response: VectorStoreDeleteResponse
+      vector_store_search_response: VectorStoreSearchResponsePage
+    methods:
+      create: post /v1/vector_stores
+      list:
+        endpoint: get /v1/vector_stores
+      retrieve: get /v1/vector_stores/{vector_store_id}
+      update: post /v1/vector_stores/{vector_store_id}
+      delete: delete /v1/vector_stores/{vector_store_id}
+      search: post /v1/vector_stores/{vector_store_id}/search
+    subresources:
+      files:
+        models:
+          vector_store_file: VectorStoreFileObject
+        methods:
+          list: get /v1/vector_stores/{vector_store_id}/files
+          retrieve: get /v1/vector_stores/{vector_store_id}/files/{file_id}
+          update: post /v1/vector_stores/{vector_store_id}/files/{file_id}
+          delete: delete /v1/vector_stores/{vector_store_id}/files/{file_id}
+          create: post /v1/vector_stores/{vector_store_id}/files
+          content: get /v1/vector_stores/{vector_store_id}/files/{file_id}/content
+      file_batches:
+        models:
+          vector_store_file_batches: VectorStoreFileBatchObject
+          list_vector_store_files_in_batch_response: VectorStoreFilesListInBatchResponse
+        methods:
+          create: post /v1/vector_stores/{vector_store_id}/file_batches
+          retrieve: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}
+          list_files: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files
+          cancel: post /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel
+
+  models:
+    models:
+      model: OpenAIModel
+      list_models_response: OpenAIListModelsResponse
+    methods:
+      list:
+        endpoint: get /v1/models
+        paginated: false
+      retrieve: get /v1/models/{model_id}
+      register: post /v1/models
+      unregister: delete /v1/models/{model_id}
+    subresources:
+      openai:
+        methods:
+          list:
+            endpoint: get /v1/models
+            paginated: false
+
+  providers:
+    models:
+      list_providers_response: ListProvidersResponse
+    methods:
+      list:
+        endpoint: get /v1/providers
+        paginated: false
+      retrieve: get /v1/providers/{provider_id}
+
+  routes:
+    models:
+      list_routes_response: ListRoutesResponse
+    methods:
+      list:
+        endpoint: get /v1/inspect/routes
+        paginated: false
+
+  moderations:
+    models:
+      create_response: ModerationObject
+    methods:
+      create: post /v1/moderations
+
+  safety:
+    models:
+      run_shield_response: RunShieldResponse
+    methods:
+      run_shield: post /v1/safety/run-shield
+
+  shields:
+    models:
+      shield: Shield
+      list_shields_response: ListShieldsResponse
+    methods:
+      retrieve: get /v1/shields/{identifier}
+      list:
+        endpoint: get /v1/shields
+        paginated: false
+      register: post /v1/shields
+      delete: delete /v1/shields/{identifier}
+
+  scoring:
+    methods:
+      score: post /v1/scoring/score
+      score_batch: post /v1/scoring/score-batch
+  scoring_functions:
+    methods:
+      retrieve: get /v1/scoring-functions/{scoring_fn_id}
+      list:
+        endpoint: get /v1/scoring-functions
+        paginated: false
+      register: post /v1/scoring-functions
+    models:
+      scoring_fn: ScoringFn
+      scoring_fn_params: ScoringFnParams
+      list_scoring_functions_response: ListScoringFunctionsResponse
+
+  files:
+    methods:
+      create: post /v1/files
+      list: get /v1/files
+      retrieve: get /v1/files/{file_id}
+      delete: delete /v1/files/{file_id}
+      content: get /v1/files/{file_id}/content
+    models:
+      file: OpenAIFileObject
+      list_files_response: ListOpenAIFileResponse
+      delete_file_response: OpenAIFileDeleteResponse
+
+  alpha:
+    subresources:
+      inference:
+        methods:
+          rerank: post /v1alpha/inference/rerank
+
+      post_training:
+        models:
+          algorithm_config: AlgorithmConfig
+          post_training_job: PostTrainingJob
+          list_post_training_jobs_response: ListPostTrainingJobsResponse
+        methods:
+          preference_optimize: post /v1alpha/post-training/preference-optimize
+          supervised_fine_tune: post /v1alpha/post-training/supervised-fine-tune
+        subresources:
+          job:
+            methods:
+              artifacts: get /v1alpha/post-training/job/artifacts
+              cancel: post /v1alpha/post-training/job/cancel
+              status: get /v1alpha/post-training/job/status
+              list:
+                endpoint: get /v1alpha/post-training/jobs
+                paginated: false
+
+      benchmarks:
+        methods:
+          retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}
+          list:
+            endpoint: get /v1alpha/eval/benchmarks
+            paginated: false
+          register: post /v1alpha/eval/benchmarks
+        models:
+          benchmark: Benchmark
+          list_benchmarks_response: ListBenchmarksResponse
+
+      eval:
+        methods:
+          evaluate_rows: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
+          run_eval: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
+          evaluate_rows_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
+          run_eval_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
+
+        subresources:
+          jobs:
+            methods:
+              cancel: delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
+              status: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
+              retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result
+        models:
+          evaluate_response: EvaluateResponse
+          benchmark_config: BenchmarkConfig
+          job: Job
+
+  beta:
+    subresources:
+      datasets:
+        models:
+          list_datasets_response: ListDatasetsResponse
+        methods:
+          register: post /v1beta/datasets
+          retrieve: get /v1beta/datasets/{dataset_id}
+          list:
+            endpoint: get /v1beta/datasets
+            paginated: false
+          unregister: delete /v1beta/datasets/{dataset_id}
+          iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
+          appendrows: post /v1beta/datasetio/append-rows/{dataset_id}
+
+settings:
+  license: MIT
+  unwrap_response_fields: [data]
+
+openapi:
+  transformations:
+    - command: mergeObject
+      reason: Better return_type using enum
+      args:
+        target:
+          - "$.components.schemas"
+        object:
+          ReturnType:
+            additionalProperties: false
+            properties:
+              type:
+                enum:
+                  - string
+                  - number
+                  - boolean
+                  - array
+                  - object
+                  - json
+                  - union
+                  - chat_completion_input
+                  - completion_input
+                  - agent_turn_input
+            required:
+              - type
+            type: object
+    - command: replaceProperties
+      reason: Replace return type properties with better model (see above)
+      args:
+        filter:
+          only:
+            - "$.components.schemas.ScoringFn.properties.return_type"
+            - "$.components.schemas.RegisterScoringFunctionRequest.properties.return_type"
+        value:
+          $ref: "#/components/schemas/ReturnType"
+    - command: oneOfToAnyOf
+      reason: Prism (mock server) doesn't like one of our requests as it technically matches multiple variants
+
+# `readme` is used to configure the code snippets that will be rendered in the
+# README.md of various SDKs. In particular, you can change the `headline`
+# snippet's endpoint and the arguments to call it with.
+readme:
+  example_requests:
+    default:
+      type: request
+      endpoint: post /v1/chat/completions
+      params: &ref_0 {}
+    headline:
+      type: request
+      endpoint: post /v1/models
+      params: *ref_0
+    pagination:
+      type: request
+      endpoint: post /v1/chat/completions
+      params: {}
--- a/client-sdks/stainless/openapi.yml
+++ b/client-sdks/stainless/openapi.yml
@ -2055,69 +2055,6 @@ paths:
          schema:
            $ref: '#/components/schemas/URL'
      deprecated: false
-  /v1/tool-runtime/rag-tool/insert:
-    post:
-      responses:
-        '200':
-          description: OK
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - ToolRuntime
-      summary: >-
-        Index documents so they can be used by the RAG system.
-      description: >-
-        Index documents so they can be used by the RAG system.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/InsertRequest'
-        required: true
-      deprecated: false
-  /v1/tool-runtime/rag-tool/query:
-    post:
-      responses:
-        '200':
-          description: >-
-            RAGQueryResult containing the retrieved content and metadata
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/RAGQueryResult'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - ToolRuntime
-      summary: >-
-        Query the RAG system for context; typically invoked by the agent.
-      description: >-
-        Query the RAG system for context; typically invoked by the agent.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/QueryRequest'
-        required: true
-      deprecated: false
  /v1/toolgroups:
    get:
      responses:
@ -6854,6 +6791,8 @@ components:
              const: web_search_preview
            - type: string
              const: web_search_preview_2025_03_11
+            - type: string
+              const: web_search_2025_08_26
          default: web_search
          description: Web search tool type variant to use
        search_context_size:
@ -9633,274 +9572,6 @@ components:
      title: ListToolDefsResponse
      description: >-
        Response containing a list of tool definitions.
-    RAGDocument:
-      type: object
-      properties:
-        document_id:
-          type: string
-          description: The unique identifier for the document.
-        content:
-          oneOf:
-            - type: string
-            - $ref: '#/components/schemas/InterleavedContentItem'
-            - type: array
-              items:
-                $ref: '#/components/schemas/InterleavedContentItem'
-            - $ref: '#/components/schemas/URL'
-          description: The content of the document.
-        mime_type:
-          type: string
-          description: The MIME type of the document.
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: Additional metadata for the document.
-      additionalProperties: false
-      required:
-        - document_id
-        - content
-        - metadata
-      title: RAGDocument
-      description: >-
-        A document to be used for document ingestion in the RAG Tool.
-    InsertRequest:
-      type: object
-      properties:
-        documents:
-          type: array
-          items:
-            $ref: '#/components/schemas/RAGDocument'
-          description: >-
-            List of documents to index in the RAG system
-        vector_store_id:
-          type: string
-          description: >-
-            ID of the vector database to store the document embeddings
-        chunk_size_in_tokens:
-          type: integer
-          description: >-
-            (Optional) Size in tokens for document chunking during indexing
-      additionalProperties: false
-      required:
-        - documents
-        - vector_store_id
-        - chunk_size_in_tokens
-      title: InsertRequest
-    DefaultRAGQueryGeneratorConfig:
-      type: object
-      properties:
-        type:
-          type: string
-          const: default
-          default: default
-          description: >-
-            Type of query generator, always 'default'
-        separator:
-          type: string
-          default: ' '
-          description: >-
-            String separator used to join query terms
-      additionalProperties: false
-      required:
-        - type
-        - separator
-      title: DefaultRAGQueryGeneratorConfig
-      description: >-
-        Configuration for the default RAG query generator.
-    LLMRAGQueryGeneratorConfig:
-      type: object
-      properties:
-        type:
-          type: string
-          const: llm
-          default: llm
-          description: Type of query generator, always 'llm'
-        model:
-          type: string
-          description: >-
-            Name of the language model to use for query generation
-        template:
-          type: string
-          description: >-
-            Template string for formatting the query generation prompt
-      additionalProperties: false
-      required:
-        - type
-        - model
-        - template
-      title: LLMRAGQueryGeneratorConfig
-      description: >-
-        Configuration for the LLM-based RAG query generator.
-    RAGQueryConfig:
-      type: object
-      properties:
-        query_generator_config:
-          oneOf:
-            - $ref: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
-            - $ref: '#/components/schemas/LLMRAGQueryGeneratorConfig'
-          discriminator:
-            propertyName: type
-            mapping:
-              default: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
-              llm: '#/components/schemas/LLMRAGQueryGeneratorConfig'
-          description: Configuration for the query generator.
-        max_tokens_in_context:
-          type: integer
-          default: 4096
-          description: Maximum number of tokens in the context.
-        max_chunks:
-          type: integer
-          default: 5
-          description: Maximum number of chunks to retrieve.
-        chunk_template:
-          type: string
-          default: >
-            Result {index}
-
-            Content: {chunk.content}
-
-            Metadata: {metadata}
-          description: >-
-            Template for formatting each retrieved chunk in the context. Available
-            placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk
-            content string), {metadata} (chunk metadata dict). Default: "Result {index}\nContent:
-            {chunk.content}\nMetadata: {metadata}\n"
-        mode:
-          $ref: '#/components/schemas/RAGSearchMode'
-          default: vector
-          description: >-
-            Search mode for retrieval—either "vector", "keyword", or "hybrid". Default
-            "vector".
-        ranker:
-          $ref: '#/components/schemas/Ranker'
-          description: >-
-            Configuration for the ranker to use in hybrid search. Defaults to RRF
-            ranker.
-      additionalProperties: false
-      required:
-        - query_generator_config
-        - max_tokens_in_context
-        - max_chunks
-        - chunk_template
-      title: RAGQueryConfig
-      description: >-
-        Configuration for the RAG query generation.
-    RAGSearchMode:
-      type: string
-      enum:
-        - vector
-        - keyword
-        - hybrid
-      title: RAGSearchMode
-      description: >-
-        Search modes for RAG query retrieval: - VECTOR: Uses vector similarity search
-        for semantic matching - KEYWORD: Uses keyword-based search for exact matching
-        - HYBRID: Combines both vector and keyword search for better results
-    RRFRanker:
-      type: object
-      properties:
-        type:
-          type: string
-          const: rrf
-          default: rrf
-          description: The type of ranker, always "rrf"
-        impact_factor:
-          type: number
-          default: 60.0
-          description: >-
-            The impact factor for RRF scoring. Higher values give more weight to higher-ranked
-            results. Must be greater than 0
-      additionalProperties: false
-      required:
-        - type
-        - impact_factor
-      title: RRFRanker
-      description: >-
-        Reciprocal Rank Fusion (RRF) ranker configuration.
-    Ranker:
-      oneOf:
-        - $ref: '#/components/schemas/RRFRanker'
-        - $ref: '#/components/schemas/WeightedRanker'
-      discriminator:
-        propertyName: type
-        mapping:
-          rrf: '#/components/schemas/RRFRanker'
-          weighted: '#/components/schemas/WeightedRanker'
-    WeightedRanker:
-      type: object
-      properties:
-        type:
-          type: string
-          const: weighted
-          default: weighted
-          description: The type of ranker, always "weighted"
-        alpha:
-          type: number
-          default: 0.5
-          description: >-
-            Weight factor between 0 and 1. 0 means only use keyword scores, 1 means
-            only use vector scores, values in between blend both scores.
-      additionalProperties: false
-      required:
-        - type
-        - alpha
-      title: WeightedRanker
-      description: >-
-        Weighted ranker configuration that combines vector and keyword scores.
-    QueryRequest:
-      type: object
-      properties:
-        content:
-          $ref: '#/components/schemas/InterleavedContent'
-          description: >-
-            The query content to search for in the indexed documents
-        vector_store_ids:
-          type: array
-          items:
-            type: string
-          description: >-
-            List of vector database IDs to search within
-        query_config:
-          $ref: '#/components/schemas/RAGQueryConfig'
-          description: >-
-            (Optional) Configuration parameters for the query operation
-      additionalProperties: false
-      required:
-        - content
-        - vector_store_ids
-      title: QueryRequest
-    RAGQueryResult:
-      type: object
-      properties:
-        content:
-          $ref: '#/components/schemas/InterleavedContent'
-          description: >-
-            (Optional) The retrieved content from the query
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: >-
-            Additional metadata about the query result
-      additionalProperties: false
-      required:
-        - metadata
-      title: RAGQueryResult
-      description: >-
-        Result of a RAG query containing retrieved content and metadata.
    ToolGroup:
      type: object
      properties:
@ -10307,6 +9978,70 @@ components:
        - metadata
      title: VectorStoreObject
      description: OpenAI Vector Store object.
+    VectorStoreChunkingStrategy:
+      oneOf:
+        - $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
+        - $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
+      discriminator:
+        propertyName: type
+        mapping:
+          auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
+          static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
+    VectorStoreChunkingStrategyAuto:
+      type: object
+      properties:
+        type:
+          type: string
+          const: auto
+          default: auto
+          description: >-
+            Strategy type, always "auto" for automatic chunking
+      additionalProperties: false
+      required:
+        - type
+      title: VectorStoreChunkingStrategyAuto
+      description: >-
+        Automatic chunking strategy for vector store files.
+    VectorStoreChunkingStrategyStatic:
+      type: object
+      properties:
+        type:
+          type: string
+          const: static
+          default: static
+          description: >-
+            Strategy type, always "static" for static chunking
+        static:
+          $ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
+          description: >-
+            Configuration parameters for the static chunking strategy
+      additionalProperties: false
+      required:
+        - type
+        - static
+      title: VectorStoreChunkingStrategyStatic
+      description: >-
+        Static chunking strategy with configurable parameters.
+    VectorStoreChunkingStrategyStaticConfig:
+      type: object
+      properties:
+        chunk_overlap_tokens:
+          type: integer
+          default: 400
+          description: >-
+            Number of tokens to overlap between adjacent chunks
+        max_chunk_size_tokens:
+          type: integer
+          default: 800
+          description: >-
+            Maximum number of tokens per chunk, must be between 100 and 4096
+      additionalProperties: false
+      required:
+        - chunk_overlap_tokens
+        - max_chunk_size_tokens
+      title: VectorStoreChunkingStrategyStaticConfig
+      description: >-
+        Configuration for static chunking strategy.
    "OpenAICreateVectorStoreRequestWithExtraBody":
      type: object
      properties:
@ -10332,15 +10067,7 @@ components:
          description: >-
            (Optional) Expiration policy for the vector store
        chunking_strategy:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
+          $ref: '#/components/schemas/VectorStoreChunkingStrategy'
          description: >-
            (Optional) Strategy for splitting files into chunks
        metadata:
@ -10416,70 +10143,6 @@ components:
        - deleted
      title: VectorStoreDeleteResponse
      description: Response from deleting a vector store.
-    VectorStoreChunkingStrategy:
-      oneOf:
-        - $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
-        - $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
-      discriminator:
-        propertyName: type
-        mapping:
-          auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
-          static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
-    VectorStoreChunkingStrategyAuto:
-      type: object
-      properties:
-        type:
-          type: string
-          const: auto
-          default: auto
-          description: >-
-            Strategy type, always "auto" for automatic chunking
-      additionalProperties: false
-      required:
-        - type
-      title: VectorStoreChunkingStrategyAuto
-      description: >-
-        Automatic chunking strategy for vector store files.
-    VectorStoreChunkingStrategyStatic:
-      type: object
-      properties:
-        type:
-          type: string
-          const: static
-          default: static
-          description: >-
-            Strategy type, always "static" for static chunking
-        static:
-          $ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
-          description: >-
-            Configuration parameters for the static chunking strategy
-      additionalProperties: false
-      required:
-        - type
-        - static
-      title: VectorStoreChunkingStrategyStatic
-      description: >-
-        Static chunking strategy with configurable parameters.
-    VectorStoreChunkingStrategyStaticConfig:
-      type: object
-      properties:
-        chunk_overlap_tokens:
-          type: integer
-          default: 400
-          description: >-
-            Number of tokens to overlap between adjacent chunks
-        max_chunk_size_tokens:
-          type: integer
-          default: 800
-          description: >-
-            Maximum number of tokens per chunk, must be between 100 and 4096
-      additionalProperties: false
-      required:
-        - chunk_overlap_tokens
-        - max_chunk_size_tokens
-      title: VectorStoreChunkingStrategyStaticConfig
-      description: >-
-        Configuration for static chunking strategy.
    "OpenAICreateVectorStoreFileBatchRequestWithExtraBody":
      type: object
      properties:
@ -10937,7 +10600,9 @@ components:
          description: >-
            Object type identifier for the search results page
        search_query:
-          type: string
+          type: array
+          items:
+            type: string
          description: >-
            The original search query that was executed
        data:
--- a/containers/Containerfile
+++ b/containers/Containerfile
@ -47,7 +47,7 @@ RUN set -eux; \
        exit 1; \
    fi

-RUN pip install --no-cache-dir uv
+RUN pip install --no-cache uv
 ENV UV_SYSTEM_PYTHON=1

 ENV INSTALL_MODE=${INSTALL_MODE}
@ -72,7 +72,7 @@ RUN set -eux; \
            echo "LLAMA_STACK_CLIENT_DIR is set but $LLAMA_STACK_CLIENT_DIR does not exist" >&2; \
            exit 1; \
        fi; \
-        uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"; \
+        uv pip install --no-cache -e "$LLAMA_STACK_CLIENT_DIR"; \
    fi;

 # Install llama-stack
@ -88,22 +88,22 @@ RUN set -eux; \
        fi; \
        if [ -n "$SAVED_UV_EXTRA_INDEX_URL" ] && [ -n "$SAVED_UV_INDEX_STRATEGY" ]; then \
            UV_EXTRA_INDEX_URL="$SAVED_UV_EXTRA_INDEX_URL" UV_INDEX_STRATEGY="$SAVED_UV_INDEX_STRATEGY" \
-                uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"; \
+                uv pip install --no-cache -e "$LLAMA_STACK_DIR"; \
        else \
-            uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"; \
+            uv pip install --no-cache -e "$LLAMA_STACK_DIR"; \
        fi; \
    elif [ "$INSTALL_MODE" = "test-pypi" ]; then \
-        uv pip install --no-cache-dir fastapi libcst; \
+        uv pip install --no-cache fastapi libcst; \
        if [ -n "$TEST_PYPI_VERSION" ]; then \
-            uv pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match "llama-stack==$TEST_PYPI_VERSION"; \
+            uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match "llama-stack==$TEST_PYPI_VERSION"; \
        else \
-            uv pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match llama-stack; \
+            uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match llama-stack; \
        fi; \
    else \
        if [ -n "$PYPI_VERSION" ]; then \
-            uv pip install --no-cache-dir "llama-stack==$PYPI_VERSION"; \
+            uv pip install --no-cache "llama-stack==$PYPI_VERSION"; \
        else \
-            uv pip install --no-cache-dir llama-stack; \
+            uv pip install --no-cache llama-stack; \
        fi; \
    fi;

@ -117,7 +117,7 @@ RUN set -eux; \
    fi; \
    deps="$(llama stack list-deps "$DISTRO_NAME")"; \
    if [ -n "$deps" ]; then \
-        printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache-dir; \
+        printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache; \
    fi

 # Cleanup
--- a/docs/docs/building_applications/index.mdx
+++ b/docs/docs/building_applications/index.mdx
@ -35,9 +35,6 @@ Here are the key topics that will help you build effective AI applications:
 - **[Telemetry](./telemetry.mdx)** - Monitor and analyze your agents' performance and behavior
 - **[Safety](./safety.mdx)** - Implement guardrails and safety measures to ensure responsible AI behavior

-### 🎮 **Interactive Development**
- **[Playground](./playground.mdx)** - Interactive environment for testing and developing applications
-
 ## Application Patterns

 ### 🤖 **Conversational Agents**
--- a/docs/docs/building_applications/playground.mdx
+++ b/docs/docs/building_applications/playground.mdx
@ -1,298 +0,0 @@
---
-title: Llama Stack Playground
-description: Interactive interface to explore and experiment with Llama Stack capabilities
-sidebar_label: Playground
-sidebar_position: 10
---
-
-import Tabs from '@theme/Tabs';
-import TabItem from '@theme/TabItem';
-
-# Llama Stack Playground
-
-:::note[Experimental Feature]
-The Llama Stack Playground is currently experimental and subject to change. We welcome feedback and contributions to help improve it.
-:::
-
-The Llama Stack Playground is a simple interface that aims to:
- **Showcase capabilities and concepts** of Llama Stack in an interactive environment
- **Demo end-to-end application code** to help users get started building their own applications
- **Provide a UI** to help users inspect and understand Llama Stack API providers and resources
-
-## Key Features
-
-### Interactive Playground Pages
-
-The playground provides interactive pages for users to explore Llama Stack API capabilities:
-
-#### Chatbot Interface
-
-<video
-  controls
-  autoPlay
-  playsInline
-  muted
-  loop
-  style={{width: '100%'}}
->
-  <source src="https://github.com/user-attachments/assets/8d2ef802-5812-4a28-96e1-316038c84cbf" type="video/mp4" />
-  Your browser does not support the video tag.
-</video>
-
-<Tabs>
-<TabItem value="chat" label="Chat">
-
-**Simple Chat Interface**
- Chat directly with Llama models through an intuitive interface
- Uses the `/chat/completions` streaming API under the hood
- Real-time message streaming for responsive interactions
- Perfect for testing model capabilities and prompt engineering
-
-</TabItem>
-<TabItem value="rag" label="RAG Chat">
-
-**Document-Aware Conversations**
- Upload documents to create memory banks
- Chat with a RAG-enabled agent that can query your documents
- Uses Llama Stack's `/agents` API to create and manage RAG sessions
- Ideal for exploring knowledge-enhanced AI applications
-
-</TabItem>
-</Tabs>
-
-#### Evaluation Interface
-
-<video
-  controls
-  autoPlay
-  playsInline
-  muted
-  loop
-  style={{width: '100%'}}
->
-  <source src="https://github.com/user-attachments/assets/6cc1659f-eba4-49ca-a0a5-7c243557b4f5" type="video/mp4" />
-  Your browser does not support the video tag.
-</video>
-
-<Tabs>
-<TabItem value="scoring" label="Scoring Evaluations">
-
-**Custom Dataset Evaluation**
- Upload your own evaluation datasets
- Run evaluations using available scoring functions
- Uses Llama Stack's `/scoring` API for flexible evaluation workflows
- Great for testing application performance on custom metrics
-
-</TabItem>
-<TabItem value="benchmarks" label="Benchmark Evaluations">
-
-<video
-  controls
-  autoPlay
-  playsInline
-  muted
-  loop
-  style={{width: '100%', marginBottom: '1rem'}}
->
-  <source src="https://github.com/user-attachments/assets/345845c7-2a2b-4095-960a-9ae40f6a93cf" type="video/mp4" />
-  Your browser does not support the video tag.
-</video>
-
-**Pre-registered Evaluation Tasks**
- Evaluate models or agents on pre-defined tasks
- Uses Llama Stack's `/eval` API for comprehensive evaluation
- Combines datasets and scoring functions for standardized testing
-
-**Setup Requirements:**
-Register evaluation datasets and benchmarks first:
-
-```bash
-# Register evaluation dataset
-llama-stack-client datasets register \
-  --dataset-id "mmlu" \
-  --provider-id "huggingface" \
-  --url "https://huggingface.co/datasets/llamastack/evals" \
-  --metadata '{"path": "llamastack/evals", "name": "evals__mmlu__details", "split": "train"}' \
-  --schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string"}, "chat_completion_input": {"type": "string"}}'
-
-# Register benchmark task
-llama-stack-client benchmarks register \
-  --eval-task-id meta-reference-mmlu \
-  --provider-id meta-reference \
-  --dataset-id mmlu \
-  --scoring-functions basic::regex_parser_multiple_choice_answer
-```
-
-</TabItem>
-</Tabs>
-
-#### Inspection Interface
-
-<video
-  controls
-  autoPlay
-  playsInline
-  muted
-  loop
-  style={{width: '100%'}}
->
-  <source src="https://github.com/user-attachments/assets/01d52b2d-92af-4e3a-b623-a9b8ba22ba99" type="video/mp4" />
-  Your browser does not support the video tag.
-</video>
-
-<Tabs>
-<TabItem value="providers" label="API Providers">
-
-**Provider Management**
- Inspect available Llama Stack API providers
- View provider configurations and capabilities
- Uses the `/providers` API for real-time provider information
- Essential for understanding your deployment's capabilities
-
-</TabItem>
-<TabItem value="resources" label="API Resources">
-
-**Resource Exploration**
- Inspect Llama Stack API resources including:
-  - **Models**: Available language models
-  - **Datasets**: Registered evaluation datasets
-  - **Memory Banks**: Vector databases and knowledge stores
-  - **Benchmarks**: Evaluation tasks and scoring functions
-  - **Shields**: Safety and content moderation tools
- Uses `/<resources>/list` APIs for comprehensive resource visibility
- For detailed information about resources, see [Core Concepts](/docs/concepts)
-
-</TabItem>
-</Tabs>
-
-## Getting Started
-
-### Quick Start Guide
-
-<Tabs>
-<TabItem value="setup" label="Setup">
-
-**1. Start the Llama Stack API Server**
-
-```bash
-llama stack list-deps together | xargs -L1 uv pip install
-llama stack run together
-```
-
-**2. Start the Streamlit UI**
-
-```bash
-# Launch the playground interface
-uv run --with ".[ui]" streamlit run llama_stack.core/ui/app.py
-```
-
-</TabItem>
-<TabItem value="usage" label="Usage Tips">
-
-**Making the Most of the Playground:**
-
- **Start with Chat**: Test basic model interactions and prompt engineering
- **Explore RAG**: Upload sample documents to see knowledge-enhanced responses
- **Try Evaluations**: Use the scoring interface to understand evaluation metrics
- **Inspect Resources**: Check what providers and resources are available
- **Experiment with Settings**: Adjust parameters to see how they affect results
-
-</TabItem>
-</Tabs>
-
-### Available Distributions
-
-The playground works with any Llama Stack distribution. Popular options include:
-
-<Tabs>
-<TabItem value="together" label="Together AI">
-
-```bash
-llama stack list-deps together | xargs -L1 uv pip install
-llama stack run together
-```
-
-**Features:**
- Cloud-hosted models
- Fast inference
- Multiple model options
-
-</TabItem>
-<TabItem value="ollama" label="Ollama (Local)">
-
-```bash
-llama stack list-deps ollama | xargs -L1 uv pip install
-llama stack run ollama
-```
-
-**Features:**
- Local model execution
- Privacy-focused
- No internet required
-
-</TabItem>
-<TabItem value="meta-reference" label="Meta Reference">
-
-```bash
-llama stack list-deps meta-reference | xargs -L1 uv pip install
-llama stack run meta-reference
-```
-
-**Features:**
- Reference implementation
- All API features available
- Best for development
-
-</TabItem>
-</Tabs>
-
-## Use Cases & Examples
-
-### Educational Use Cases
- **Learning Llama Stack**: Hands-on exploration of API capabilities
- **Prompt Engineering**: Interactive testing of different prompting strategies
- **RAG Experimentation**: Understanding how document retrieval affects responses
- **Evaluation Understanding**: See how different metrics evaluate model performance
-
-### Development Use Cases
- **Prototype Testing**: Quick validation of application concepts
- **API Exploration**: Understanding available endpoints and parameters
- **Integration Planning**: Seeing how different components work together
- **Demo Creation**: Showcasing Llama Stack capabilities to stakeholders
-
-### Research Use Cases
- **Model Comparison**: Side-by-side testing of different models
- **Evaluation Design**: Understanding how scoring functions work
- **Safety Testing**: Exploring shield effectiveness with different inputs
- **Performance Analysis**: Measuring model behavior across different scenarios
-
-## Best Practices
-
-### 🚀 **Getting Started**
- Begin with simple chat interactions to understand basic functionality
- Gradually explore more advanced features like RAG and evaluations
- Use the inspection tools to understand your deployment's capabilities
-
-### 🔧 **Development Workflow**
- Use the playground to prototype before writing application code
- Test different parameter settings interactively
- Validate evaluation approaches before implementing them programmatically
-
-### 📊 **Evaluation & Testing**
- Start with simple scoring functions before trying complex evaluations
- Use the playground to understand evaluation results before automation
- Test safety features with various input types
-
-### 🎯 **Production Preparation**
- Use playground insights to inform your production API usage
- Test edge cases and error conditions interactively
- Validate resource configurations before deployment
-
-## Related Resources
-
- **[Getting Started Guide](../getting_started/quickstart)** - Complete setup and introduction
- **[Core Concepts](/docs/concepts)** - Understanding Llama Stack fundamentals
- **[Agents](./agent)** - Building intelligent agents
- **[RAG (Retrieval Augmented Generation)](./rag)** - Knowledge-enhanced applications
- **[Evaluations](./evals)** - Comprehensive evaluation framework
- **[API Reference](/docs/api/llama-stack-specification)** - Complete API documentation
--- a/docs/docs/distributions/importing_as_library.mdx
+++ b/docs/docs/distributions/importing_as_library.mdx
@ -11,7 +11,7 @@ If you are planning to use an external service for Inference (even Ollama or TGI
 This avoids the overhead of setting up a server.
 ```bash
 # setup
-uv pip install llama-stack
+uv pip install llama-stack llama-stack-client
 llama stack list-deps starter | xargs -L1 uv pip install
 ```

--- a/docs/docs/distributions/k8s/ui-k8s.yaml.template
+++ b/docs/docs/distributions/k8s/ui-k8s.yaml.template
@ -44,7 +44,7 @@ spec:

            # Navigate to the UI directory
            echo "Navigating to UI directory..."
-            cd /app/llama_stack/ui
+            cd /app/llama_stack_ui

            # Check if package.json exists
            if [ ! -f "package.json" ]; then
--- a/docs/docs/distributions/self_hosted_distro/starter.md
+++ b/docs/docs/distributions/self_hosted_distro/starter.md
@ -163,7 +163,41 @@ docker run \
  --port $LLAMA_STACK_PORT
 ```

-### Via venv
+The container will run the distribution with a SQLite store by default. This store is used for the following components:
+
+- Metadata store: store metadata about the models, providers, etc.
+- Inference store: collection of responses from the inference provider
+- Agents store: store agent configurations (sessions, turns, etc.)
+- Agents Responses store: store responses from the agents
+
+However, you can use PostgreSQL instead by running the `starter::run-with-postgres-store.yaml` configuration:
+
+```bash
+docker run \
+  -it \
+  --pull always \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -e OPENAI_API_KEY=your_openai_key \
+  -e FIREWORKS_API_KEY=your_fireworks_key \
+  -e TOGETHER_API_KEY=your_together_key \
+  -e POSTGRES_HOST=your_postgres_host \
+  -e POSTGRES_PORT=your_postgres_port \
+  -e POSTGRES_DB=your_postgres_db \
+  -e POSTGRES_USER=your_postgres_user \
+  -e POSTGRES_PASSWORD=your_postgres_password \
+  llamastack/distribution-starter \
+  starter::run-with-postgres-store.yaml
+```
+
+Postgres environment variables:
+
+- `POSTGRES_HOST`: Postgres host (default: `localhost`)
+- `POSTGRES_PORT`: Postgres port (default: `5432`)
+- `POSTGRES_DB`: Postgres database name (default: `llamastack`)
+- `POSTGRES_USER`: Postgres username (default: `llamastack`)
+- `POSTGRES_PASSWORD`: Postgres password (default: `llamastack`)
+
+### Via Conda or venv

 Ensure you have configured the starter distribution using the environment variables explained above.

@ -171,8 +205,11 @@ Ensure you have configured the starter distribution using the environment variab
 # Install dependencies for the starter distribution
 uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install

-# Run the server
+# Run the server (with SQLite - default)
 uv run --with llama-stack llama stack run starter
+
+# Or run with PostgreSQL
+uv run --with llama-stack llama stack run starter::run-with-postgres-store.yaml
 ```

 ## Example Usage
--- a/docs/docs/providers/inference/remote_bedrock.mdx
+++ b/docs/docs/providers/inference/remote_bedrock.mdx
@ -1,5 +1,5 @@
 ---
-description: "AWS Bedrock inference provider for accessing various AI models through AWS's managed service."
+description: "AWS Bedrock inference provider using an OpenAI-compatible endpoint."
 sidebar_label: Remote - Bedrock
 title: remote::bedrock
 ---
@ -8,7 +8,7 @@ title: remote::bedrock

 ## Description

-AWS Bedrock inference provider for accessing various AI models through AWS's managed service.
+AWS Bedrock inference provider using an OpenAI-compatible endpoint.

 ## Configuration

@ -16,19 +16,12 @@ AWS Bedrock inference provider for accessing various AI models through AWS's man
 |-------|------|----------|---------|-------------|
 | `allowed_models` | `list[str \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `aws_access_key_id` | `str \| None` | No |  | The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID |
-| `aws_secret_access_key` | `str \| None` | No |  | The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY |
-| `aws_session_token` | `str \| None` | No |  | The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN |
-| `region_name` | `str \| None` | No |  | The default AWS Region to use, for example, us-west-1 or us-west-2.Default use environment variable: AWS_DEFAULT_REGION |
-| `profile_name` | `str \| None` | No |  | The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE |
-| `total_max_attempts` | `int \| None` | No |  | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS |
-| `retry_mode` | `str \| None` | No |  | A string representing the type of retries Boto3 will perform.Default use environment variable: AWS_RETRY_MODE |
-| `connect_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
-| `read_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. |
-| `session_ttl` | `int \| None` | No | 3600 | The time in seconds till a session expires. The default is 3600 seconds (1 hour). |
+| `api_key` | `pydantic.types.SecretStr \| None` | No |  | Authentication credential for the provider |
+| `region_name` | `<class 'str'>` | No | us-east-2 | AWS Region for the Bedrock Runtime endpoint |

 ## Sample Configuration

 ```yaml
-{}
+api_key: ${env.AWS_BEDROCK_API_KEY:=}
+region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
 ```
--- a/docs/docs/providers/inference/remote_passthrough.mdx
+++ b/docs/docs/providers/inference/remote_passthrough.mdx
@ -16,7 +16,7 @@ Passthrough inference provider for connecting to any external inference service
 |-------|------|----------|---------|-------------|
 | `allowed_models` | `list[str \| None` | No |  | List of models that should be registered with the model registry. If None, all models are allowed. |
 | `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
-| `api_key` | `pydantic.types.SecretStr \| None` | No |  | API Key for the passthrouth endpoint |
+| `api_key` | `pydantic.types.SecretStr \| None` | No |  | Authentication credential for the provider |
 | `url` | `<class 'str'>` | No |  | The URL for the passthrough endpoint |

 ## Sample Configuration
--- a/docs/docs/providers/openai_responses_limitations.mdx
+++ b/docs/docs/providers/openai_responses_limitations.mdx
@ -48,11 +48,9 @@ Both OpenAI and Llama Stack support a web-search built-in tool.  The [OpenAI doc

 > The type of the web search tool. One of `web_search` or `web_search_2025_08_26`.

-In contrast, the [Llama Stack documentation](https://llamastack.github.io/docs/api/create-a-new-open-ai-response) says that the allowed values for `type` for web search are `MOD1`, `MOD2` and `MOD3`.
-Is that correct?  If so, what are the meanings of each of them?  It might make sense for the allowed values for OpenAI map to some values for Llama Stack so that code written to the OpenAI specification
-also work with Llama Stack.
+Llama Stack now supports both `web_search` and `web_search_2025_08_26` types, matching OpenAI's API. For backward compatibility, Llama Stack also supports `web_search_preview` and `web_search_preview_2025_03_11` types.

-The OpenAI web search tool also has fields for `filters` and `user_location` which are not documented as options for Llama Stack.  If feasible, it would be good to support these too.
+The OpenAI web search tool also has fields for `filters` and `user_location` which are not yet implemented in Llama Stack.  If feasible, it would be good to support these too.

 ---

--- a/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb
+++ b/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb
@ -37,7 +37,7 @@
   "outputs": [],
   "source": [
    "# NBVAL_SKIP\n",
-    "!pip install -U llama-stack\n",
+    "!pip install -U llama-stack llama-stack-client\n",
    "llama stack list-deps fireworks | xargs -L1 uv pip install\n"
   ]
  },
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
@ -44,7 +44,7 @@
   "outputs": [],
   "source": [
    "# NBVAL_SKIP\n",
-    "!pip install -U llama-stack"
+    "!pip install -U llama-stack llama-stack-client\n"
   ]
  },
  {
--- a/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
+++ b/docs/notebooks/nvidia/beginner_e2e/Llama_Stack_NVIDIA_E2E_Flow.ipynb
@ -74,6 +74,7 @@
   "source": [
    "```bash\n",
    "uv sync --extra dev\n",
+    "uv pip install -U llama-stack-client\n",
    "uv pip install -e .\n",
    "source .venv/bin/activate\n",
    "```"
--- a/docs/openapi_generator/pyopenapi/operations.py
+++ b/docs/openapi_generator/pyopenapi/operations.py
@ -170,7 +170,7 @@ def _get_endpoint_functions(
        for webmethod in webmethods:
            print(f"Processing {colored(func_name, 'white')}...")
            operation_name = func_name
-            
+
            if webmethod.method == "GET":
                prefix = "get"
            elif webmethod.method == "DELETE":
@ -196,16 +196,10 @@ def _get_endpoint_functions(
 def _get_defining_class(member_fn: str, derived_cls: type) -> type:
    "Find the class in which a member function is first defined in a class inheritance hierarchy."

-    # This import must be dynamic here
-    from llama_stack.apis.tools import RAGToolRuntime, ToolRuntime
-
    # iterate in reverse member resolution order to find most specific class first
    for cls in reversed(inspect.getmro(derived_cls)):
        for name, _ in inspect.getmembers(cls, inspect.isfunction):
            if name == member_fn:
-                # HACK ALERT
-                if cls == RAGToolRuntime:
-                    return ToolRuntime
                return cls

    raise ValidationError(
--- a/docs/static/llama-stack-spec.html
+++ b/docs/static/llama-stack-spec.html
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@ -2052,69 +2052,6 @@ paths:
          schema:
            $ref: '#/components/schemas/URL'
      deprecated: false
-  /v1/tool-runtime/rag-tool/insert:
-    post:
-      responses:
-        '200':
-          description: OK
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - ToolRuntime
-      summary: >-
-        Index documents so they can be used by the RAG system.
-      description: >-
-        Index documents so they can be used by the RAG system.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/InsertRequest'
-        required: true
-      deprecated: false
-  /v1/tool-runtime/rag-tool/query:
-    post:
-      responses:
-        '200':
-          description: >-
-            RAGQueryResult containing the retrieved content and metadata
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/RAGQueryResult'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - ToolRuntime
-      summary: >-
-        Query the RAG system for context; typically invoked by the agent.
-      description: >-
-        Query the RAG system for context; typically invoked by the agent.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/QueryRequest'
-        required: true
-      deprecated: false
  /v1/toolgroups:
    get:
      responses:
@ -6138,6 +6075,8 @@ components:
              const: web_search_preview
            - type: string
              const: web_search_preview_2025_03_11
+            - type: string
+              const: web_search_2025_08_26
          default: web_search
          description: Web search tool type variant to use
        search_context_size:
@ -8917,274 +8856,6 @@ components:
      title: ListToolDefsResponse
      description: >-
        Response containing a list of tool definitions.
-    RAGDocument:
-      type: object
-      properties:
-        document_id:
-          type: string
-          description: The unique identifier for the document.
-        content:
-          oneOf:
-            - type: string
-            - $ref: '#/components/schemas/InterleavedContentItem'
-            - type: array
-              items:
-                $ref: '#/components/schemas/InterleavedContentItem'
-            - $ref: '#/components/schemas/URL'
-          description: The content of the document.
-        mime_type:
-          type: string
-          description: The MIME type of the document.
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: Additional metadata for the document.
-      additionalProperties: false
-      required:
-        - document_id
-        - content
-        - metadata
-      title: RAGDocument
-      description: >-
-        A document to be used for document ingestion in the RAG Tool.
-    InsertRequest:
-      type: object
-      properties:
-        documents:
-          type: array
-          items:
-            $ref: '#/components/schemas/RAGDocument'
-          description: >-
-            List of documents to index in the RAG system
-        vector_store_id:
-          type: string
-          description: >-
-            ID of the vector database to store the document embeddings
-        chunk_size_in_tokens:
-          type: integer
-          description: >-
-            (Optional) Size in tokens for document chunking during indexing
-      additionalProperties: false
-      required:
-        - documents
-        - vector_store_id
-        - chunk_size_in_tokens
-      title: InsertRequest
-    DefaultRAGQueryGeneratorConfig:
-      type: object
-      properties:
-        type:
-          type: string
-          const: default
-          default: default
-          description: >-
-            Type of query generator, always 'default'
-        separator:
-          type: string
-          default: ' '
-          description: >-
-            String separator used to join query terms
-      additionalProperties: false
-      required:
-        - type
-        - separator
-      title: DefaultRAGQueryGeneratorConfig
-      description: >-
-        Configuration for the default RAG query generator.
-    LLMRAGQueryGeneratorConfig:
-      type: object
-      properties:
-        type:
-          type: string
-          const: llm
-          default: llm
-          description: Type of query generator, always 'llm'
-        model:
-          type: string
-          description: >-
-            Name of the language model to use for query generation
-        template:
-          type: string
-          description: >-
-            Template string for formatting the query generation prompt
-      additionalProperties: false
-      required:
-        - type
-        - model
-        - template
-      title: LLMRAGQueryGeneratorConfig
-      description: >-
-        Configuration for the LLM-based RAG query generator.
-    RAGQueryConfig:
-      type: object
-      properties:
-        query_generator_config:
-          oneOf:
-            - $ref: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
-            - $ref: '#/components/schemas/LLMRAGQueryGeneratorConfig'
-          discriminator:
-            propertyName: type
-            mapping:
-              default: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
-              llm: '#/components/schemas/LLMRAGQueryGeneratorConfig'
-          description: Configuration for the query generator.
-        max_tokens_in_context:
-          type: integer
-          default: 4096
-          description: Maximum number of tokens in the context.
-        max_chunks:
-          type: integer
-          default: 5
-          description: Maximum number of chunks to retrieve.
-        chunk_template:
-          type: string
-          default: >
-            Result {index}
-
-            Content: {chunk.content}
-
-            Metadata: {metadata}
-          description: >-
-            Template for formatting each retrieved chunk in the context. Available
-            placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk
-            content string), {metadata} (chunk metadata dict). Default: "Result {index}\nContent:
-            {chunk.content}\nMetadata: {metadata}\n"
-        mode:
-          $ref: '#/components/schemas/RAGSearchMode'
-          default: vector
-          description: >-
-            Search mode for retrieval—either "vector", "keyword", or "hybrid". Default
-            "vector".
-        ranker:
-          $ref: '#/components/schemas/Ranker'
-          description: >-
-            Configuration for the ranker to use in hybrid search. Defaults to RRF
-            ranker.
-      additionalProperties: false
-      required:
-        - query_generator_config
-        - max_tokens_in_context
-        - max_chunks
-        - chunk_template
-      title: RAGQueryConfig
-      description: >-
-        Configuration for the RAG query generation.
-    RAGSearchMode:
-      type: string
-      enum:
-        - vector
-        - keyword
-        - hybrid
-      title: RAGSearchMode
-      description: >-
-        Search modes for RAG query retrieval: - VECTOR: Uses vector similarity search
-        for semantic matching - KEYWORD: Uses keyword-based search for exact matching
-        - HYBRID: Combines both vector and keyword search for better results
-    RRFRanker:
-      type: object
-      properties:
-        type:
-          type: string
-          const: rrf
-          default: rrf
-          description: The type of ranker, always "rrf"
-        impact_factor:
-          type: number
-          default: 60.0
-          description: >-
-            The impact factor for RRF scoring. Higher values give more weight to higher-ranked
-            results. Must be greater than 0
-      additionalProperties: false
-      required:
-        - type
-        - impact_factor
-      title: RRFRanker
-      description: >-
-        Reciprocal Rank Fusion (RRF) ranker configuration.
-    Ranker:
-      oneOf:
-        - $ref: '#/components/schemas/RRFRanker'
-        - $ref: '#/components/schemas/WeightedRanker'
-      discriminator:
-        propertyName: type
-        mapping:
-          rrf: '#/components/schemas/RRFRanker'
-          weighted: '#/components/schemas/WeightedRanker'
-    WeightedRanker:
-      type: object
-      properties:
-        type:
-          type: string
-          const: weighted
-          default: weighted
-          description: The type of ranker, always "weighted"
-        alpha:
-          type: number
-          default: 0.5
-          description: >-
-            Weight factor between 0 and 1. 0 means only use keyword scores, 1 means
-            only use vector scores, values in between blend both scores.
-      additionalProperties: false
-      required:
-        - type
-        - alpha
-      title: WeightedRanker
-      description: >-
-        Weighted ranker configuration that combines vector and keyword scores.
-    QueryRequest:
-      type: object
-      properties:
-        content:
-          $ref: '#/components/schemas/InterleavedContent'
-          description: >-
-            The query content to search for in the indexed documents
-        vector_store_ids:
-          type: array
-          items:
-            type: string
-          description: >-
-            List of vector database IDs to search within
-        query_config:
-          $ref: '#/components/schemas/RAGQueryConfig'
-          description: >-
-            (Optional) Configuration parameters for the query operation
-      additionalProperties: false
-      required:
-        - content
-        - vector_store_ids
-      title: QueryRequest
-    RAGQueryResult:
-      type: object
-      properties:
-        content:
-          $ref: '#/components/schemas/InterleavedContent'
-          description: >-
-            (Optional) The retrieved content from the query
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: >-
-            Additional metadata about the query result
-      additionalProperties: false
-      required:
-        - metadata
-      title: RAGQueryResult
-      description: >-
-        Result of a RAG query containing retrieved content and metadata.
    ToolGroup:
      type: object
      properties:
@ -9591,6 +9262,70 @@ components:
        - metadata
      title: VectorStoreObject
      description: OpenAI Vector Store object.
+    VectorStoreChunkingStrategy:
+      oneOf:
+        - $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
+        - $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
+      discriminator:
+        propertyName: type
+        mapping:
+          auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
+          static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
+    VectorStoreChunkingStrategyAuto:
+      type: object
+      properties:
+        type:
+          type: string
+          const: auto
+          default: auto
+          description: >-
+            Strategy type, always "auto" for automatic chunking
+      additionalProperties: false
+      required:
+        - type
+      title: VectorStoreChunkingStrategyAuto
+      description: >-
+        Automatic chunking strategy for vector store files.
+    VectorStoreChunkingStrategyStatic:
+      type: object
+      properties:
+        type:
+          type: string
+          const: static
+          default: static
+          description: >-
+            Strategy type, always "static" for static chunking
+        static:
+          $ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
+          description: >-
+            Configuration parameters for the static chunking strategy
+      additionalProperties: false
+      required:
+        - type
+        - static
+      title: VectorStoreChunkingStrategyStatic
+      description: >-
+        Static chunking strategy with configurable parameters.
+    VectorStoreChunkingStrategyStaticConfig:
+      type: object
+      properties:
+        chunk_overlap_tokens:
+          type: integer
+          default: 400
+          description: >-
+            Number of tokens to overlap between adjacent chunks
+        max_chunk_size_tokens:
+          type: integer
+          default: 800
+          description: >-
+            Maximum number of tokens per chunk, must be between 100 and 4096
+      additionalProperties: false
+      required:
+        - chunk_overlap_tokens
+        - max_chunk_size_tokens
+      title: VectorStoreChunkingStrategyStaticConfig
+      description: >-
+        Configuration for static chunking strategy.
    "OpenAICreateVectorStoreRequestWithExtraBody":
      type: object
      properties:
@ -9616,15 +9351,7 @@ components:
          description: >-
            (Optional) Expiration policy for the vector store
        chunking_strategy:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
+          $ref: '#/components/schemas/VectorStoreChunkingStrategy'
          description: >-
            (Optional) Strategy for splitting files into chunks
        metadata:
@ -9700,70 +9427,6 @@ components:
        - deleted
      title: VectorStoreDeleteResponse
      description: Response from deleting a vector store.
-    VectorStoreChunkingStrategy:
-      oneOf:
-        - $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
-        - $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
-      discriminator:
-        propertyName: type
-        mapping:
-          auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
-          static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
-    VectorStoreChunkingStrategyAuto:
-      type: object
-      properties:
-        type:
-          type: string
-          const: auto
-          default: auto
-          description: >-
-            Strategy type, always "auto" for automatic chunking
-      additionalProperties: false
-      required:
-        - type
-      title: VectorStoreChunkingStrategyAuto
-      description: >-
-        Automatic chunking strategy for vector store files.
-    VectorStoreChunkingStrategyStatic:
-      type: object
-      properties:
-        type:
-          type: string
-          const: static
-          default: static
-          description: >-
-            Strategy type, always "static" for static chunking
-        static:
-          $ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
-          description: >-
-            Configuration parameters for the static chunking strategy
-      additionalProperties: false
-      required:
-        - type
-        - static
-      title: VectorStoreChunkingStrategyStatic
-      description: >-
-        Static chunking strategy with configurable parameters.
-    VectorStoreChunkingStrategyStaticConfig:
-      type: object
-      properties:
-        chunk_overlap_tokens:
-          type: integer
-          default: 400
-          description: >-
-            Number of tokens to overlap between adjacent chunks
-        max_chunk_size_tokens:
-          type: integer
-          default: 800
-          description: >-
-            Maximum number of tokens per chunk, must be between 100 and 4096
-      additionalProperties: false
-      required:
-        - chunk_overlap_tokens
-        - max_chunk_size_tokens
-      title: VectorStoreChunkingStrategyStaticConfig
-      description: >-
-        Configuration for static chunking strategy.
    "OpenAICreateVectorStoreFileBatchRequestWithExtraBody":
      type: object
      properties:
@ -10221,7 +9884,9 @@ components:
          description: >-
            Object type identifier for the search results page
        search_query:
-          type: string
+          type: array
+          items:
+            type: string
          description: >-
            The original search query that was executed
        data:
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@ -2055,69 +2055,6 @@ paths:
          schema:
            $ref: '#/components/schemas/URL'
      deprecated: false
-  /v1/tool-runtime/rag-tool/insert:
-    post:
-      responses:
-        '200':
-          description: OK
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - ToolRuntime
-      summary: >-
-        Index documents so they can be used by the RAG system.
-      description: >-
-        Index documents so they can be used by the RAG system.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/InsertRequest'
-        required: true
-      deprecated: false
-  /v1/tool-runtime/rag-tool/query:
-    post:
-      responses:
-        '200':
-          description: >-
-            RAGQueryResult containing the retrieved content and metadata
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/RAGQueryResult'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - ToolRuntime
-      summary: >-
-        Query the RAG system for context; typically invoked by the agent.
-      description: >-
-        Query the RAG system for context; typically invoked by the agent.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/QueryRequest'
-        required: true
-      deprecated: false
  /v1/toolgroups:
    get:
      responses:
@ -6854,6 +6791,8 @@ components:
              const: web_search_preview
            - type: string
              const: web_search_preview_2025_03_11
+            - type: string
+              const: web_search_2025_08_26
          default: web_search
          description: Web search tool type variant to use
        search_context_size:
@ -9633,274 +9572,6 @@ components:
      title: ListToolDefsResponse
      description: >-
        Response containing a list of tool definitions.
-    RAGDocument:
-      type: object
-      properties:
-        document_id:
-          type: string
-          description: The unique identifier for the document.
-        content:
-          oneOf:
-            - type: string
-            - $ref: '#/components/schemas/InterleavedContentItem'
-            - type: array
-              items:
-                $ref: '#/components/schemas/InterleavedContentItem'
-            - $ref: '#/components/schemas/URL'
-          description: The content of the document.
-        mime_type:
-          type: string
-          description: The MIME type of the document.
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: Additional metadata for the document.
-      additionalProperties: false
-      required:
-        - document_id
-        - content
-        - metadata
-      title: RAGDocument
-      description: >-
-        A document to be used for document ingestion in the RAG Tool.
-    InsertRequest:
-      type: object
-      properties:
-        documents:
-          type: array
-          items:
-            $ref: '#/components/schemas/RAGDocument'
-          description: >-
-            List of documents to index in the RAG system
-        vector_store_id:
-          type: string
-          description: >-
-            ID of the vector database to store the document embeddings
-        chunk_size_in_tokens:
-          type: integer
-          description: >-
-            (Optional) Size in tokens for document chunking during indexing
-      additionalProperties: false
-      required:
-        - documents
-        - vector_store_id
-        - chunk_size_in_tokens
-      title: InsertRequest
-    DefaultRAGQueryGeneratorConfig:
-      type: object
-      properties:
-        type:
-          type: string
-          const: default
-          default: default
-          description: >-
-            Type of query generator, always 'default'
-        separator:
-          type: string
-          default: ' '
-          description: >-
-            String separator used to join query terms
-      additionalProperties: false
-      required:
-        - type
-        - separator
-      title: DefaultRAGQueryGeneratorConfig
-      description: >-
-        Configuration for the default RAG query generator.
-    LLMRAGQueryGeneratorConfig:
-      type: object
-      properties:
-        type:
-          type: string
-          const: llm
-          default: llm
-          description: Type of query generator, always 'llm'
-        model:
-          type: string
-          description: >-
-            Name of the language model to use for query generation
-        template:
-          type: string
-          description: >-
-            Template string for formatting the query generation prompt
-      additionalProperties: false
-      required:
-        - type
-        - model
-        - template
-      title: LLMRAGQueryGeneratorConfig
-      description: >-
-        Configuration for the LLM-based RAG query generator.
-    RAGQueryConfig:
-      type: object
-      properties:
-        query_generator_config:
-          oneOf:
-            - $ref: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
-            - $ref: '#/components/schemas/LLMRAGQueryGeneratorConfig'
-          discriminator:
-            propertyName: type
-            mapping:
-              default: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
-              llm: '#/components/schemas/LLMRAGQueryGeneratorConfig'
-          description: Configuration for the query generator.
-        max_tokens_in_context:
-          type: integer
-          default: 4096
-          description: Maximum number of tokens in the context.
-        max_chunks:
-          type: integer
-          default: 5
-          description: Maximum number of chunks to retrieve.
-        chunk_template:
-          type: string
-          default: >
-            Result {index}
-
-            Content: {chunk.content}
-
-            Metadata: {metadata}
-          description: >-
-            Template for formatting each retrieved chunk in the context. Available
-            placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk
-            content string), {metadata} (chunk metadata dict). Default: "Result {index}\nContent:
-            {chunk.content}\nMetadata: {metadata}\n"
-        mode:
-          $ref: '#/components/schemas/RAGSearchMode'
-          default: vector
-          description: >-
-            Search mode for retrieval—either "vector", "keyword", or "hybrid". Default
-            "vector".
-        ranker:
-          $ref: '#/components/schemas/Ranker'
-          description: >-
-            Configuration for the ranker to use in hybrid search. Defaults to RRF
-            ranker.
-      additionalProperties: false
-      required:
-        - query_generator_config
-        - max_tokens_in_context
-        - max_chunks
-        - chunk_template
-      title: RAGQueryConfig
-      description: >-
-        Configuration for the RAG query generation.
-    RAGSearchMode:
-      type: string
-      enum:
-        - vector
-        - keyword
-        - hybrid
-      title: RAGSearchMode
-      description: >-
-        Search modes for RAG query retrieval: - VECTOR: Uses vector similarity search
-        for semantic matching - KEYWORD: Uses keyword-based search for exact matching
-        - HYBRID: Combines both vector and keyword search for better results
-    RRFRanker:
-      type: object
-      properties:
-        type:
-          type: string
-          const: rrf
-          default: rrf
-          description: The type of ranker, always "rrf"
-        impact_factor:
-          type: number
-          default: 60.0
-          description: >-
-            The impact factor for RRF scoring. Higher values give more weight to higher-ranked
-            results. Must be greater than 0
-      additionalProperties: false
-      required:
-        - type
-        - impact_factor
-      title: RRFRanker
-      description: >-
-        Reciprocal Rank Fusion (RRF) ranker configuration.
-    Ranker:
-      oneOf:
-        - $ref: '#/components/schemas/RRFRanker'
-        - $ref: '#/components/schemas/WeightedRanker'
-      discriminator:
-        propertyName: type
-        mapping:
-          rrf: '#/components/schemas/RRFRanker'
-          weighted: '#/components/schemas/WeightedRanker'
-    WeightedRanker:
-      type: object
-      properties:
-        type:
-          type: string
-          const: weighted
-          default: weighted
-          description: The type of ranker, always "weighted"
-        alpha:
-          type: number
-          default: 0.5
-          description: >-
-            Weight factor between 0 and 1. 0 means only use keyword scores, 1 means
-            only use vector scores, values in between blend both scores.
-      additionalProperties: false
-      required:
-        - type
-        - alpha
-      title: WeightedRanker
-      description: >-
-        Weighted ranker configuration that combines vector and keyword scores.
-    QueryRequest:
-      type: object
-      properties:
-        content:
-          $ref: '#/components/schemas/InterleavedContent'
-          description: >-
-            The query content to search for in the indexed documents
-        vector_store_ids:
-          type: array
-          items:
-            type: string
-          description: >-
-            List of vector database IDs to search within
-        query_config:
-          $ref: '#/components/schemas/RAGQueryConfig'
-          description: >-
-            (Optional) Configuration parameters for the query operation
-      additionalProperties: false
-      required:
-        - content
-        - vector_store_ids
-      title: QueryRequest
-    RAGQueryResult:
-      type: object
-      properties:
-        content:
-          $ref: '#/components/schemas/InterleavedContent'
-          description: >-
-            (Optional) The retrieved content from the query
-        metadata:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
-          description: >-
-            Additional metadata about the query result
-      additionalProperties: false
-      required:
-        - metadata
-      title: RAGQueryResult
-      description: >-
-        Result of a RAG query containing retrieved content and metadata.
    ToolGroup:
      type: object
      properties:
@ -10307,6 +9978,70 @@ components:
        - metadata
      title: VectorStoreObject
      description: OpenAI Vector Store object.
+    VectorStoreChunkingStrategy:
+      oneOf:
+        - $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
+        - $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
+      discriminator:
+        propertyName: type
+        mapping:
+          auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
+          static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
+    VectorStoreChunkingStrategyAuto:
+      type: object
+      properties:
+        type:
+          type: string
+          const: auto
+          default: auto
+          description: >-
+            Strategy type, always "auto" for automatic chunking
+      additionalProperties: false
+      required:
+        - type
+      title: VectorStoreChunkingStrategyAuto
+      description: >-
+        Automatic chunking strategy for vector store files.
+    VectorStoreChunkingStrategyStatic:
+      type: object
+      properties:
+        type:
+          type: string
+          const: static
+          default: static
+          description: >-
+            Strategy type, always "static" for static chunking
+        static:
+          $ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
+          description: >-
+            Configuration parameters for the static chunking strategy
+      additionalProperties: false
+      required:
+        - type
+        - static
+      title: VectorStoreChunkingStrategyStatic
+      description: >-
+        Static chunking strategy with configurable parameters.
+    VectorStoreChunkingStrategyStaticConfig:
+      type: object
+      properties:
+        chunk_overlap_tokens:
+          type: integer
+          default: 400
+          description: >-
+            Number of tokens to overlap between adjacent chunks
+        max_chunk_size_tokens:
+          type: integer
+          default: 800
+          description: >-
+            Maximum number of tokens per chunk, must be between 100 and 4096
+      additionalProperties: false
+      required:
+        - chunk_overlap_tokens
+        - max_chunk_size_tokens
+      title: VectorStoreChunkingStrategyStaticConfig
+      description: >-
+        Configuration for static chunking strategy.
    "OpenAICreateVectorStoreRequestWithExtraBody":
      type: object
      properties:
@ -10332,15 +10067,7 @@ components:
          description: >-
            (Optional) Expiration policy for the vector store
        chunking_strategy:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
+          $ref: '#/components/schemas/VectorStoreChunkingStrategy'
          description: >-
            (Optional) Strategy for splitting files into chunks
        metadata:
@ -10416,70 +10143,6 @@ components:
        - deleted
      title: VectorStoreDeleteResponse
      description: Response from deleting a vector store.
-    VectorStoreChunkingStrategy:
-      oneOf:
-        - $ref: '#/components/schemas/VectorStoreChunkingStrategyAuto'
-        - $ref: '#/components/schemas/VectorStoreChunkingStrategyStatic'
-      discriminator:
-        propertyName: type
-        mapping:
-          auto: '#/components/schemas/VectorStoreChunkingStrategyAuto'
-          static: '#/components/schemas/VectorStoreChunkingStrategyStatic'
-    VectorStoreChunkingStrategyAuto:
-      type: object
-      properties:
-        type:
-          type: string
-          const: auto
-          default: auto
-          description: >-
-            Strategy type, always "auto" for automatic chunking
-      additionalProperties: false
-      required:
-        - type
-      title: VectorStoreChunkingStrategyAuto
-      description: >-
-        Automatic chunking strategy for vector store files.
-    VectorStoreChunkingStrategyStatic:
-      type: object
-      properties:
-        type:
-          type: string
-          const: static
-          default: static
-          description: >-
-            Strategy type, always "static" for static chunking
-        static:
-          $ref: '#/components/schemas/VectorStoreChunkingStrategyStaticConfig'
-          description: >-
-            Configuration parameters for the static chunking strategy
-      additionalProperties: false
-      required:
-        - type
-        - static
-      title: VectorStoreChunkingStrategyStatic
-      description: >-
-        Static chunking strategy with configurable parameters.
-    VectorStoreChunkingStrategyStaticConfig:
-      type: object
-      properties:
-        chunk_overlap_tokens:
-          type: integer
-          default: 400
-          description: >-
-            Number of tokens to overlap between adjacent chunks
-        max_chunk_size_tokens:
-          type: integer
-          default: 800
-          description: >-
-            Maximum number of tokens per chunk, must be between 100 and 4096
-      additionalProperties: false
-      required:
-        - chunk_overlap_tokens
-        - max_chunk_size_tokens
-      title: VectorStoreChunkingStrategyStaticConfig
-      description: >-
-        Configuration for static chunking strategy.
    "OpenAICreateVectorStoreFileBatchRequestWithExtraBody":
      type: object
      properties:
@ -10937,7 +10600,9 @@ components:
          description: >-
            Object type identifier for the search results page
        search_query:
-          type: string
+          type: array
+          items:
+            type: string
          description: >-
            The original search query that was executed
        data:
--- a/pyproject.toml
+++ b/pyproject.toml
@ -24,13 +24,13 @@ classifiers = [
    "Topic :: Scientific/Engineering :: Information Analysis",
 ]
 dependencies = [
+    "PyYAML>=6.0",
    "aiohttp",
    "fastapi>=0.115.0,<1.0",                          # server
    "fire",                                           # for MCP in LLS client
    "httpx",
    "jinja2>=3.1.6",
    "jsonschema",
-    "llama-stack-client>=0.3.0",
    "openai>=2.5.0",
    "prompt-toolkit",
    "python-dotenv",
@ -52,11 +52,8 @@ dependencies = [
 ]

 [project.optional-dependencies]
-ui = [
-    "streamlit",
-    "pandas",
-    "llama-stack-client>=0.3.0",
-    "streamlit-option-menu",
+client = [
+    "llama-stack-client>=0.3.0",  # Optional for library-only usage
 ]

 [dependency-groups]
@ -104,6 +101,7 @@ type_checking = [
    "lm-format-enforcer",
    "mcp",
    "ollama",
+    "llama-stack-client>=0.3.0",
 ]
 # These are the dependencies required for running unit tests.
 unit = [
--- a/scripts/cleanup_recordings.py
+++ b/scripts/cleanup_recordings.py
@ -0,0 +1,272 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""
+Clean up unused test recordings based on CI test collection.
+
+This script:
+1. Reads CI matrix definitions from tests/integration/ci_matrix.json (default + scheduled overrides)
+2. Uses pytest --collect-only with --json-report to gather all test IDs that run in CI
+3. Compares against existing recordings to identify unused ones
+4. Optionally deletes unused recordings
+
+Usage:
+    # Dry run - see what would be deleted
+    ./scripts/cleanup_recordings.py
+
+    # Save manifest of CI test IDs for inspection
+    ./scripts/cleanup_recordings.py --manifest ci_tests.txt
+
+    # Actually delete unused recordings
+    ./scripts/cleanup_recordings.py --delete
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import tempfile
+from collections import defaultdict
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).parent.parent
+
+# Load CI matrix from JSON file
+CI_MATRIX_FILE = REPO_ROOT / "tests/integration/ci_matrix.json"
+with open(CI_MATRIX_FILE) as f:
+    _matrix_config = json.load(f)
+
+DEFAULT_CI_MATRIX: list[dict[str, str]] = _matrix_config["default"]
+SCHEDULED_MATRICES: dict[str, list[dict[str, str]]] = _matrix_config.get("schedules", {})
+
+
+def _unique_configs(entries):
+    seen: set[tuple[str, str]] = set()
+    for entry in entries:
+        suite = entry["suite"]
+        setup = entry["setup"]
+        key = (suite, setup)
+        if key in seen:
+            continue
+        seen.add(key)
+        yield {"suite": suite, "setup": setup}
+
+
+def iter_all_ci_configs() -> list[dict[str, str]]:
+    """Return unique CI configs across default and scheduled matrices."""
+    combined = list(DEFAULT_CI_MATRIX)
+    for configs in SCHEDULED_MATRICES.values():
+        combined.extend(configs)
+    return list(_unique_configs(combined))
+
+
+def collect_ci_tests():
+    """Collect all test IDs that would run in CI using --collect-only with JSON output."""
+
+    all_test_ids = set()
+    configs = iter_all_ci_configs()
+
+    for config in configs:
+        print(f"Collecting tests for suite={config['suite']}, setup={config['setup']}...")
+
+        # Create a temporary file for JSON report
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+            json_report_file = f.name
+
+        try:
+            # Configure environment for collection run
+            env = os.environ.copy()
+            env["PYTEST_ADDOPTS"] = f"--json-report --json-report-file={json_report_file}"
+            repo_path = str(REPO_ROOT)
+            existing_path = env.get("PYTHONPATH", "")
+            env["PYTHONPATH"] = f"{repo_path}{os.pathsep}{existing_path}" if existing_path else repo_path
+
+            result = subprocess.run(
+                [
+                    "./scripts/integration-tests.sh",
+                    "--collect-only",
+                    "--suite",
+                    config["suite"],
+                    "--setup",
+                    config["setup"],
+                ],
+                capture_output=True,
+                text=True,
+                cwd=REPO_ROOT,
+                env=env,
+            )
+
+            if result.returncode != 0:
+                raise RuntimeError(
+                    "Test collection failed.\n"
+                    f"Command: {' '.join(result.args)}\n"
+                    f"stdout:\n{result.stdout}\n"
+                    f"stderr:\n{result.stderr}"
+                )
+
+            # Parse JSON report to extract test IDs
+            try:
+                with open(json_report_file) as f:
+                    report = json.load(f)
+
+                # The "collectors" field contains collected test items
+                # Each collector has a "result" array with test node IDs
+                for collector in report.get("collectors", []):
+                    for item in collector.get("result", []):
+                        # The "nodeid" field is the test ID
+                        if "nodeid" in item:
+                            all_test_ids.add(item["nodeid"])
+
+                print(f"  Collected {len(all_test_ids)} test IDs so far")
+
+            except (json.JSONDecodeError, FileNotFoundError) as e:
+                print(f"  Warning: Failed to parse JSON report: {e}")
+                continue
+
+        finally:
+            # Clean up temp file
+            if os.path.exists(json_report_file):
+                os.unlink(json_report_file)
+
+    print(f"\nTotal unique test IDs collected: {len(all_test_ids)}")
+    return all_test_ids, configs
+
+
+def get_base_test_id(test_id: str) -> str:
+    """Extract base test ID without parameterization.
+
+    Example:
+      'tests/integration/inference/test_foo.py::test_bar[param1-param2]'
+      -> 'tests/integration/inference/test_foo.py::test_bar'
+    """
+    return test_id.split("[")[0] if "[" in test_id else test_id
+
+
+def find_all_recordings():
+    """Find all recording JSON files."""
+    return list((REPO_ROOT / "tests/integration").rglob("recordings/*.json"))
+
+
+def analyze_recordings(ci_test_ids, dry_run=True):
+    """Analyze recordings and identify unused ones."""
+
+    # Use full test IDs with parameterization for exact matching
+    all_recordings = find_all_recordings()
+    print(f"\nTotal recording files: {len(all_recordings)}")
+
+    # Categorize recordings
+    used_recordings = []
+    unused_recordings = []
+    shared_recordings = []  # model-list endpoints without test_id
+    parse_errors = []
+
+    for json_file in all_recordings:
+        try:
+            with open(json_file) as f:
+                data = json.load(f)
+
+            test_id = data.get("test_id", "")
+
+            if not test_id:
+                # Shared/infrastructure recordings (model lists, etc)
+                shared_recordings.append(json_file)
+                continue
+
+            # Match exact test_id (with full parameterization)
+            if test_id in ci_test_ids:
+                used_recordings.append(json_file)
+            else:
+                unused_recordings.append((json_file, test_id))
+
+        except Exception as e:
+            parse_errors.append((json_file, str(e)))
+
+    # Print summary
+    print("\nRecording Analysis:")
+    print(f"  Used in CI:     {len(used_recordings)}")
+    print(f"  Shared (no ID): {len(shared_recordings)}")
+    print(f"  UNUSED:         {len(unused_recordings)}")
+    print(f"  Parse errors:   {len(parse_errors)}")
+
+    if unused_recordings:
+        print("\nUnused recordings by test:")
+
+        # Group by base test ID
+        by_test = defaultdict(list)
+        for file, test_id in unused_recordings:
+            base = get_base_test_id(test_id)
+            by_test[base].append(file)
+
+        for base_test, files in sorted(by_test.items()):
+            print(f"\n  {base_test}")
+            print(f"    ({len(files)} recording(s))")
+            for f in files[:3]:
+                print(f"      - {f.relative_to(REPO_ROOT / 'tests/integration')}")
+            if len(files) > 3:
+                print(f"      ... and {len(files) - 3} more")
+
+    if parse_errors:
+        print("\nParse errors:")
+        for file, error in parse_errors[:5]:
+            print(f"  {file.relative_to(REPO_ROOT)}: {error}")
+        if len(parse_errors) > 5:
+            print(f"  ... and {len(parse_errors) - 5} more")
+
+    # Perform cleanup
+    if not dry_run:
+        print(f"\nDeleting {len(unused_recordings)} unused recordings...")
+        for file, _ in unused_recordings:
+            file.unlink()
+            print(f"  Deleted: {file.relative_to(REPO_ROOT / 'tests/integration')}")
+        print("✅ Cleanup complete")
+    else:
+        print("\n(Dry run - no files deleted)")
+        print("\nTo delete these files, run with --delete")
+
+    return len(unused_recordings)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Clean up unused test recordings based on CI test collection",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+    parser.add_argument("--delete", action="store_true", help="Actually delete unused recordings (default is dry-run)")
+    parser.add_argument("--manifest", help="Save collected test IDs to file (optional)")
+
+    args = parser.parse_args()
+
+    print("=" * 60)
+    print("Recording Cleanup Utility")
+    print("=" * 60)
+
+    ci_configs = iter_all_ci_configs()
+
+    print(f"\nDetected CI configurations: {len(ci_configs)}")
+    for config in ci_configs:
+        print(f"  - suite={config['suite']}, setup={config['setup']}")
+
+    # Collect test IDs from CI configurations
+    ci_test_ids, _ = collect_ci_tests()
+
+    if args.manifest:
+        with open(args.manifest, "w") as f:
+            for test_id in sorted(ci_test_ids):
+                f.write(f"{test_id}\n")
+        print(f"\nSaved test IDs to: {args.manifest}")
+
+    # Analyze and cleanup
+    unused_count = analyze_recordings(ci_test_ids, dry_run=not args.delete)
+
+    print("\n" + "=" * 60)
+    if unused_count > 0 and not args.delete:
+        print("Run with --delete to remove unused recordings")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/generate_ci_matrix.py
+++ b/scripts/generate_ci_matrix.py
@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+"""
+Generate CI test matrix from ci_matrix.json with schedule/input overrides.
+
+This script is used by .github/workflows/integration-tests.yml to generate
+the test matrix dynamically from tests/integration/ci_matrix.json.
+"""
+
+import json
+from pathlib import Path
+
+CI_MATRIX_FILE = Path(__file__).parent.parent / "tests/integration/ci_matrix.json"
+
+with open(CI_MATRIX_FILE) as f:
+    matrix_config = json.load(f)
+
+DEFAULT_MATRIX = matrix_config["default"]
+SCHEDULE_MATRICES: dict[str, list[dict[str, str]]] = matrix_config.get("schedules", {})
+
+
+def generate_matrix(schedule="", test_setup=""):
+    """
+    Generate test matrix based on schedule or manual input.
+
+    Args:
+        schedule: GitHub cron schedule string (e.g., "1 0 * * 0" for weekly)
+        test_setup: Manual test setup input (e.g., "ollama-vision")
+
+    Returns:
+        Matrix configuration as JSON string
+    """
+    # Scheduled run: use the matrix registered for this cron string
+    if schedule and schedule in SCHEDULE_MATRICES:
+        matrix = SCHEDULE_MATRICES[schedule]
+    # Manual input for specific setup
+    elif test_setup == "ollama-vision":
+        matrix = [{"suite": "vision", "setup": "ollama-vision"}]
+    # Default: use JSON-defined matrix
+    else:
+        matrix = DEFAULT_MATRIX
+
+    # GitHub Actions expects {"include": [...]} format
+    return json.dumps({"include": matrix})
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Generate CI test matrix")
+    parser.add_argument("--schedule", default="", help="GitHub schedule cron string")
+    parser.add_argument("--test-setup", default="", help="Manual test setup input")
+
+    args = parser.parse_args()
+
+    print(generate_matrix(args.schedule, args.test_setup))
--- a/scripts/integration-tests.sh
+++ b/scripts/integration-tests.sh
@ -227,14 +227,16 @@ if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
    echo "=== Starting Llama Stack Server ==="
    export LLAMA_STACK_LOG_WIDTH=120

-    # Configure telemetry collector for server mode
-    # Use a fixed port for the OTEL collector so the server can connect to it
-    COLLECTOR_PORT=4317
-    export LLAMA_STACK_TEST_COLLECTOR_PORT="${COLLECTOR_PORT}"
-    export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:${COLLECTOR_PORT}"
-    export OTEL_EXPORTER_OTLP_PROTOCOL="http/protobuf"
-    export OTEL_BSP_SCHEDULE_DELAY="200"
-    export OTEL_BSP_EXPORT_TIMEOUT="2000"
+        # Configure telemetry collector for server mode
+        # Use a fixed port for the OTEL collector so the server can connect to it
+        COLLECTOR_PORT=4317
+        export LLAMA_STACK_TEST_COLLECTOR_PORT="${COLLECTOR_PORT}"
+        # Disabled: https://github.com/llamastack/llama-stack/issues/4089
+        #export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:${COLLECTOR_PORT}"
+        export OTEL_EXPORTER_OTLP_PROTOCOL="http/protobuf"
+        export OTEL_BSP_SCHEDULE_DELAY="200"
+        export OTEL_BSP_EXPORT_TIMEOUT="2000"
+        export OTEL_METRIC_EXPORT_INTERVAL="200"

    # remove "server:" from STACK_CONFIG
    stack_config=$(echo "$STACK_CONFIG" | sed 's/^server://')
@ -336,7 +338,11 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
    DOCKER_ENV_VARS=""
    DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_INFERENCE_MODE=$INFERENCE_MODE"
    DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_STACK_CONFIG_TYPE=server"
-    DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:${COLLECTOR_PORT}"
+    # Disabled: https://github.com/llamastack/llama-stack/issues/4089
+    #DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:${COLLECTOR_PORT}"
+    DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_METRIC_EXPORT_INTERVAL=200"
+    DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_BSP_SCHEDULE_DELAY=200"
+    DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_BSP_EXPORT_TIMEOUT=2000"

    # Pass through API keys if they exist
    [ -n "${TOGETHER_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e TOGETHER_API_KEY=$TOGETHER_API_KEY"
@ -349,6 +355,10 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
    [ -n "${OLLAMA_URL:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OLLAMA_URL=$OLLAMA_URL"
    [ -n "${SAFETY_MODEL:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e SAFETY_MODEL=$SAFETY_MODEL"

+    if [[ "$TEST_SETUP" == "vllm" ]]; then
+        DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e VLLM_URL=http://localhost:8000/v1"
+    fi
+
    # Determine the actual image name (may have localhost/ prefix)
    IMAGE_NAME=$(docker images --format "{{.Repository}}:{{.Tag}}" | grep "distribution-$DISTRO:dev$" | head -1)
    if [[ -z "$IMAGE_NAME" ]]; then
@ -401,11 +411,6 @@ fi
 echo "=== Running Integration Tests ==="
 EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag"

-# Additional exclusions for vllm setup
-if [[ "$TEST_SETUP" == "vllm" ]]; then
-    EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls"
-fi
-
 PYTEST_PATTERN="not( $EXCLUDE_TESTS )"
 if [[ -n "$TEST_PATTERN" ]]; then
    PYTEST_PATTERN="${PYTEST_PATTERN} and $TEST_PATTERN"
--- a/scripts/run-ui-linter.sh
+++ b/scripts/run-ui-linter.sh
@ -6,7 +6,7 @@
 # the root directory of this source tree.

 set -e
-cd src/llama_stack/ui
+cd src/llama_stack_ui

 if [ ! -d node_modules ] || [ ! -x node_modules/.bin/prettier ] || [ ! -x node_modules/.bin/eslint ]; then
  echo "UI dependencies not installed, skipping prettier/linter check"
--- a/src/llama_stack/init.py
+++ b/src/llama_stack/init.py
@ -3,8 +3,3 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-
-from llama_stack.core.library_client import (  # noqa: F401
-    AsyncLlamaStackAsLibraryClient,
-    LlamaStackAsLibraryClient,
-)
--- a/src/llama_stack/apis/agents/openai_responses.py
+++ b/src/llama_stack/apis/agents/openai_responses.py
@ -403,7 +403,7 @@ class OpenAIResponseText(BaseModel):


 # Must match type Literals of OpenAIResponseInputToolWebSearch below
-WebSearchToolTypes = ["web_search", "web_search_preview", "web_search_preview_2025_03_11"]
+WebSearchToolTypes = ["web_search", "web_search_preview", "web_search_preview_2025_03_11", "web_search_2025_08_26"]


@json_schema_type
@ -415,9 +415,12 @@ class OpenAIResponseInputToolWebSearch(BaseModel):
    """

    # Must match values of WebSearchToolTypes above
-    type: Literal["web_search"] | Literal["web_search_preview"] | Literal["web_search_preview_2025_03_11"] = (
-        "web_search"
-    )
+    type: (
+        Literal["web_search"]
+        | Literal["web_search_preview"]
+        | Literal["web_search_preview_2025_03_11"]
+        | Literal["web_search_2025_08_26"]
+    ) = "web_search"
    # TODO: actually use search_context_size somewhere...
    search_context_size: str | None = Field(default="medium", pattern="^low|medium|high$")
    # TODO: add user_location
--- a/src/llama_stack/apis/common/responses.py
+++ b/src/llama_stack/apis/common/responses.py
@ -34,3 +34,44 @@ class PaginatedResponse(BaseModel):
    data: list[dict[str, Any]]
    has_more: bool
    url: str | None = None
+
+
+# This is a short term solution to allow inference API to return metrics
+# The ideal way to do this is to have a way for all response types to include metrics
+# and all metric events logged to the telemetry API to be included with the response
+# To do this, we will need to augment all response types with a metrics field.
+# We have hit a blocker from stainless SDK that prevents us from doing this.
+# The blocker is that if we were to augment the response types that have a data field
+# in them like so
+# class ListModelsResponse(BaseModel):
+#     metrics: Optional[List[MetricEvent]] = None
+#     data: List[Models]
+#     ...
+# The client SDK will need to access the data by using a .data field, which is not
+# ergonomic. Stainless SDK does support unwrapping the response type, but it
+# requires the response type to have only a single field.
+
+# We will need a way in the client SDK to signal that the metrics are needed
+# and if they are needed, the client SDK has to return the full response type
+# without unwrapping it.
+
+
+@json_schema_type
+class MetricInResponse(BaseModel):
+    """A metric value included in API responses.
+    :param metric: The name of the metric
+    :param value: The numeric value of the metric
+    :param unit: (Optional) The unit of measurement for the metric value
+    """
+
+    metric: str
+    value: int | float
+    unit: str | None = None
+
+
+class MetricResponseMixin(BaseModel):
+    """Mixin class for API responses that can include metrics.
+    :param metrics: (Optional) List of metrics associated with the API response
+    """
+
+    metrics: list[MetricInResponse] | None = None
--- a/src/llama_stack/apis/common/tracing.py
+++ b/src/llama_stack/apis/common/tracing.py
@ -0,0 +1,22 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+def telemetry_traceable(cls):
+    """
+    Mark a protocol for automatic tracing when telemetry is enabled.
+
+    Metadata-only decorator with no dependencies on core: it sets
+    ``__marked_for_tracing__ = True`` on ``cls`` and returns the same class.
+    Core applies the actual tracing at runtime if telemetry is enabled.
+
+    Usage:
+        @runtime_checkable
+        @telemetry_traceable
+        class MyProtocol(Protocol): ...
+    """
+    cls.__marked_for_tracing__ = True
+    return cls
--- a/src/llama_stack/apis/conversations/init.py
+++ b/src/llama_stack/apis/conversations/init.py
@ -6,26 +6,22 @@

 from .conversations import (
    Conversation,
-    ConversationCreateRequest,
    ConversationDeletedResource,
    ConversationItem,
    ConversationItemCreateRequest,
    ConversationItemDeletedResource,
    ConversationItemList,
    Conversations,
-    ConversationUpdateRequest,
    Metadata,
 )

 __all__ = [
    "Conversation",
-    "ConversationCreateRequest",
    "ConversationDeletedResource",
    "ConversationItem",
    "ConversationItemCreateRequest",
    "ConversationItemDeletedResource",
    "ConversationItemList",
    "Conversations",
-    "ConversationUpdateRequest",
    "Metadata",
 ]
--- a/src/llama_stack/apis/conversations/conversations.py
+++ b/src/llama_stack/apis/conversations/conversations.py
@ -20,8 +20,8 @@ from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseOutputMessageMCPListTools,
    OpenAIResponseOutputMessageWebSearchToolCall,
 )
+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, register_schema, webmethod

 Metadata = dict[str, str]
@ -102,32 +102,6 @@ register_schema(ConversationItem, name="ConversationItem")
 # ]


-@json_schema_type
-class ConversationCreateRequest(BaseModel):
-    """Request body for creating a conversation."""
-
-    items: list[ConversationItem] | None = Field(
-        default=[],
-        description="Initial items to include in the conversation context. You may add up to 20 items at a time.",
-        max_length=20,
-    )
-    metadata: Metadata | None = Field(
-        default={},
-        description="Set of 16 key-value pairs that can be attached to an object. Useful for storing additional information",
-        max_length=16,
-    )
-
-
-@json_schema_type
-class ConversationUpdateRequest(BaseModel):
-    """Request body for updating a conversation."""
-
-    metadata: Metadata = Field(
-        ...,
-        description="Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard. Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters.",
-    )
-
-
@json_schema_type
 class ConversationDeletedResource(BaseModel):
    """Response for deleted conversation."""
@ -183,7 +157,7 @@ class ConversationItemDeletedResource(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Conversations(Protocol):
    """Conversations

--- a/src/llama_stack/apis/files/files.py
+++ b/src/llama_stack/apis/files/files.py
@ -11,8 +11,8 @@ from fastapi import File, Form, Response, UploadFile
 from pydantic import BaseModel, Field

 from llama_stack.apis.common.responses import Order
+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -102,7 +102,7 @@ class OpenAIFileDeleteResponse(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Files(Protocol):
    """Files

--- a/src/llama_stack/apis/inference/inference.py
+++ b/src/llama_stack/apis/inference/inference.py
@ -19,11 +19,10 @@ from pydantic import BaseModel, Field, field_validator
 from typing_extensions import TypedDict

 from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
-from llama_stack.apis.common.responses import Order
+from llama_stack.apis.common.responses import MetricResponseMixin, Order
+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.models import Model
 from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
-from llama_stack.core.telemetry.telemetry import MetricResponseMixin
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.models.llama.datatypes import (
    BuiltinTool,
    StopReason,
@ -1160,7 +1159,7 @@ class OpenAIEmbeddingsRequestWithExtraBody(BaseModel, extra="allow"):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class InferenceProvider(Protocol):
    """
    This protocol defines the interface that should be implemented by all inference providers.
--- a/src/llama_stack/apis/models/models.py
+++ b/src/llama_stack/apis/models/models.py
@ -9,9 +9,9 @@ from typing import Any, Literal, Protocol, runtime_checkable

 from pydantic import BaseModel, ConfigDict, Field, field_validator

+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -105,7 +105,7 @@ class OpenAIListModelsResponse(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Models(Protocol):
    async def list_models(self) -> ListModelsResponse:
        """List all models.
--- a/src/llama_stack/apis/prompts/prompts.py
+++ b/src/llama_stack/apis/prompts/prompts.py
@ -10,8 +10,8 @@ from typing import Protocol, runtime_checkable

 from pydantic import BaseModel, Field, field_validator, model_validator

+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -92,7 +92,7 @@ class ListPromptsResponse(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Prompts(Protocol):
    """Prompts

--- a/src/llama_stack/apis/safety/safety.py
+++ b/src/llama_stack/apis/safety/safety.py
@ -9,10 +9,10 @@ from typing import Any, Protocol, runtime_checkable

 from pydantic import BaseModel, Field

+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.inference import OpenAIMessageParam
 from llama_stack.apis.shields import Shield
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -94,7 +94,7 @@ class ShieldStore(Protocol):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Safety(Protocol):
    """Safety

--- a/src/llama_stack/apis/shields/shields.py
+++ b/src/llama_stack/apis/shields/shields.py
@ -8,9 +8,9 @@ from typing import Any, Literal, Protocol, runtime_checkable

 from pydantic import BaseModel

+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod


@ -48,7 +48,7 @@ class ListShieldsResponse(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class Shields(Protocol):
    @webmethod(route="/shields", method="GET", level=LLAMA_STACK_API_V1)
    async def list_shields(self) -> ListShieldsResponse:
--- a/src/llama_stack/apis/tools/rag_tool.py
+++ b/src/llama_stack/apis/tools/rag_tool.py
@ -5,18 +5,13 @@
 # the root directory of this source tree.

 from enum import Enum, StrEnum
-from typing import Annotated, Any, Literal, Protocol
+from typing import Annotated, Any, Literal

 from pydantic import BaseModel, Field, field_validator
-from typing_extensions import runtime_checkable

 from llama_stack.apis.common.content_types import URL, InterleavedContent
-from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
-from llama_stack.schema_utils import json_schema_type, register_schema, webmethod


-@json_schema_type
 class RRFRanker(BaseModel):
    """
    Reciprocal Rank Fusion (RRF) ranker configuration.
@ -30,7 +25,6 @@ class RRFRanker(BaseModel):
    impact_factor: float = Field(default=60.0, gt=0.0)  # default of 60 for optimal performance


-@json_schema_type
 class WeightedRanker(BaseModel):
    """
    Weighted ranker configuration that combines vector and keyword scores.
@ -55,10 +49,8 @@ Ranker = Annotated[
    RRFRanker | WeightedRanker,
    Field(discriminator="type"),
 ]
-register_schema(Ranker, name="Ranker")


-@json_schema_type
 class RAGDocument(BaseModel):
    """
    A document to be used for document ingestion in the RAG Tool.
@ -75,7 +67,6 @@ class RAGDocument(BaseModel):
    metadata: dict[str, Any] = Field(default_factory=dict)


-@json_schema_type
 class RAGQueryResult(BaseModel):
    """Result of a RAG query containing retrieved content and metadata.

@ -87,7 +78,6 @@ class RAGQueryResult(BaseModel):
    metadata: dict[str, Any] = Field(default_factory=dict)


-@json_schema_type
 class RAGQueryGenerator(Enum):
    """Types of query generators for RAG systems.

@ -101,7 +91,6 @@ class RAGQueryGenerator(Enum):
    custom = "custom"


-@json_schema_type
 class RAGSearchMode(StrEnum):
    """
    Search modes for RAG query retrieval:
@ -115,7 +104,6 @@ class RAGSearchMode(StrEnum):
    HYBRID = "hybrid"


-@json_schema_type
 class DefaultRAGQueryGeneratorConfig(BaseModel):
    """Configuration for the default RAG query generator.

@ -127,7 +115,6 @@ class DefaultRAGQueryGeneratorConfig(BaseModel):
    separator: str = " "


-@json_schema_type
 class LLMRAGQueryGeneratorConfig(BaseModel):
    """Configuration for the LLM-based RAG query generator.

@ -145,10 +132,8 @@ RAGQueryGeneratorConfig = Annotated[
    DefaultRAGQueryGeneratorConfig | LLMRAGQueryGeneratorConfig,
    Field(discriminator="type"),
 ]
-register_schema(RAGQueryGeneratorConfig, name="RAGQueryGeneratorConfig")


-@json_schema_type
 class RAGQueryConfig(BaseModel):
    """
    Configuration for the RAG query generation.
@ -181,38 +166,3 @@ class RAGQueryConfig(BaseModel):
        if len(v) == 0:
            raise ValueError("chunk_template must not be empty")
        return v
-
-
-@runtime_checkable
-@trace_protocol
-class RAGToolRuntime(Protocol):
-    @webmethod(route="/tool-runtime/rag-tool/insert", method="POST", level=LLAMA_STACK_API_V1)
-    async def insert(
-        self,
-        documents: list[RAGDocument],
-        vector_store_id: str,
-        chunk_size_in_tokens: int = 512,
-    ) -> None:
-        """Index documents so they can be used by the RAG system.
-
-        :param documents: List of documents to index in the RAG system
-        :param vector_store_id: ID of the vector database to store the document embeddings
-        :param chunk_size_in_tokens: (Optional) Size in tokens for document chunking during indexing
-        """
-        ...
-
-    @webmethod(route="/tool-runtime/rag-tool/query", method="POST", level=LLAMA_STACK_API_V1)
-    async def query(
-        self,
-        content: InterleavedContent,
-        vector_store_ids: list[str],
-        query_config: RAGQueryConfig | None = None,
-    ) -> RAGQueryResult:
-        """Query the RAG system for context; typically invoked by the agent.
-
-        :param content: The query content to search for in the indexed documents
-        :param vector_store_ids: List of vector database IDs to search within
-        :param query_config: (Optional) Configuration parameters for the query operation
-        :returns: RAGQueryResult containing the retrieved content and metadata
-        """
-        ...
--- a/src/llama_stack/apis/tools/tools.py
+++ b/src/llama_stack/apis/tools/tools.py
@ -11,13 +11,11 @@ from pydantic import BaseModel
 from typing_extensions import runtime_checkable

 from llama_stack.apis.common.content_types import URL, InterleavedContent
+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.resource import Resource, ResourceType
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod

-from .rag_tool import RAGToolRuntime
-

@json_schema_type
 class ToolDef(BaseModel):
@ -109,7 +107,7 @@ class ListToolDefsResponse(BaseModel):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class ToolGroups(Protocol):
    @webmethod(route="/toolgroups", method="POST", level=LLAMA_STACK_API_V1)
    async def register_tool_group(
@ -191,12 +189,10 @@ class SpecialToolGroup(Enum):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class ToolRuntime(Protocol):
    tool_store: ToolStore | None = None

-    rag_tool: RAGToolRuntime | None = None
-
    # TODO: This needs to be renamed once OPEN API generator name conflict issue is fixed.
    @webmethod(route="/tool-runtime/list-tools", method="GET", level=LLAMA_STACK_API_V1)
    async def list_runtime_tools(
--- a/src/llama_stack/apis/vector_io/vector_io.py
+++ b/src/llama_stack/apis/vector_io/vector_io.py
@ -13,10 +13,10 @@ from typing import Annotated, Any, Literal, Protocol, runtime_checkable
 from fastapi import Body
 from pydantic import BaseModel, Field

+from llama_stack.apis.common.tracing import telemetry_traceable
 from llama_stack.apis.inference import InterleavedContent
 from llama_stack.apis.vector_stores import VectorStore
 from llama_stack.apis.version import LLAMA_STACK_API_V1
-from llama_stack.core.telemetry.trace_protocol import trace_protocol
 from llama_stack.schema_utils import json_schema_type, webmethod
 from llama_stack.strong_typing.schema import register_schema

@ -260,7 +260,7 @@ class VectorStoreSearchResponsePage(BaseModel):
    """

    object: str = "vector_store.search_results.page"
-    search_query: str
+    search_query: list[str]
    data: list[VectorStoreSearchResponse]
    has_more: bool = False
    next_page: str | None = None
@ -478,7 +478,7 @@ class OpenAICreateVectorStoreRequestWithExtraBody(BaseModel, extra="allow"):
    name: str | None = None
    file_ids: list[str] | None = None
    expires_after: dict[str, Any] | None = None
-    chunking_strategy: dict[str, Any] | None = None
+    chunking_strategy: VectorStoreChunkingStrategy | None = None
    metadata: dict[str, Any] | None = None


@ -502,7 +502,7 @@ class VectorStoreTable(Protocol):


@runtime_checkable
-@trace_protocol
+@telemetry_traceable
 class VectorIO(Protocol):
    vector_store_table: VectorStoreTable | None = None

--- a/src/llama_stack/cli/stack/list_deps.py
+++ b/src/llama_stack/cli/stack/list_deps.py
@ -46,6 +46,10 @@ class StackListDeps(Subcommand):
    def _run_stack_list_deps_command(self, args: argparse.Namespace) -> None:
        # always keep implementation completely silo-ed away from CLI so CLI
        # can be fast to load and reduces dependencies
+        if not args.config and not args.providers:
+            self.parser.print_help()
+            self.parser.exit()
+
        from ._list_deps import run_stack_list_deps_command

        return run_stack_list_deps_command(args)
--- a/src/llama_stack/cli/stack/list_stacks.py
+++ b/src/llama_stack/cli/stack/list_stacks.py
@ -9,48 +9,69 @@ from pathlib import Path

 from llama_stack.cli.subcommand import Subcommand
 from llama_stack.cli.table import print_table
+from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR


 class StackListBuilds(Subcommand):
-    """List built stacks in .llama/distributions directory"""
+    """List available distributions (both built-in and custom)"""

    def __init__(self, subparsers: argparse._SubParsersAction):
        super().__init__()
        self.parser = subparsers.add_parser(
            "list",
            prog="llama stack list",
-            description="list the build stacks",
+            description="list available distributions",
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        )
        self._add_arguments()
        self.parser.set_defaults(func=self._list_stack_command)

-    def _get_distribution_dirs(self) -> dict[str, Path]:
-        """Return a dictionary of distribution names and their paths"""
-        distributions = {}
-        dist_dir = Path.home() / ".llama" / "distributions"
+    def _get_distribution_dirs(self) -> dict[str, tuple[Path, str]]:
+        """Return a dictionary of distribution names and their paths with source type
+
+        Returns:
+            dict mapping distro name to (path, source_type) where source_type is 'built-in' or 'custom'
+        """
+        distributions = {}
+
+        # Get built-in distributions from source code
+        distro_dir = Path(__file__).parent.parent.parent / "distributions"
+        if distro_dir.exists():
+            for stack_dir in distro_dir.iterdir():
+                if stack_dir.is_dir() and not stack_dir.name.startswith(".") and not stack_dir.name.startswith("__"):
+                    distributions[stack_dir.name] = (stack_dir, "built-in")
+
+        # Get custom/run distributions from ~/.llama/distributions
+        # These override built-in ones if they have the same name
+        if DISTRIBS_BASE_DIR.exists():
+            for stack_dir in DISTRIBS_BASE_DIR.iterdir():
+                if stack_dir.is_dir() and not stack_dir.name.startswith("."):
+                    # Strip only a leading "llamastack-"; occurrences elsewhere in the name are kept
+                    name = stack_dir.name.removeprefix("llamastack-")
+                    distributions[name] = (stack_dir, "custom")

-        if dist_dir.exists():
-            for stack_dir in dist_dir.iterdir():
-                if stack_dir.is_dir():
-                    distributions[stack_dir.name] = stack_dir
        return distributions

    def _list_stack_command(self, args: argparse.Namespace) -> None:
        distributions = self._get_distribution_dirs()

        if not distributions:
-            print("No stacks found in ~/.llama/distributions")
+            print("No distributions found")
            return

-        headers = ["Stack Name", "Path"]
-        headers.extend(["Build Config", "Run Config"])
+        headers = ["Stack Name", "Source", "Path", "Build Config", "Run Config"]
        rows = []
-        for name, path in distributions.items():
-            row = [name, str(path)]
+        for name, (path, source_type) in sorted(distributions.items()):
+            row = [name, source_type, str(path)]
            # Check for build and run config files
-            build_config = "Yes" if (path / f"{name}-build.yaml").exists() else "No"
-            run_config = "Yes" if (path / f"{name}-run.yaml").exists() else "No"
+            # For built-in distributions, configs are named build.yaml and run.yaml
+            # For custom distributions, configs are named {name}-build.yaml and {name}-run.yaml
+            if source_type == "built-in":
+                build_config = "Yes" if (path / "build.yaml").exists() else "No"
+                run_config = "Yes" if (path / "run.yaml").exists() else "No"
+            else:
+                build_config = "Yes" if (path / f"{name}-build.yaml").exists() else "No"
+                run_config = "Yes" if (path / f"{name}-run.yaml").exists() else "No"
            row.extend([build_config, run_config])
            rows.append(row)
        print_table(rows, headers, separate_rows=True)
--- a/src/llama_stack/cli/stack/run.py
+++ b/src/llama_stack/cli/stack/run.py
@ -253,7 +253,7 @@ class StackRun(Subcommand):
            )
            return

-        ui_dir = REPO_ROOT / "llama_stack" / "ui"
+        ui_dir = REPO_ROOT / "llama_stack_ui"
        logs_dir = Path("~/.llama/ui/logs").expanduser()
        try:
            # Create logs directory if it doesn't exist
--- a/src/llama_stack/core/library_client.py
+++ b/src/llama_stack/core/library_client.py
@ -18,14 +18,21 @@ from typing import Any, TypeVar, Union, get_args, get_origin
 import httpx
 import yaml
 from fastapi import Response as FastAPIResponse
-from llama_stack_client import (
-    NOT_GIVEN,
-    APIResponse,
-    AsyncAPIResponse,
-    AsyncLlamaStackClient,
-    AsyncStream,
-    LlamaStackClient,
-)
+
+try:
+    from llama_stack_client import (
+        NOT_GIVEN,
+        APIResponse,
+        AsyncAPIResponse,
+        AsyncLlamaStackClient,
+        AsyncStream,
+        LlamaStackClient,
+    )
+except ImportError as e:
+    raise ImportError(
+        "llama-stack-client is not installed. Please install it with `uv pip install llama-stack[client]`."
+    ) from e
+
 from pydantic import BaseModel, TypeAdapter
 from rich.console import Console
 from termcolor import cprint
--- a/src/llama_stack/core/resolver.py
+++ b/src/llama_stack/core/resolver.py
@ -397,6 +397,18 @@ async def instantiate_provider(
    impl.__provider_spec__ = provider_spec
    impl.__provider_config__ = config

+    # Apply tracing if telemetry is enabled and any base class has __marked_for_tracing__ marker
+    if run_config.telemetry.enabled:
+        traced_classes = [
+            base for base in reversed(impl.__class__.__mro__) if getattr(base, "__marked_for_tracing__", False)
+        ]
+
+        if traced_classes:
+            from llama_stack.core.telemetry.trace_protocol import trace_protocol
+
+            for cls in traced_classes:
+                trace_protocol(cls)
+
    protocols = api_protocol_map_for_compliance_check(run_config)
    additional_protocols = additional_protocols_map()
    # TODO: check compliance for special tool groups
--- a/src/llama_stack/core/routers/init.py
+++ b/src/llama_stack/core/routers/init.py
@ -45,6 +45,7 @@ async def get_routing_table_impl(
        raise ValueError(f"API {api.value} not found in router map")

    impl = api_to_tables[api.value](impls_by_provider_id, dist_registry, policy)
+
    await impl.initialize()
    return impl

@ -92,5 +93,6 @@ async def get_auto_router_impl(
        api_to_dep_impl["safety_config"] = run_config.safety

    impl = api_to_routers[api.value](routing_table, **api_to_dep_impl)
+
    await impl.initialize()
    return impl
--- a/src/llama_stack/core/routers/inference.py
+++ b/src/llama_stack/core/routers/inference.py
@ -190,7 +190,7 @@ class InferenceRouter(Inference):

        response = await provider.openai_completion(params)
        response.model = request_model_id
-        if self.telemetry_enabled:
+        if self.telemetry_enabled and response.usage is not None:
            metrics = self._construct_metrics(
                prompt_tokens=response.usage.prompt_tokens,
                completion_tokens=response.usage.completion_tokens,
@ -253,7 +253,7 @@ class InferenceRouter(Inference):
        if self.store:
            asyncio.create_task(self.store.store_chat_completion(response, params.messages))

-        if self.telemetry_enabled:
+        if self.telemetry_enabled and response.usage is not None:
            metrics = self._construct_metrics(
                prompt_tokens=response.usage.prompt_tokens,
                completion_tokens=response.usage.completion_tokens,
--- a/src/llama_stack/core/routers/tool_runtime.py
+++ b/src/llama_stack/core/routers/tool_runtime.py
@ -8,14 +8,9 @@ from typing import Any

 from llama_stack.apis.common.content_types import (
    URL,
-    InterleavedContent,
 )
 from llama_stack.apis.tools import (
    ListToolDefsResponse,
-    RAGDocument,
-    RAGQueryConfig,
-    RAGQueryResult,
-    RAGToolRuntime,
    ToolRuntime,
 )
 from llama_stack.log import get_logger
@ -26,36 +21,6 @@ logger = get_logger(name=__name__, category="core::routers")


 class ToolRuntimeRouter(ToolRuntime):
-    class RagToolImpl(RAGToolRuntime):
-        def __init__(
-            self,
-            routing_table: ToolGroupsRoutingTable,
-        ) -> None:
-            logger.debug("Initializing ToolRuntimeRouter.RagToolImpl")
-            self.routing_table = routing_table
-
-        async def query(
-            self,
-            content: InterleavedContent,
-            vector_store_ids: list[str],
-            query_config: RAGQueryConfig | None = None,
-        ) -> RAGQueryResult:
-            logger.debug(f"ToolRuntimeRouter.RagToolImpl.query: {vector_store_ids}")
-            provider = await self.routing_table.get_provider_impl("knowledge_search")
-            return await provider.query(content, vector_store_ids, query_config)
-
-        async def insert(
-            self,
-            documents: list[RAGDocument],
-            vector_store_id: str,
-            chunk_size_in_tokens: int = 512,
-        ) -> None:
-            logger.debug(
-                f"ToolRuntimeRouter.RagToolImpl.insert: {vector_store_id}, {len(documents)} documents, chunk_size={chunk_size_in_tokens}"
-            )
-            provider = await self.routing_table.get_provider_impl("insert_into_memory")
-            return await provider.insert(documents, vector_store_id, chunk_size_in_tokens)
-
    def __init__(
        self,
        routing_table: ToolGroupsRoutingTable,
@ -63,11 +28,6 @@ class ToolRuntimeRouter(ToolRuntime):
        logger.debug("Initializing ToolRuntimeRouter")
        self.routing_table = routing_table

-        # HACK ALERT this should be in sync with "get_all_api_endpoints()"
-        self.rag_tool = self.RagToolImpl(routing_table)
-        for method in ("query", "insert"):
-            setattr(self, f"rag_tool.{method}", getattr(self.rag_tool, method))
-
    async def initialize(self) -> None:
        logger.debug("ToolRuntimeRouter.initialize")
        pass
--- a/src/llama_stack/core/routers/vector_io.py
+++ b/src/llama_stack/core/routers/vector_io.py
@ -20,6 +20,8 @@ from llama_stack.apis.vector_io import (
    SearchRankingOptions,
    VectorIO,
    VectorStoreChunkingStrategy,
+    VectorStoreChunkingStrategyStatic,
+    VectorStoreChunkingStrategyStaticConfig,
    VectorStoreDeleteResponse,
    VectorStoreFileBatchObject,
    VectorStoreFileContentsResponse,
@ -167,6 +169,13 @@ class VectorIORouter(VectorIO):
        if embedding_dimension is not None:
            params.model_extra["embedding_dimension"] = embedding_dimension

+        # Set chunking strategy explicitly if not provided
+        if params.chunking_strategy is None or params.chunking_strategy.type == "auto":
+            # actualize the chunking strategy to static
+            params.chunking_strategy = VectorStoreChunkingStrategyStatic(
+                static=VectorStoreChunkingStrategyStaticConfig()
+            )
+
        return await provider.openai_create_vector_store(params)

    async def openai_list_vector_stores(
@ -283,6 +292,8 @@ class VectorIORouter(VectorIO):
        chunking_strategy: VectorStoreChunkingStrategy | None = None,
    ) -> VectorStoreFileObject:
        logger.debug(f"VectorIORouter.openai_attach_file_to_vector_store: {vector_store_id}, {file_id}")
+        if chunking_strategy is None or chunking_strategy.type == "auto":
+            chunking_strategy = VectorStoreChunkingStrategyStatic(static=VectorStoreChunkingStrategyStaticConfig())
        provider = await self.routing_table.get_provider_impl(vector_store_id)
        return await provider.openai_attach_file_to_vector_store(
            vector_store_id=vector_store_id,
--- a/src/llama_stack/core/server/routes.py
+++ b/src/llama_stack/core/server/routes.py
@ -13,7 +13,6 @@ from aiohttp import hdrs
 from starlette.routing import Route

 from llama_stack.apis.datatypes import Api, ExternalApiSpec
-from llama_stack.apis.tools import RAGToolRuntime, SpecialToolGroup
 from llama_stack.core.resolver import api_protocol_map
 from llama_stack.schema_utils import WebMethod

@ -25,33 +24,16 @@ RouteImpls = dict[str, PathImpl]
 RouteMatch = tuple[EndpointFunc, PathParams, str, WebMethod]


-def toolgroup_protocol_map():
-    return {
-        SpecialToolGroup.rag_tool: RAGToolRuntime,
-    }
-
-
 def get_all_api_routes(
    external_apis: dict[Api, ExternalApiSpec] | None = None,
 ) -> dict[Api, list[tuple[Route, WebMethod]]]:
    apis = {}

    protocols = api_protocol_map(external_apis)
-    toolgroup_protocols = toolgroup_protocol_map()
    for api, protocol in protocols.items():
        routes = []
        protocol_methods = inspect.getmembers(protocol, predicate=inspect.isfunction)

-        # HACK ALERT
-        if api == Api.tool_runtime:
-            for tool_group in SpecialToolGroup:
-                sub_protocol = toolgroup_protocols[tool_group]
-                sub_protocol_methods = inspect.getmembers(sub_protocol, predicate=inspect.isfunction)
-                for name, method in sub_protocol_methods:
-                    if not hasattr(method, "__webmethod__"):
-                        continue
-                    protocol_methods.append((f"{tool_group.value}.{name}", method))
-
        for name, method in protocol_methods:
            # Get all webmethods for this method (supports multiple decorators)
            webmethods = getattr(method, "__webmethods__", [])
--- a/src/llama_stack/core/stack.py
+++ b/src/llama_stack/core/stack.py
@ -31,7 +31,7 @@ from llama_stack.apis.safety import Safety
 from llama_stack.apis.scoring import Scoring
 from llama_stack.apis.scoring_functions import ScoringFunctions
 from llama_stack.apis.shields import Shields
-from llama_stack.apis.tools import RAGToolRuntime, ToolGroups, ToolRuntime
+from llama_stack.apis.tools import ToolGroups, ToolRuntime
 from llama_stack.apis.vector_io import VectorIO
 from llama_stack.core.conversations.conversations import ConversationServiceConfig, ConversationServiceImpl
 from llama_stack.core.datatypes import Provider, SafetyConfig, StackRunConfig, VectorStoresConfig
@ -78,7 +78,6 @@ class LlamaStack(
    Inspect,
    ToolGroups,
    ToolRuntime,
-    RAGToolRuntime,
    Files,
    Prompts,
    Conversations,
--- a/src/llama_stack/core/telemetry/telemetry.py
+++ b/src/llama_stack/core/telemetry/telemetry.py
@ -163,47 +163,6 @@ class MetricEvent(EventCommon):
    unit: str


-@json_schema_type
-class MetricInResponse(BaseModel):
-    """A metric value included in API responses.
-    :param metric: The name of the metric
-    :param value: The numeric value of the metric
-    :param unit: (Optional) The unit of measurement for the metric value
-    """
-
-    metric: str
-    value: int | float
-    unit: str | None = None
-
-
-# This is a short term solution to allow inference API to return metrics
-# The ideal way to do this is to have a way for all response types to include metrics
-# and all metric events logged to the telemetry API to be included with the response
-# To do this, we will need to augment all response types with a metrics field.
-# We have hit a blocker from stainless SDK that prevents us from doing this.
-# The blocker is that if we were to augment the response types that have a data field
-# in them like so
-# class ListModelsResponse(BaseModel):
-# metrics: Optional[List[MetricEvent]] = None
-# data: List[Models]
-# ...
-# The client SDK will need to access the data by using a .data field, which is not
-# ergonomic. Stainless SDK does support unwrapping the response type, but it
-# requires that the response type to only have a single field.
-
-# We will need a way in the client SDK to signal that the metrics are needed
-# and if they are needed, the client SDK has to return the full response type
-# without unwrapping it.
-
-
-class MetricResponseMixin(BaseModel):
-    """Mixin class for API responses that can include metrics.
-    :param metrics: (Optional) List of metrics associated with the API response
-    """
-
-    metrics: list[MetricInResponse] | None = None
-
-
@json_schema_type
 class StructuredLogType(Enum):
    """The type of structured log event payload.
@ -427,6 +386,7 @@ _GLOBAL_STORAGE: dict[str, dict[str | int, Any]] = {
    "counters": {},
    "gauges": {},
    "up_down_counters": {},
+    "histograms": {},
 }
 _global_lock = threading.Lock()
 _TRACER_PROVIDER = None
@ -540,6 +500,16 @@ class Telemetry:
            )
        return cast(metrics.ObservableGauge, _GLOBAL_STORAGE["gauges"][name])

+    def _get_or_create_histogram(self, name: str, unit: str) -> metrics.Histogram:
+        assert self.meter is not None
+        if name not in _GLOBAL_STORAGE["histograms"]:
+            _GLOBAL_STORAGE["histograms"][name] = self.meter.create_histogram(
+                name=name,
+                unit=unit,
+                description=f"Histogram for {name}",
+            )
+        return cast(metrics.Histogram, _GLOBAL_STORAGE["histograms"][name])
+
    def _log_metric(self, event: MetricEvent) -> None:
        # Add metric as an event to the current span
        try:
@ -571,7 +541,16 @@ class Telemetry:
        # Log to OpenTelemetry meter if available
        if self.meter is None:
            return
-        if isinstance(event.value, int):
+
+        # Use histograms for token-related metrics (per-request measurements)
+        # Use counters for other cumulative metrics
+        token_metrics = {"prompt_tokens", "completion_tokens", "total_tokens"}
+
+        if event.metric in token_metrics:
+            # Token metrics are per-request measurements, use histogram
+            histogram = self._get_or_create_histogram(event.metric, event.unit)
+            histogram.record(event.value, attributes=_clean_attributes(event.attributes))
+        elif isinstance(event.value, int):
            counter = self._get_or_create_counter(event.metric, event.unit)
            counter.add(event.value, attributes=_clean_attributes(event.attributes))
        elif isinstance(event.value, float):
--- a/src/llama_stack/core/telemetry/trace_protocol.py
+++ b/src/llama_stack/core/telemetry/trace_protocol.py
@ -129,6 +129,15 @@ def trace_protocol[T: type[Any]](cls: T) -> T:
        else:
            return sync_wrapper

+    # Wrap methods on the class itself (for classes applied at runtime)
+    # Skip if already wrapped (indicated by __wrapped__ attribute)
+    for name, method in vars(cls).items():
+        if inspect.isfunction(method) and not name.startswith("_"):
+            if not hasattr(method, "__wrapped__"):
+                wrapped = trace_method(method)
+                setattr(cls, name, wrapped)  # noqa: B010
+
+    # Also set up __init_subclass__ for future subclasses
    original_init_subclass = cast(Callable[..., Any] | None, getattr(cls, "__init_subclass__", None))

    def __init_subclass__(cls_child: type[Any], **kwargs: Any) -> None:  # noqa: N807
--- a/src/llama_stack/core/ui/Containerfile
+++ b/src/llama_stack/core/ui/Containerfile
@ -1,11 +0,0 @@
-# More info on playground configuration can be found here:
-# https://llama-stack.readthedocs.io/en/latest/playground
-
-FROM python:3.12-slim
-WORKDIR /app
-COPY . /app/
-RUN /usr/local/bin/python -m pip install --upgrade pip && \
-    /usr/local/bin/pip3 install -r requirements.txt
-EXPOSE 8501
-
-ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
--- a/src/llama_stack/core/ui/README.md
+++ b/src/llama_stack/core/ui/README.md
@ -1,50 +0,0 @@
-# (Experimental) LLama Stack UI
-
-## Docker Setup
-
-:warning: This is a work in progress.
-
-## Developer Setup
-
-1. Start up Llama Stack API server. More details [here](https://llamastack.github.io/latest/getting_started/index.htmll).
-
-```
-llama stack list-deps together | xargs -L1 uv pip install
-
-llama stack run together
-```
-
-2. (Optional) Register datasets and eval tasks as resources. If you want to run pre-configured evaluation flows (e.g. Evaluations (Generation + Scoring) Page).
-
-```bash
-llama-stack-client datasets register \
--dataset-id "mmlu" \
--provider-id "huggingface" \
--url "https://huggingface.co/datasets/llamastack/evals" \
--metadata '{"path": "llamastack/evals", "name": "evals__mmlu__details", "split": "train"}' \
--schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string", "chat_completion_input": {"type": "string"}}}'
-```
-
-```bash
-llama-stack-client benchmarks register \
--eval-task-id meta-reference-mmlu \
--provider-id meta-reference \
--dataset-id mmlu \
--scoring-functions basic::regex_parser_multiple_choice_answer
-```
-
-3. Start Streamlit UI
-
-```bash
-uv run --with ".[ui]" streamlit run llama_stack.core/ui/app.py
-```
-
-## Environment Variables
-
-| Environment Variable       | Description                        | Default Value             |
-|----------------------------|------------------------------------|---------------------------|
-| LLAMA_STACK_ENDPOINT       | The endpoint for the Llama Stack   | http://localhost:8321     |
-| FIREWORKS_API_KEY          | API key for Fireworks provider     | (empty string)            |
-| TOGETHER_API_KEY           | API key for Together provider      | (empty string)            |
-| SAMBANOVA_API_KEY          | API key for SambaNova provider     | (empty string)            |
-| OPENAI_API_KEY             | API key for OpenAI provider        | (empty string)            |
--- a/src/llama_stack/core/ui/app.py
+++ b/src/llama_stack/core/ui/app.py
@ -1,55 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import streamlit as st
-
-
-def main():
-    # Evaluation pages
-    application_evaluation_page = st.Page(
-        "page/evaluations/app_eval.py",
-        title="Evaluations (Scoring)",
-        icon="📊",
-        default=False,
-    )
-    native_evaluation_page = st.Page(
-        "page/evaluations/native_eval.py",
-        title="Evaluations (Generation + Scoring)",
-        icon="📊",
-        default=False,
-    )
-
-    # Playground pages
-    chat_page = st.Page("page/playground/chat.py", title="Chat", icon="💬", default=True)
-    rag_page = st.Page("page/playground/rag.py", title="RAG", icon="💬", default=False)
-    tool_page = st.Page("page/playground/tools.py", title="Tools", icon="🛠", default=False)
-
-    # Distribution pages
-    resources_page = st.Page("page/distribution/resources.py", title="Resources", icon="🔍", default=False)
-    provider_page = st.Page(
-        "page/distribution/providers.py",
-        title="API Providers",
-        icon="🔍",
-        default=False,
-    )
-
-    pg = st.navigation(
-        {
-            "Playground": [
-                chat_page,
-                rag_page,
-                tool_page,
-                application_evaluation_page,
-                native_evaluation_page,
-            ],
-            "Inspect": [provider_page, resources_page],
-        },
-        expanded=False,
-    )
-    pg.run()
-
-
-if __name__ == "__main__":
-    main()
--- a/src/llama_stack/core/ui/modules/init.py
+++ b/src/llama_stack/core/ui/modules/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/src/llama_stack/core/ui/modules/api.py
+++ b/src/llama_stack/core/ui/modules/api.py
@ -1,32 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import os
-
-from llama_stack_client import LlamaStackClient
-
-
-class LlamaStackApi:
-    def __init__(self):
-        self.client = LlamaStackClient(
-            base_url=os.environ.get("LLAMA_STACK_ENDPOINT", "http://localhost:8321"),
-            provider_data={
-                "fireworks_api_key": os.environ.get("FIREWORKS_API_KEY", ""),
-                "together_api_key": os.environ.get("TOGETHER_API_KEY", ""),
-                "sambanova_api_key": os.environ.get("SAMBANOVA_API_KEY", ""),
-                "openai_api_key": os.environ.get("OPENAI_API_KEY", ""),
-                "tavily_search_api_key": os.environ.get("TAVILY_SEARCH_API_KEY", ""),
-            },
-        )
-
-    def run_scoring(self, row, scoring_function_ids: list[str], scoring_params: dict | None):
-        """Run scoring on a single row"""
-        if not scoring_params:
-            scoring_params = dict.fromkeys(scoring_function_ids)
-        return self.client.scoring.score(input_rows=[row], scoring_functions=scoring_params)
-
-
-llama_stack_api = LlamaStackApi()
--- a/src/llama_stack/core/ui/modules/utils.py
+++ b/src/llama_stack/core/ui/modules/utils.py
@ -1,42 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import base64
-import os
-
-import pandas as pd
-import streamlit as st
-
-
-def process_dataset(file):
-    if file is None:
-        return "No file uploaded", None
-
-    try:
-        # Determine file type and read accordingly
-        file_ext = os.path.splitext(file.name)[1].lower()
-        if file_ext == ".csv":
-            df = pd.read_csv(file)
-        elif file_ext in [".xlsx", ".xls"]:
-            df = pd.read_excel(file)
-        else:
-            return "Unsupported file format. Please upload a CSV or Excel file.", None
-
-        return df
-
-    except Exception as e:
-        st.error(f"Error processing file: {str(e)}")
-        return None
-
-
-def data_url_from_file(file) -> str:
-    file_content = file.getvalue()
-    base64_content = base64.b64encode(file_content).decode("utf-8")
-    mime_type = file.type
-
-    data_url = f"data:{mime_type};base64,{base64_content}"
-
-    return data_url
--- a/src/llama_stack/core/ui/page/init.py
+++ b/src/llama_stack/core/ui/page/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/src/llama_stack/core/ui/page/distribution/init.py
+++ b/src/llama_stack/core/ui/page/distribution/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/src/llama_stack/core/ui/page/distribution/datasets.py
+++ b/src/llama_stack/core/ui/page/distribution/datasets.py
@ -1,18 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-
-def datasets():
-    st.header("Datasets")
-
-    datasets_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.datasets.list()}
-    if len(datasets_info) > 0:
-        selected_dataset = st.selectbox("Select a dataset", list(datasets_info.keys()))
-        st.json(datasets_info[selected_dataset], expanded=True)
--- a/src/llama_stack/core/ui/page/distribution/eval_tasks.py
+++ b/src/llama_stack/core/ui/page/distribution/eval_tasks.py
@ -1,20 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-
-def benchmarks():
-    # Benchmarks Section
-    st.header("Benchmarks")
-
-    benchmarks_info = {d.identifier: d.to_dict() for d in llama_stack_api.client.benchmarks.list()}
-
-    if len(benchmarks_info) > 0:
-        selected_benchmark = st.selectbox("Select an eval task", list(benchmarks_info.keys()), key="benchmark_inspect")
-        st.json(benchmarks_info[selected_benchmark], expanded=True)
--- a/src/llama_stack/core/ui/page/distribution/models.py
+++ b/src/llama_stack/core/ui/page/distribution/models.py
@ -1,18 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-
-def models():
-    # Models Section
-    st.header("Models")
-    models_info = {m.id: m.model_dump() for m in llama_stack_api.client.models.list()}
-
-    selected_model = st.selectbox("Select a model", list(models_info.keys()))
-    st.json(models_info[selected_model])
--- a/src/llama_stack/core/ui/page/distribution/providers.py
+++ b/src/llama_stack/core/ui/page/distribution/providers.py
@ -1,27 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-
-def providers():
-    st.header("🔍 API Providers")
-    apis_providers_lst = llama_stack_api.client.providers.list()
-    api_to_providers = {}
-    for api_provider in apis_providers_lst:
-        if api_provider.api in api_to_providers:
-            api_to_providers[api_provider.api].append(api_provider)
-        else:
-            api_to_providers[api_provider.api] = [api_provider]
-
-    for api in api_to_providers.keys():
-        st.markdown(f"###### {api}")
-        st.dataframe([x.to_dict() for x in api_to_providers[api]], width=500)
-
-
-providers()
--- a/src/llama_stack/core/ui/page/distribution/resources.py
+++ b/src/llama_stack/core/ui/page/distribution/resources.py
@ -1,48 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from streamlit_option_menu import option_menu
-
-from llama_stack.core.ui.page.distribution.datasets import datasets
-from llama_stack.core.ui.page.distribution.eval_tasks import benchmarks
-from llama_stack.core.ui.page.distribution.models import models
-from llama_stack.core.ui.page.distribution.scoring_functions import scoring_functions
-from llama_stack.core.ui.page.distribution.shields import shields
-
-
-def resources_page():
-    options = [
-        "Models",
-        "Shields",
-        "Scoring Functions",
-        "Datasets",
-        "Benchmarks",
-    ]
-    icons = ["magic", "shield", "file-bar-graph", "database", "list-task"]
-    selected_resource = option_menu(
-        None,
-        options,
-        icons=icons,
-        orientation="horizontal",
-        styles={
-            "nav-link": {
-                "font-size": "12px",
-            },
-        },
-    )
-    if selected_resource == "Benchmarks":
-        benchmarks()
-    elif selected_resource == "Datasets":
-        datasets()
-    elif selected_resource == "Models":
-        models()
-    elif selected_resource == "Scoring Functions":
-        scoring_functions()
-    elif selected_resource == "Shields":
-        shields()
-
-
-resources_page()
--- a/src/llama_stack/core/ui/page/distribution/scoring_functions.py
+++ b/src/llama_stack/core/ui/page/distribution/scoring_functions.py
@ -1,18 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-
-def scoring_functions():
-    st.header("Scoring Functions")
-
-    scoring_functions_info = {s.identifier: s.to_dict() for s in llama_stack_api.client.scoring_functions.list()}
-
-    selected_scoring_function = st.selectbox("Select a scoring function", list(scoring_functions_info.keys()))
-    st.json(scoring_functions_info[selected_scoring_function], expanded=True)
--- a/src/llama_stack/core/ui/page/distribution/shields.py
+++ b/src/llama_stack/core/ui/page/distribution/shields.py
@ -1,19 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-
-def shields():
-    # Shields Section
-    st.header("Shields")
-
-    shields_info = {s.identifier: s.to_dict() for s in llama_stack_api.client.shields.list()}
-
-    selected_shield = st.selectbox("Select a shield", list(shields_info.keys()))
-    st.json(shields_info[selected_shield])
--- a/src/llama_stack/core/ui/page/evaluations/init.py
+++ b/src/llama_stack/core/ui/page/evaluations/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/src/llama_stack/core/ui/page/evaluations/app_eval.py
+++ b/src/llama_stack/core/ui/page/evaluations/app_eval.py
@ -1,143 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import json
-
-import pandas as pd
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-from llama_stack.core.ui.modules.utils import process_dataset
-
-
-def application_evaluation_page():
-    st.set_page_config(page_title="Evaluations (Scoring)", page_icon="🦙")
-    st.title("📊 Evaluations (Scoring)")
-
-    # File uploader
-    uploaded_file = st.file_uploader("Upload Dataset", type=["csv", "xlsx", "xls"])
-
-    if uploaded_file is None:
-        st.error("No file uploaded")
-        return
-
-    # Process uploaded file
-    df = process_dataset(uploaded_file)
-    if df is None:
-        st.error("Error processing file")
-        return
-
-    # Display dataset information
-    st.success("Dataset loaded successfully!")
-
-    # Display dataframe preview
-    st.subheader("Dataset Preview")
-    st.dataframe(df)
-
-    # Select Scoring Functions to Run Evaluation On
-    st.subheader("Select Scoring Functions")
-    scoring_functions = llama_stack_api.client.scoring_functions.list()
-    scoring_functions = {sf.identifier: sf for sf in scoring_functions}
-    scoring_functions_names = list(scoring_functions.keys())
-    selected_scoring_functions = st.multiselect(
-        "Choose one or more scoring functions",
-        options=scoring_functions_names,
-        help="Choose one or more scoring functions.",
-    )
-
-    available_models = llama_stack_api.client.models.list()
-    available_models = [m.identifier for m in available_models]
-
-    scoring_params = {}
-    if selected_scoring_functions:
-        st.write("Selected:")
-        for scoring_fn_id in selected_scoring_functions:
-            scoring_fn = scoring_functions[scoring_fn_id]
-            st.write(f"- **{scoring_fn_id}**: {scoring_fn.description}")
-            new_params = None
-            if scoring_fn.params:
-                new_params = {}
-                for param_name, param_value in scoring_fn.params.to_dict().items():
-                    if param_name == "type":
-                        new_params[param_name] = param_value
-                        continue
-
-                    if param_name == "judge_model":
-                        value = st.selectbox(
-                            f"Select **{param_name}** for {scoring_fn_id}",
-                            options=available_models,
-                            index=0,
-                            key=f"{scoring_fn_id}_{param_name}",
-                        )
-                        new_params[param_name] = value
-                    else:
-                        value = st.text_area(
-                            f"Enter value for **{param_name}** in {scoring_fn_id} in valid JSON format",
-                            value=json.dumps(param_value, indent=2),
-                            height=80,
-                        )
-                        try:
-                            new_params[param_name] = json.loads(value)
-                        except json.JSONDecodeError:
-                            st.error(f"Invalid JSON for **{param_name}** in {scoring_fn_id}")
-
-                st.json(new_params)
-            scoring_params[scoring_fn_id] = new_params
-
-        # Add run evaluation button & slider
-        total_rows = len(df)
-        num_rows = st.slider("Number of rows to evaluate", 1, total_rows, total_rows)
-
-        if st.button("Run Evaluation"):
-            progress_text = "Running evaluation..."
-            progress_bar = st.progress(0, text=progress_text)
-            rows = df.to_dict(orient="records")
-            if num_rows < total_rows:
-                rows = rows[:num_rows]
-
-            # Create separate containers for progress text and results
-            progress_text_container = st.empty()
-            results_container = st.empty()
-            output_res = {}
-            for i, r in enumerate(rows):
-                # Update progress
-                progress = i / len(rows)
-                progress_bar.progress(progress, text=progress_text)
-
-                # Run evaluation for current row
-                score_res = llama_stack_api.run_scoring(
-                    r,
-                    scoring_function_ids=selected_scoring_functions,
-                    scoring_params=scoring_params,
-                )
-
-                for k in r.keys():
-                    if k not in output_res:
-                        output_res[k] = []
-                    output_res[k].append(r[k])
-
-                for fn_id in selected_scoring_functions:
-                    if fn_id not in output_res:
-                        output_res[fn_id] = []
-                    output_res[fn_id].append(score_res.results[fn_id].score_rows[0])
-
-                # Display current row results using separate containers
-                progress_text_container.write(f"Expand to see current processed result ({i + 1} / {len(rows)})")
-                results_container.json(
-                    score_res.to_json(),
-                    expanded=2,
-                )
-
-            progress_bar.progress(1.0, text="Evaluation complete!")
-
-            # Display results in dataframe
-            if output_res:
-                output_df = pd.DataFrame(output_res)
-                st.subheader("Evaluation Results")
-                st.dataframe(output_df)
-
-
-application_evaluation_page()
--- a/src/llama_stack/core/ui/page/evaluations/native_eval.py
+++ b/src/llama_stack/core/ui/page/evaluations/native_eval.py
@ -1,253 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import json
-
-import pandas as pd
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-
-def select_benchmark_1():
-    # Select Benchmarks
-    st.subheader("1. Choose An Eval Task")
-    benchmarks = llama_stack_api.client.benchmarks.list()
-    benchmarks = {et.identifier: et for et in benchmarks}
-    benchmarks_names = list(benchmarks.keys())
-    selected_benchmark = st.selectbox(
-        "Choose an eval task.",
-        options=benchmarks_names,
-        help="Choose an eval task. Each eval task is parameterized by a dataset, and list of scoring functions.",
-    )
-    with st.expander("View Eval Task"):
-        st.json(benchmarks[selected_benchmark], expanded=True)
-
-    st.session_state["selected_benchmark"] = selected_benchmark
-    st.session_state["benchmarks"] = benchmarks
-    if st.button("Confirm", key="confirm_1"):
-        st.session_state["selected_benchmark_1_next"] = True
-
-
-def define_eval_candidate_2():
-    if not st.session_state.get("selected_benchmark_1_next", None):
-        return
-
-    st.subheader("2. Define Eval Candidate")
-    st.info(
-        """
-        Define the configurations for the evaluation candidate model or agent used for generation.
-        Select "model" if you want to run generation with inference API, or "agent" if you want to run generation with agent API through specifying AgentConfig.
-        """
-    )
-    with st.expander("Define Eval Candidate", expanded=True):
-        # Define Eval Candidate
-        candidate_type = st.radio("Candidate Type", ["model", "agent"])
-
-        available_models = llama_stack_api.client.models.list()
-        available_models = [model.identifier for model in available_models]
-        selected_model = st.selectbox(
-            "Choose a model",
-            available_models,
-            index=0,
-        )
-
-        # Sampling Parameters
-        st.markdown("##### Sampling Parameters")
-        temperature = st.slider(
-            "Temperature",
-            min_value=0.0,
-            max_value=1.0,
-            value=0.0,
-            step=0.1,
-            help="Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable",
-        )
-        top_p = st.slider(
-            "Top P",
-            min_value=0.0,
-            max_value=1.0,
-            value=0.95,
-            step=0.1,
-        )
-        max_tokens = st.slider(
-            "Max Tokens",
-            min_value=0,
-            max_value=4096,
-            value=512,
-            step=1,
-            help="The maximum number of tokens to generate",
-        )
-        repetition_penalty = st.slider(
-            "Repetition Penalty",
-            min_value=1.0,
-            max_value=2.0,
-            value=1.0,
-            step=0.1,
-            help="Controls the likelihood for generating the same word or phrase multiple times in the same sentence or paragraph. 1 implies no penalty, 2 will strongly discourage model to repeat words or phrases.",
-        )
-        if candidate_type == "model":
-            if temperature > 0.0:
-                strategy = {
-                    "type": "top_p",
-                    "temperature": temperature,
-                    "top_p": top_p,
-                }
-            else:
-                strategy = {"type": "greedy"}
-
-            eval_candidate = {
-                "type": "model",
-                "model": selected_model,
-                "sampling_params": {
-                    "strategy": strategy,
-                    "max_tokens": max_tokens,
-                    "repetition_penalty": repetition_penalty,
-                },
-            }
-        elif candidate_type == "agent":
-            system_prompt = st.text_area(
-                "System Prompt",
-                value="You are a helpful AI assistant.",
-                help="Initial instructions given to the AI to set its behavior and context",
-            )
-            tools_json = st.text_area(
-                "Tools Configuration (JSON)",
-                value=json.dumps(
-                    [
-                        {
-                            "type": "brave_search",
-                            "engine": "brave",
-                            "api_key": "ENTER_BRAVE_API_KEY_HERE",
-                        }
-                    ]
-                ),
-                help="Enter tool configurations in JSON format. Each tool should have a name, description, and parameters.",
-                height=200,
-            )
-            try:
-                tools = json.loads(tools_json)
-            except json.JSONDecodeError:
-                st.error("Invalid JSON format for tools configuration")
-                tools = []
-            eval_candidate = {
-                "type": "agent",
-                "config": {
-                    "model": selected_model,
-                    "instructions": system_prompt,
-                    "tools": tools,
-                    "tool_choice": "auto",
-                    "tool_prompt_format": "json",
-                    "input_shields": [],
-                    "output_shields": [],
-                    "enable_session_persistence": False,
-                },
-            }
-        st.session_state["eval_candidate"] = eval_candidate
-
-    if st.button("Confirm", key="confirm_2"):
-        st.session_state["selected_eval_candidate_2_next"] = True
-
-
-def run_evaluation_3():
-    if not st.session_state.get("selected_eval_candidate_2_next", None):
-        return
-
-    st.subheader("3. Run Evaluation")
-    # Add info box to explain configurations being used
-    st.info(
-        """
-        Review the configurations that will be used for this evaluation run, make any necessary changes, and then click the "Run Evaluation" button.
-        """
-    )
-    selected_benchmark = st.session_state["selected_benchmark"]
-    benchmarks = st.session_state["benchmarks"]
-    eval_candidate = st.session_state["eval_candidate"]
-
-    dataset_id = benchmarks[selected_benchmark].dataset_id
-    rows = llama_stack_api.client.datasets.iterrows(
-        dataset_id=dataset_id,
-    )
-    total_rows = len(rows.data)
-    # Add number of examples control
-    num_rows = st.number_input(
-        "Number of Examples to Evaluate",
-        min_value=1,
-        max_value=total_rows,
-        value=5,
-        help="Number of examples from the dataset to evaluate. ",
-    )
-
-    benchmark_config = {
-        "type": "benchmark",
-        "eval_candidate": eval_candidate,
-        "scoring_params": {},
-    }
-
-    with st.expander("View Evaluation Task", expanded=True):
-        st.json(benchmarks[selected_benchmark], expanded=True)
-    with st.expander("View Evaluation Task Configuration", expanded=True):
-        st.json(benchmark_config, expanded=True)
-
-    # Add run button and handle evaluation
-    if st.button("Run Evaluation"):
-        progress_text = "Running evaluation..."
-        progress_bar = st.progress(0, text=progress_text)
-        rows = rows.data
-        if num_rows < total_rows:
-            rows = rows[:num_rows]
-
-        # Create separate containers for progress text and results
-        progress_text_container = st.empty()
-        results_container = st.empty()
-        output_res = {}
-        for i, r in enumerate(rows):
-            # Update progress
-            progress = i / len(rows)
-            progress_bar.progress(progress, text=progress_text)
-            # Run evaluation for current row
-            eval_res = llama_stack_api.client.eval.evaluate_rows(
-                benchmark_id=selected_benchmark,
-                input_rows=[r],
-                scoring_functions=benchmarks[selected_benchmark].scoring_functions,
-                benchmark_config=benchmark_config,
-            )
-
-            for k in r.keys():
-                if k not in output_res:
-                    output_res[k] = []
-                output_res[k].append(r[k])
-
-            for k in eval_res.generations[0].keys():
-                if k not in output_res:
-                    output_res[k] = []
-                output_res[k].append(eval_res.generations[0][k])
-
-            for scoring_fn in benchmarks[selected_benchmark].scoring_functions:
-                if scoring_fn not in output_res:
-                    output_res[scoring_fn] = []
-                output_res[scoring_fn].append(eval_res.scores[scoring_fn].score_rows[0])
-
-            progress_text_container.write(f"Expand to see current processed result ({i + 1} / {len(rows)})")
-            results_container.json(eval_res, expanded=2)
-
-        progress_bar.progress(1.0, text="Evaluation complete!")
-        # Display results in dataframe
-        if output_res:
-            output_df = pd.DataFrame(output_res)
-            st.subheader("Evaluation Results")
-            st.dataframe(output_df)
-
-
-def native_evaluation_page():
-    st.set_page_config(page_title="Evaluations (Generation + Scoring)", page_icon="🦙")
-    st.title("📊 Evaluations (Generation + Scoring)")
-
-    select_benchmark_1()
-    define_eval_candidate_2()
-    run_evaluation_3()
-
-
-native_evaluation_page()
--- a/src/llama_stack/core/ui/page/playground/init.py
+++ b/src/llama_stack/core/ui/page/playground/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/src/llama_stack/core/ui/page/playground/chat.py
+++ b/src/llama_stack/core/ui/page/playground/chat.py
@ -1,134 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import streamlit as st
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-# Sidebar configurations
-with st.sidebar:
-    st.header("Configuration")
-    available_models = llama_stack_api.client.models.list()
-    available_models = [
-        model.id
-        for model in available_models
-        if model.custom_metadata and model.custom_metadata.get("model_type") == "llm"
-    ]
-    selected_model = st.selectbox(
-        "Choose a model",
-        available_models,
-        index=0,
-    )
-
-    temperature = st.slider(
-        "Temperature",
-        min_value=0.0,
-        max_value=1.0,
-        value=0.0,
-        step=0.1,
-        help="Controls the randomness of the response. Higher values make the output more creative and unexpected, lower values make it more conservative and predictable",
-    )
-
-    top_p = st.slider(
-        "Top P",
-        min_value=0.0,
-        max_value=1.0,
-        value=0.95,
-        step=0.1,
-    )
-
-    max_tokens = st.slider(
-        "Max Tokens",
-        min_value=0,
-        max_value=4096,
-        value=512,
-        step=1,
-        help="The maximum number of tokens to generate",
-    )
-
-    repetition_penalty = st.slider(
-        "Repetition Penalty",
-        min_value=1.0,
-        max_value=2.0,
-        value=1.0,
-        step=0.1,
-        help="Controls the likelihood for generating the same word or phrase multiple times in the same sentence or paragraph. 1 implies no penalty, 2 will strongly discourage model to repeat words or phrases.",
-    )
-
-    stream = st.checkbox("Stream", value=True)
-    system_prompt = st.text_area(
-        "System Prompt",
-        value="You are a helpful AI assistant.",
-        help="Initial instructions given to the AI to set its behavior and context",
-    )
-
-    # Add clear chat button to sidebar
-    if st.button("Clear Chat", use_container_width=True):
-        st.session_state.messages = []
-        st.rerun()
-
-
-# Main chat interface
-st.title("🦙 Chat")
-
-
-# Initialize chat history
-if "messages" not in st.session_state:
-    st.session_state.messages = []
-
-# Display chat messages
-for message in st.session_state.messages:
-    with st.chat_message(message["role"]):
-        st.markdown(message["content"])
-
-# Chat input
-if prompt := st.chat_input("Example: What is Llama Stack?"):
-    # Add user message to chat history
-    st.session_state.messages.append({"role": "user", "content": prompt})
-
-    # Display user message
-    with st.chat_message("user"):
-        st.markdown(prompt)
-
-    # Display assistant response
-    with st.chat_message("assistant"):
-        message_placeholder = st.empty()
-        full_response = ""
-
-        if temperature > 0.0:
-            strategy = {
-                "type": "top_p",
-                "temperature": temperature,
-                "top_p": top_p,
-            }
-        else:
-            strategy = {"type": "greedy"}
-
-        response = llama_stack_api.client.inference.chat_completion(
-            messages=[
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": prompt},
-            ],
-            model_id=selected_model,
-            stream=stream,
-            sampling_params={
-                "strategy": strategy,
-                "max_tokens": max_tokens,
-                "repetition_penalty": repetition_penalty,
-            },
-        )
-
-        if stream:
-            for chunk in response:
-                if chunk.event.event_type == "progress":
-                    full_response += chunk.event.delta.text
-                message_placeholder.markdown(full_response + "▌")
-            message_placeholder.markdown(full_response)
-        else:
-            full_response = response.completion_message.content
-            message_placeholder.markdown(full_response)
-
-        st.session_state.messages.append({"role": "assistant", "content": full_response})
--- a/src/llama_stack/core/ui/page/playground/tools.py
+++ b/src/llama_stack/core/ui/page/playground/tools.py
@ -1,352 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import enum
-import json
-import uuid
-
-import streamlit as st
-from llama_stack_client import Agent
-from llama_stack_client.lib.agents.react.agent import ReActAgent
-from llama_stack_client.lib.agents.react.tool_parser import ReActOutput
-
-from llama_stack.core.ui.modules.api import llama_stack_api
-
-
-class AgentType(enum.Enum):
-    REGULAR = "Regular"
-    REACT = "ReAct"
-
-
-def tool_chat_page():
-    st.title("🛠 Tools")
-
-    client = llama_stack_api.client
-    models = client.models.list()
-    model_list = [model.identifier for model in models if model.api_model_type == "llm"]
-
-    tool_groups = client.toolgroups.list()
-    tool_groups_list = [tool_group.identifier for tool_group in tool_groups]
-    mcp_tools_list = [tool for tool in tool_groups_list if tool.startswith("mcp::")]
-    builtin_tools_list = [tool for tool in tool_groups_list if not tool.startswith("mcp::")]
-    selected_vector_stores = []
-
-    def reset_agent():
-        st.session_state.clear()
-        st.cache_resource.clear()
-
-    with st.sidebar:
-        st.title("Configuration")
-        st.subheader("Model")
-        model = st.selectbox(label="Model", options=model_list, on_change=reset_agent, label_visibility="collapsed")
-
-        st.subheader("Available ToolGroups")
-
-        toolgroup_selection = st.pills(
-            label="Built-in tools",
-            options=builtin_tools_list,
-            selection_mode="multi",
-            on_change=reset_agent,
-            format_func=lambda tool: "".join(tool.split("::")[1:]),
-            help="List of built-in tools from your llama stack server.",
-        )
-
-        if "builtin::rag" in toolgroup_selection:
-            vector_stores = llama_stack_api.client.vector_stores.list() or []
-            if not vector_stores:
-                st.info("No vector databases available for selection.")
-            vector_stores = [vector_store.identifier for vector_store in vector_stores]
-            selected_vector_stores = st.multiselect(
-                label="Select Document Collections to use in RAG queries",
-                options=vector_stores,
-                on_change=reset_agent,
-            )
-
-        mcp_selection = st.pills(
-            label="MCP Servers",
-            options=mcp_tools_list,
-            selection_mode="multi",
-            on_change=reset_agent,
-            format_func=lambda tool: "".join(tool.split("::")[1:]),
-            help="List of MCP servers registered to your llama stack server.",
-        )
-
-        toolgroup_selection.extend(mcp_selection)
-
-        grouped_tools = {}
-        total_tools = 0
-
-        for toolgroup_id in toolgroup_selection:
-            tools = client.tools.list(toolgroup_id=toolgroup_id)
-            grouped_tools[toolgroup_id] = [tool.name for tool in tools]
-            total_tools += len(tools)
-
-        st.markdown(f"Active Tools: 🛠 {total_tools}")
-
-        for group_id, tools in grouped_tools.items():
-            with st.expander(f"🔧 Tools from `{group_id}`"):
-                for idx, tool in enumerate(tools, start=1):
-                    st.markdown(f"{idx}. `{tool.split(':')[-1]}`")
-
-        st.subheader("Agent Configurations")
-        st.subheader("Agent Type")
-        agent_type = st.radio(
-            label="Select Agent Type",
-            options=["Regular", "ReAct"],
-            on_change=reset_agent,
-        )
-
-        if agent_type == "ReAct":
-            agent_type = AgentType.REACT
-        else:
-            agent_type = AgentType.REGULAR
-
-        max_tokens = st.slider(
-            "Max Tokens",
-            min_value=0,
-            max_value=4096,
-            value=512,
-            step=64,
-            help="The maximum number of tokens to generate",
-            on_change=reset_agent,
-        )
-
-    for i, tool_name in enumerate(toolgroup_selection):
-        if tool_name == "builtin::rag":
-            tool_dict = dict(
-                name="builtin::rag",
-                args={
-                    "vector_store_ids": list(selected_vector_stores),
-                },
-            )
-            toolgroup_selection[i] = tool_dict
-
-    @st.cache_resource
-    def create_agent():
-        if "agent_type" in st.session_state and st.session_state.agent_type == AgentType.REACT:
-            return ReActAgent(
-                client=client,
-                model=model,
-                tools=toolgroup_selection,
-                response_format={
-                    "type": "json_schema",
-                    "json_schema": ReActOutput.model_json_schema(),
-                },
-                sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens},
-            )
-        else:
-            return Agent(
-                client,
-                model=model,
-                instructions="You are a helpful assistant. When you use a tool always respond with a summary of the result.",
-                tools=toolgroup_selection,
-                sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens},
-            )
-
-    st.session_state.agent_type = agent_type
-
-    agent = create_agent()
-
-    if "agent_session_id" not in st.session_state:
-        st.session_state["agent_session_id"] = agent.create_session(session_name=f"tool_demo_{uuid.uuid4()}")
-
-    session_id = st.session_state["agent_session_id"]
-
-    if "messages" not in st.session_state:
-        st.session_state["messages"] = [{"role": "assistant", "content": "How can I help you?"}]
-
-    for msg in st.session_state.messages:
-        with st.chat_message(msg["role"]):
-            st.markdown(msg["content"])
-
-    if prompt := st.chat_input(placeholder=""):
-        with st.chat_message("user"):
-            st.markdown(prompt)
-
-        st.session_state.messages.append({"role": "user", "content": prompt})
-
-        turn_response = agent.create_turn(
-            session_id=session_id,
-            messages=[{"role": "user", "content": prompt}],
-            stream=True,
-        )
-
-        def response_generator(turn_response):
-            if st.session_state.get("agent_type") == AgentType.REACT:
-                return _handle_react_response(turn_response)
-            else:
-                return _handle_regular_response(turn_response)
-
-        def _handle_react_response(turn_response):
-            current_step_content = ""
-            final_answer = None
-            tool_results = []
-
-            for response in turn_response:
-                if not hasattr(response.event, "payload"):
-                    yield (
-                        "\n\n🚨 :red[_Llama Stack server Error:_]\n"
-                        "The response received is missing an expected `payload` attribute.\n"
-                        "This could indicate a malformed response or an internal issue within the server.\n\n"
-                        f"Error details: {response}"
-                    )
-                    return
-
-                payload = response.event.payload
-
-                if payload.event_type == "step_progress" and hasattr(payload.delta, "text"):
-                    current_step_content += payload.delta.text
-                    continue
-
-                if payload.event_type == "step_complete":
-                    step_details = payload.step_details
-
-                    if step_details.step_type == "inference":
-                        yield from _process_inference_step(current_step_content, tool_results, final_answer)
-                        current_step_content = ""
-                    elif step_details.step_type == "tool_execution":
-                        tool_results = _process_tool_execution(step_details, tool_results)
-                        current_step_content = ""
-                    else:
-                        current_step_content = ""
-
-            if not final_answer and tool_results:
-                yield from _format_tool_results_summary(tool_results)
-
-        def _process_inference_step(current_step_content, tool_results, final_answer):
-            try:
-                react_output_data = json.loads(current_step_content)
-                thought = react_output_data.get("thought")
-                action = react_output_data.get("action")
-                answer = react_output_data.get("answer")
-
-                if answer and answer != "null" and answer is not None:
-                    final_answer = answer
-
-                if thought:
-                    with st.expander("🤔 Thinking...", expanded=False):
-                        st.markdown(f":grey[__{thought}__]")
-
-                if action and isinstance(action, dict):
-                    tool_name = action.get("tool_name")
-                    tool_params = action.get("tool_params")
-                    with st.expander(f'🛠 Action: Using tool "{tool_name}"', expanded=False):
-                        st.json(tool_params)
-
-                if answer and answer != "null" and answer is not None:
-                    yield f"\n\n✅ **Final Answer:**\n{answer}"
-
-            except json.JSONDecodeError:
-                yield f"\n\nFailed to parse ReAct step content:\n```json\n{current_step_content}\n```"
-            except Exception as e:
-                yield f"\n\nFailed to process ReAct step: {e}\n```json\n{current_step_content}\n```"
-
-            return final_answer
-
-        def _process_tool_execution(step_details, tool_results):
-            try:
-                if hasattr(step_details, "tool_responses") and step_details.tool_responses:
-                    for tool_response in step_details.tool_responses:
-                        tool_name = tool_response.tool_name
-                        content = tool_response.content
-                        tool_results.append((tool_name, content))
-                        with st.expander(f'⚙️ Observation (Result from "{tool_name}")', expanded=False):
-                            try:
-                                parsed_content = json.loads(content)
-                                st.json(parsed_content)
-                            except json.JSONDecodeError:
-                                st.code(content, language=None)
-                else:
-                    with st.expander("⚙️ Observation", expanded=False):
-                        st.markdown(":grey[_Tool execution step completed, but no response data found._]")
-            except Exception as e:
-                with st.expander("⚙️ Error in Tool Execution", expanded=False):
-                    st.markdown(f":red[_Error processing tool execution: {str(e)}_]")
-
-            return tool_results
-
-        def _format_tool_results_summary(tool_results):
-            yield "\n\n**Here's what I found:**\n"
-            for tool_name, content in tool_results:
-                try:
-                    parsed_content = json.loads(content)
-
-                    if tool_name == "web_search" and "top_k" in parsed_content:
-                        yield from _format_web_search_results(parsed_content)
-                    elif "results" in parsed_content and isinstance(parsed_content["results"], list):
-                        yield from _format_results_list(parsed_content["results"])
-                    elif isinstance(parsed_content, dict) and len(parsed_content) > 0:
-                        yield from _format_dict_results(parsed_content)
-                    elif isinstance(parsed_content, list) and len(parsed_content) > 0:
-                        yield from _format_list_results(parsed_content)
-                except json.JSONDecodeError:
-                    yield f"\n**{tool_name}** was used but returned complex data. Check the observation for details.\n"
-                except (TypeError, AttributeError, KeyError, IndexError) as e:
-                    print(f"Error processing {tool_name} result: {type(e).__name__}: {e}")
-
-        def _format_web_search_results(parsed_content):
-            for i, result in enumerate(parsed_content["top_k"], 1):
-                if i <= 3:
-                    title = result.get("title", "Untitled")
-                    url = result.get("url", "")
-                    content_text = result.get("content", "").strip()
-                    yield f"\n- **{title}**\n  {content_text}\n  [Source]({url})\n"
-
-        def _format_results_list(results):
-            for i, result in enumerate(results, 1):
-                if i <= 3:
-                    if isinstance(result, dict):
-                        name = result.get("name", result.get("title", "Result " + str(i)))
-                        description = result.get("description", result.get("content", result.get("summary", "")))
-                        yield f"\n- **{name}**\n  {description}\n"
-                    else:
-                        yield f"\n- {result}\n"
-
-        def _format_dict_results(parsed_content):
-            yield "\n```\n"
-            for key, value in list(parsed_content.items())[:5]:
-                if isinstance(value, str) and len(value) < 100:
-                    yield f"{key}: {value}\n"
-                else:
-                    yield f"{key}: [Complex data]\n"
-            yield "```\n"
-
-        def _format_list_results(parsed_content):
-            yield "\n"
-            for _, item in enumerate(parsed_content[:3], 1):
-                if isinstance(item, str):
-                    yield f"- {item}\n"
-                elif isinstance(item, dict) and "text" in item:
-                    yield f"- {item['text']}\n"
-                elif isinstance(item, dict) and len(item) > 0:
-                    first_value = next(iter(item.values()))
-                    if isinstance(first_value, str) and len(first_value) < 100:
-                        yield f"- {first_value}\n"
-
-        def _handle_regular_response(turn_response):
-            for response in turn_response:
-                if hasattr(response.event, "payload"):
-                    print(response.event.payload)
-                    if response.event.payload.event_type == "step_progress":
-                        if hasattr(response.event.payload.delta, "text"):
-                            yield response.event.payload.delta.text
-                    if response.event.payload.event_type == "step_complete":
-                        if response.event.payload.step_details.step_type == "tool_execution":
-                            if response.event.payload.step_details.tool_calls:
-                                tool_name = str(response.event.payload.step_details.tool_calls[0].tool_name)
-                                yield f'\n\n🛠 :grey[_Using "{tool_name}" tool:_]\n\n'
-                            else:
-                                yield "No tool_calls present in step_details"
-                else:
-                    yield f"Error occurred in the Llama Stack Cluster: {response}"
-
-        with st.chat_message("assistant"):
-            response_content = st.write_stream(response_generator(turn_response))
-
-        st.session_state.messages.append({"role": "assistant", "content": response_content})
-
-
-tool_chat_page()
--- a/src/llama_stack/core/ui/requirements.txt
+++ b/src/llama_stack/core/ui/requirements.txt
@ -1,5 +0,0 @@
-llama-stack>=0.2.1
-llama-stack-client>=0.2.1
-pandas
-streamlit
-streamlit-option-menu
--- a/src/llama_stack/core/utils/config_resolution.py
+++ b/src/llama_stack/core/utils/config_resolution.py
@ -52,7 +52,17 @@ def resolve_config_or_distro(
            logger.debug(f"Using distribution: {distro_config}")
            return distro_config

-    # Strategy 3: Try as built distribution name
+    # Strategy 3: Try as "<distro>::<config>", e.g. starter::run-with-postgres-store.yaml
+    # (:: is used instead of a slash to avoid confusion with a filesystem path).
+    # Split on the first :: only, so a stray extra :: falls through instead of breaking the unpack.
+    if "::" in config_or_distro:
+        distro_name, config_name = config_or_distro.split("::", 1)
+        distro_config = _get_distro_config_path(distro_name, config_name)
+        if distro_config.exists():
+            logger.info(f"Using distribution: {distro_config}")
+            return distro_config
+
+    # Strategy 4: Try as built distribution name
    distrib_config = DISTRIBS_BASE_DIR / f"llamastack-{config_or_distro}" / f"{config_or_distro}-{mode}.yaml"
    if distrib_config.exists():
        logger.debug(f"Using built distribution: {distrib_config}")
@ -63,13 +73,15 @@ def resolve_config_or_distro(
        logger.debug(f"Using built distribution: {distrib_config}")
        return distrib_config

-    # Strategy 4: Failed - provide helpful error
+    # Strategy 5: Failed - provide helpful error
    raise ValueError(_format_resolution_error(config_or_distro, mode))


-def _get_distro_config_path(distro_name: str, mode: Mode) -> Path:
+def _get_distro_config_path(distro_name: str, mode: str) -> Path:
    """Get the config file path for a distro."""
-    return DISTRO_DIR / distro_name / f"{mode}.yaml"
+    if not mode.endswith(".yaml"):
+        mode = f"{mode}.yaml"
+    return DISTRO_DIR / distro_name / mode


 def _format_resolution_error(config_or_distro: str, mode: Mode) -> str:
--- a/src/llama_stack/core/utils/exec.py
+++ b/src/llama_stack/core/utils/exec.py
@ -84,6 +84,15 @@ def run_command(command: list[str]) -> int:
            text=True,
            check=False,
        )
+
+        # Print stdout and stderr if command failed
+        if result.returncode != 0:
+            log.error(f"Command {' '.join(command)} failed with returncode {result.returncode}")
+            if result.stdout:
+                log.error(f"STDOUT: {result.stdout}")
+            if result.stderr:
+                log.error(f"STDERR: {result.stderr}")
+
        return result.returncode
    except subprocess.SubprocessError as e:
        log.error(f"Subprocess error: {e}")
--- a/src/llama_stack/distributions/ci-tests/build.yaml
+++ b/src/llama_stack/distributions/ci-tests/build.yaml
@ -56,4 +56,5 @@ image_type: venv
 additional_pip_packages:
 - aiosqlite
 - asyncpg
+- psycopg2-binary
 - sqlalchemy[asyncio]
--- a/src/llama_stack/distributions/ci-tests/ci_tests.py
+++ b/src/llama_stack/distributions/ci-tests/ci_tests.py
@ -13,5 +13,6 @@ from ..starter.starter import get_distribution_template as get_starter_distribut
 def get_distribution_template() -> DistributionTemplate:
    template = get_starter_distribution_template(name="ci-tests")
    template.description = "CI tests for Llama Stack"
+    template.run_configs.pop("run-with-postgres-store.yaml", None)

    return template
--- a/src/llama_stack/distributions/ci-tests/run.yaml
+++ b/src/llama_stack/distributions/ci-tests/run.yaml
@ -46,6 +46,9 @@ providers:
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
+    config:
+      api_key: ${env.AWS_BEDROCK_API_KEY:=}
+      region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:
--- a/src/llama_stack/distributions/postgres-demo/init.py
+++ b/src/llama_stack/distributions/postgres-demo/init.py
@ -1,7 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .postgres_demo import get_distribution_template  # noqa: F401
--- a/src/llama_stack/distributions/postgres-demo/build.yaml
+++ b/src/llama_stack/distributions/postgres-demo/build.yaml
@ -1,23 +0,0 @@
-version: 2
-distribution_spec:
-  description: Quick start template for running Llama Stack with several popular providers
-  providers:
-    inference:
-    - provider_type: remote::vllm
-    - provider_type: inline::sentence-transformers
-    vector_io:
-    - provider_type: remote::chromadb
-    safety:
-    - provider_type: inline::llama-guard
-    agents:
-    - provider_type: inline::meta-reference
-    tool_runtime:
-    - provider_type: remote::brave-search
-    - provider_type: remote::tavily-search
-    - provider_type: inline::rag-runtime
-    - provider_type: remote::model-context-protocol
-image_type: venv
-additional_pip_packages:
- asyncpg
- psycopg2-binary
- sqlalchemy[asyncio]
--- a/src/llama_stack/distributions/postgres-demo/postgres_demo.py
+++ b/src/llama_stack/distributions/postgres-demo/postgres_demo.py
@ -1,125 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from llama_stack.apis.models import ModelType
-from llama_stack.core.datatypes import (
-    BuildProvider,
-    ModelInput,
-    Provider,
-    ShieldInput,
-    ToolGroupInput,
-)
-from llama_stack.distributions.template import (
-    DistributionTemplate,
-    RunConfigSettings,
-)
-from llama_stack.providers.inline.inference.sentence_transformers import SentenceTransformersInferenceConfig
-from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig
-from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig
-from llama_stack.providers.utils.kvstore.config import PostgresKVStoreConfig
-from llama_stack.providers.utils.sqlstore.sqlstore import PostgresSqlStoreConfig
-
-
-def get_distribution_template() -> DistributionTemplate:
-    inference_providers = [
-        Provider(
-            provider_id="vllm-inference",
-            provider_type="remote::vllm",
-            config=VLLMInferenceAdapterConfig.sample_run_config(
-                url="${env.VLLM_URL:=http://localhost:8000/v1}",
-            ),
-        ),
-    ]
-    providers = {
-        "inference": [
-            BuildProvider(provider_type="remote::vllm"),
-            BuildProvider(provider_type="inline::sentence-transformers"),
-        ],
-        "vector_io": [BuildProvider(provider_type="remote::chromadb")],
-        "safety": [BuildProvider(provider_type="inline::llama-guard")],
-        "agents": [BuildProvider(provider_type="inline::meta-reference")],
-        "tool_runtime": [
-            BuildProvider(provider_type="remote::brave-search"),
-            BuildProvider(provider_type="remote::tavily-search"),
-            BuildProvider(provider_type="inline::rag-runtime"),
-            BuildProvider(provider_type="remote::model-context-protocol"),
-        ],
-    }
-    name = "postgres-demo"
-
-    vector_io_providers = [
-        Provider(
-            provider_id="${env.ENABLE_CHROMADB:+chromadb}",
-            provider_type="remote::chromadb",
-            config=ChromaVectorIOConfig.sample_run_config(
-                f"~/.llama/distributions/{name}",
-                url="${env.CHROMADB_URL:=}",
-            ),
-        ),
-    ]
-    default_tool_groups = [
-        ToolGroupInput(
-            toolgroup_id="builtin::websearch",
-            provider_id="tavily-search",
-        ),
-        ToolGroupInput(
-            toolgroup_id="builtin::rag",
-            provider_id="rag-runtime",
-        ),
-    ]
-
-    default_models = [
-        ModelInput(
-            model_id="${env.INFERENCE_MODEL}",
-            provider_id="vllm-inference",
-        )
-    ]
-    embedding_provider = Provider(
-        provider_id="sentence-transformers",
-        provider_type="inline::sentence-transformers",
-        config=SentenceTransformersInferenceConfig.sample_run_config(),
-    )
-    embedding_model = ModelInput(
-        model_id="nomic-embed-text-v1.5",
-        provider_id=embedding_provider.provider_id,
-        model_type=ModelType.embedding,
-        metadata={
-            "embedding_dimension": 768,
-        },
-    )
-    return DistributionTemplate(
-        name=name,
-        distro_type="self_hosted",
-        description="Quick start template for running Llama Stack with several popular providers",
-        container_image=None,
-        template_path=None,
-        providers=providers,
-        available_models_by_provider={},
-        run_configs={
-            "run.yaml": RunConfigSettings(
-                provider_overrides={
-                    "inference": inference_providers + [embedding_provider],
-                    "vector_io": vector_io_providers,
-                },
-                default_models=default_models + [embedding_model],
-                default_tool_groups=default_tool_groups,
-                default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
-                storage_backends={
-                    "kv_default": PostgresKVStoreConfig.sample_run_config(
-                        table_name="llamastack_kvstore",
-                    ),
-                    "sql_default": PostgresSqlStoreConfig.sample_run_config(),
-                },
-            ),
-        },
-        run_config_env_vars={
-            "LLAMA_STACK_PORT": (
-                "8321",
-                "Port for the Llama Stack distribution server",
-            ),
-        },
-    )
--- a/src/llama_stack/distributions/starter-gpu/build.yaml
+++ b/src/llama_stack/distributions/starter-gpu/build.yaml
@ -57,4 +57,5 @@ image_type: venv
 additional_pip_packages:
 - aiosqlite
 - asyncpg
+- psycopg2-binary
 - sqlalchemy[asyncio]
--- a/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
@ -0,0 +1,284 @@
+version: 2
+image_name: starter-gpu  # same name as the distribution directory this file lives in
+apis:  # every API listed here has a provider list of the same name under `providers:`
+- agents
+- batches
+- datasetio
+- eval
+- files
+- inference
+- post_training
+- safety
+- scoring
+- tool_runtime
+- vector_io
+providers:
+  inference:  # NOTE(review): the `:=` and `:+` env forms look like shell-style "default" / "only if set" substitutions — confirm with the config loader
+  - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}  # id is derived from CEREBRAS_API_KEY rather than fixed
+    provider_type: remote::cerebras
+    config:
+      base_url: https://api.cerebras.ai
+      api_key: ${env.CEREBRAS_API_KEY:=}
+  - provider_id: ${env.OLLAMA_URL:+ollama}  # id is derived from OLLAMA_URL
+    provider_type: remote::ollama
+    config:
+      url: ${env.OLLAMA_URL:=http://localhost:11434}
+  - provider_id: ${env.VLLM_URL:+vllm}  # id is derived from VLLM_URL
+    provider_type: remote::vllm
+    config:
+      url: ${env.VLLM_URL:=}
+      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+      api_token: ${env.VLLM_API_TOKEN:=fake}  # fallback literal is `fake`, i.e. a placeholder rather than a real token
+      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+  - provider_id: ${env.TGI_URL:+tgi}  # id is derived from TGI_URL
+    provider_type: remote::tgi
+    config:
+      url: ${env.TGI_URL:=}
+  - provider_id: fireworks  # fixed id, unlike the env-derived ids above
+    provider_type: remote::fireworks
+    config:
+      url: https://api.fireworks.ai/inference/v1
+      api_key: ${env.FIREWORKS_API_KEY:=}
+  - provider_id: together
+    provider_type: remote::together
+    config:
+      url: https://api.together.xyz/v1
+      api_key: ${env.TOGETHER_API_KEY:=}
+  - provider_id: bedrock
+    provider_type: remote::bedrock
+    config:
+      api_key: ${env.AWS_BEDROCK_API_KEY:=}
+      region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
+  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}  # id is derived from NVIDIA_API_KEY
+    provider_type: remote::nvidia
+    config:
+      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
+      api_key: ${env.NVIDIA_API_KEY:=}
+      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}  # NOTE(review): fallback is capitalised `True`, unlike `true` used for tls_verify — confirm it is coerced to a boolean
+  - provider_id: openai
+    provider_type: remote::openai
+    config:
+      api_key: ${env.OPENAI_API_KEY:=}
+      base_url: ${env.OPENAI_BASE_URL:=https://api.openai.com/v1}
+  - provider_id: anthropic
+    provider_type: remote::anthropic
+    config:
+      api_key: ${env.ANTHROPIC_API_KEY:=}
+  - provider_id: gemini
+    provider_type: remote::gemini
+    config:
+      api_key: ${env.GEMINI_API_KEY:=}
+  - provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}  # id is derived from VERTEX_AI_PROJECT
+    provider_type: remote::vertexai
+    config:
+      project: ${env.VERTEX_AI_PROJECT:=}
+      location: ${env.VERTEX_AI_LOCATION:=us-central1}
+  - provider_id: groq
+    provider_type: remote::groq
+    config:
+      url: https://api.groq.com
+      api_key: ${env.GROQ_API_KEY:=}
+  - provider_id: sambanova
+    provider_type: remote::sambanova
+    config:
+      url: https://api.sambanova.ai/v1
+      api_key: ${env.SAMBANOVA_API_KEY:=}
+  - provider_id: ${env.AZURE_API_KEY:+azure}  # id is derived from AZURE_API_KEY
+    provider_type: remote::azure
+    config:
+      api_key: ${env.AZURE_API_KEY:=}
+      api_base: ${env.AZURE_API_BASE:=}
+      api_version: ${env.AZURE_API_VERSION:=}
+      api_type: ${env.AZURE_API_TYPE:=}
+  - provider_id: sentence-transformers  # inline provider; the only inference entry without a config block
+    provider_type: inline::sentence-transformers
+  vector_io:  # every entry keeps its persistence in the storage backend named kv_default, under its own namespace
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      persistence:
+        namespace: vector_io::faiss
+        backend: kv_default
+  - provider_id: sqlite-vec
+    provider_type: inline::sqlite-vec
+    config:
+      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec.db  # env value supplies the directory; the file name is fixed
+      persistence:
+        namespace: vector_io::sqlite_vec
+        backend: kv_default
+  - provider_id: ${env.MILVUS_URL:+milvus}  # NOTE(review): id is derived from MILVUS_URL, yet the config below only uses a local db_path — confirm
+    provider_type: inline::milvus
+    config:
+      db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter-gpu}/milvus.db
+      persistence:
+        namespace: vector_io::milvus
+        backend: kv_default
+  - provider_id: ${env.CHROMADB_URL:+chromadb}  # id is derived from CHROMADB_URL
+    provider_type: remote::chromadb
+    config:
+      url: ${env.CHROMADB_URL:=}
+      persistence:
+        namespace: vector_io::chroma_remote
+        backend: kv_default
+  - provider_id: ${env.PGVECTOR_DB:+pgvector}  # id is derived from PGVECTOR_DB
+    provider_type: remote::pgvector
+    config:
+      host: ${env.PGVECTOR_HOST:=localhost}
+      port: ${env.PGVECTOR_PORT:=5432}
+      db: ${env.PGVECTOR_DB:=}
+      user: ${env.PGVECTOR_USER:=}
+      password: ${env.PGVECTOR_PASSWORD:=}
+      persistence:
+        namespace: vector_io::pgvector
+        backend: kv_default
+  - provider_id: ${env.QDRANT_URL:+qdrant}  # NOTE(review): id is derived from QDRANT_URL, but that URL is not passed anywhere in the config below — confirm
+    provider_type: remote::qdrant
+    config:
+      api_key: ${env.QDRANT_API_KEY:=}
+      persistence:
+        namespace: vector_io::qdrant_remote
+        backend: kv_default
+  - provider_id: ${env.WEAVIATE_CLUSTER_URL:+weaviate}  # id is derived from WEAVIATE_CLUSTER_URL
+    provider_type: remote::weaviate
+    config:
+      weaviate_api_key: null  # explicitly null; no env variable is consulted for the key
+      weaviate_cluster_url: ${env.WEAVIATE_CLUSTER_URL:=localhost:8080}
+      persistence:
+        namespace: vector_io::weaviate
+        backend: kv_default
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter-gpu/files}  # directory on the local filesystem
+      metadata_store:  # references the storage backend named sql_default, table files_metadata
+        table_name: files_metadata
+        backend: sql_default
+  safety:  # two inline providers
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []  # empty list: no categories are excluded
+  - provider_id: code-scanner  # declared without a config block
+    provider_type: inline::code-scanner
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:  # NOTE(review): both stores inline full connection settings, while other providers in this file reference a named backend — confirm this form is what the agents provider expects
+      persistence_store:
+        type: sql_postgres
+        host: ${env.POSTGRES_HOST:=localhost}
+        port: ${env.POSTGRES_PORT:=5432}
+        db: ${env.POSTGRES_DB:=llamastack}
+        user: ${env.POSTGRES_USER:=llamastack}
+        password: ${env.POSTGRES_PASSWORD:=llamastack}
+      responses_store:  # same type, variables and fallbacks as persistence_store above
+        type: sql_postgres
+        host: ${env.POSTGRES_HOST:=localhost}
+        port: ${env.POSTGRES_PORT:=5432}
+        db: ${env.POSTGRES_DB:=llamastack}
+        user: ${env.POSTGRES_USER:=llamastack}
+        password: ${env.POSTGRES_PASSWORD:=llamastack}
+  post_training:
+  - provider_id: huggingface-gpu
+    provider_type: inline::huggingface-gpu
+    config:
+      checkpoint_format: huggingface
+      distributed_backend: null  # explicitly unset
+      device: cpu  # NOTE(review): `cpu` under the huggingface-gpu provider of the starter-gpu distribution — confirm this is intended
+      dpo_output_dir: ~/.llama/distributions/starter-gpu/dpo_output  # literal path; no env override, unlike the other directories in this file
+  eval:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      kvstore:  # references the storage backend named kv_default under namespace `eval`
+        namespace: eval
+        backend: kv_default
+  datasetio:  # both providers use the kv_default backend, separated by namespace
+  - provider_id: huggingface
+    provider_type: remote::huggingface
+    config:
+      kvstore:
+        namespace: datasetio::huggingface
+        backend: kv_default
+  - provider_id: localfs
+    provider_type: inline::localfs
+    config:
+      kvstore:
+        namespace: datasetio::localfs
+        backend: kv_default
+  scoring:  # three inline providers; only braintrust carries a config block
+  - provider_id: basic
+    provider_type: inline::basic
+  - provider_id: llm-as-judge
+    provider_type: inline::llm-as-judge
+  - provider_id: braintrust
+    provider_type: inline::braintrust
+    config:
+      openai_api_key: ${env.OPENAI_API_KEY:=}  # same environment variable the openai inference provider reads for its api_key
+  tool_runtime:
+  - provider_id: brave-search
+    provider_type: remote::brave-search
+    config:
+      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
+      max_results: 3  # same fixed value as tavily-search below
+  - provider_id: tavily-search
+    provider_type: remote::tavily-search
+    config:
+      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
+      max_results: 3
+  - provider_id: rag-runtime  # declared without a config block
+    provider_type: inline::rag-runtime
+  - provider_id: model-context-protocol  # declared without a config block
+    provider_type: remote::model-context-protocol
+  batches:
+  - provider_id: reference
+    provider_type: inline::reference
+    config:
+      kvstore:
+        namespace: batches
+        backend: kv_default  # same KV backend name the vector_io, eval and datasetio providers reference
+storage:
+  backends:  # named connection definitions; providers and stores point at them through `backend:`
+    kv_default:  # key-value backend on Postgres; must carry this name because providers reference `kv_default`
+      type: kv_postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+      table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
+    sql_default:  # SQL backend on Postgres; must carry this name because the files provider references `sql_default`
+      type: sql_postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+  stores:  # each store selects one of the backends declared above
+    metadata:
+      namespace: registry
+      backend: kv_default
+    inference:
+      table_name: inference_store
+      backend: sql_default
+      max_write_queue_size: 10000
+      num_writers: 4
+    conversations:
+      table_name: openai_conversations
+      backend: sql_default
+    prompts:
+      namespace: prompts
+      backend: kv_default
+registered_resources:  # every list is empty: this config pre-registers nothing
+  models: []
+  shields: []
+  vector_dbs: []
+  datasets: []
+  scoring_fns: []
+  benchmarks: []
+  tool_groups: []
+server:
+  port: 8321  # fixed literal; no env substitution is used here
+telemetry:
+  enabled: true
--- a/src/llama_stack/distributions/starter-gpu/run.yaml
+++ b/src/llama_stack/distributions/starter-gpu/run.yaml
@ -46,6 +46,9 @@ providers:
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
+    config:
+      api_key: ${env.AWS_BEDROCK_API_KEY:=}
+      region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:
--- a/Show more
+++ b/Show more