Merge branch 'llamastack:main' into model_unregisteration_error_message
.github/CODEOWNERS (2 changes)

@@ -2,4 +2,4 @@
  # These owners will be the default owners for everything in
  # the repo. Unless a later match takes precedence,
- * @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1
+ * @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1 @franciscojavierarceo
.github/ISSUE_TEMPLATE/config.yml (2 changes)

@@ -2,7 +2,7 @@ blank_issues_enabled: false
  contact_links:
  - name: Have you read the docs?
-   url: https://llamastack.github.io/latest/providers/external/index.html
+   url: https://llamastack.github.io/providers/external/index.html
    about: Much help can be found in the docs
  - name: Start a discussion
    url: https://github.com/llamastack/llama-stack/discussions/new/
.github/TRIAGERS.md (1 change)

@@ -1,2 +1 @@
  # This file documents Triage members in the Llama Stack community
- @franciscojavierarceo
.github/workflows/README.md (1 change)

@@ -12,6 +12,7 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl…
  | Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suites from tests/integration in replay mode |
  | Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
  | Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
+ | Pre-commit Bot | [precommit-trigger.yml](precommit-trigger.yml) | Pre-commit bot for PR |
  | Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |
  | Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
  | Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |
.github/workflows/conformance.yml (93 changes)

@@ -1,6 +1,11 @@
  # API Conformance Tests
  # This workflow ensures that API changes maintain backward compatibility and don't break existing integrations
  # It runs schema validation and OpenAPI diff checks to catch breaking changes early
+ #
+ # The workflow handles both monolithic and split API specifications:
+ # - If split specs exist (stable/experimental/deprecated), they are stitched together for comparison
+ # - If only monolithic spec exists, it is used directly
+ # This allows for clean API organization while maintaining robust conformance testing

  name: API Conformance Tests

@@ -11,11 +16,14 @@ on:
  branches: [ main ]
  pull_request:
  branches: [ main ]
- types: [opened, synchronize, reopened]
+ types: [opened, synchronize, reopened, edited]
  paths:
- - 'docs/static/llama-stack-spec.yaml'
- - 'docs/static/llama-stack-spec.html'
- - '.github/workflows/conformance.yml' # This workflow itself
+ - 'docs/static/llama-stack-spec.yaml' # Legacy monolithic spec
+ - 'docs/static/stable-llama-stack-spec.yaml' # Stable APIs spec
+ - 'docs/static/experimental-llama-stack-spec.yaml' # Experimental APIs spec
+ - 'docs/static/deprecated-llama-stack-spec.yaml' # Deprecated APIs spec
+ - 'docs/static/llama-stack-spec.html' # Legacy HTML spec
+ - '.github/workflows/conformance.yml' # This workflow itself

  concurrency:
  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
@@ -27,14 +35,31 @@ jobs:
  check-schema-compatibility:
  runs-on: ubuntu-latest
  steps:
  # Using specific version 4.1.7 because 5.0.0 fails when trying to run this locally using `act`
  # This ensures consistent behavior between local testing and CI
  - name: Checkout PR Code
  uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
  with:
  fetch-depth: 0

+ # Check if we should skip conformance testing due to breaking changes
+ - name: Check if conformance test should be skipped
+ id: skip-check
+ run: |
+ PR_TITLE="${{ github.event.pull_request.title }}"
+
+ # Skip if title contains "!:" indicating breaking change (like "feat!:")
+ if [[ "$PR_TITLE" == *"!:"* ]]; then
+ echo "skip=true" >> $GITHUB_OUTPUT
+ exit 0
+ fi
+
+ # Get all commits in this PR and check for BREAKING CHANGE footer
+ git log --format="%B" ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }} | \
+ grep -q "BREAKING CHANGE:" && echo "skip=true" >> $GITHUB_OUTPUT || echo "skip=false" >> $GITHUB_OUTPUT
+ shell: bash
  # Checkout the base branch to compare against (usually main)
  # This allows us to diff the current changes against the previous state
  - name: Checkout Base Branch
+ if: steps.skip-check.outputs.skip != 'true'
  uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
  with:
  ref: ${{ github.event.pull_request.base.ref }}

@@ -42,6 +67,7 @@ jobs:
  # Cache oasdiff to avoid checksum failures and speed up builds
  - name: Cache oasdiff
+ if: steps.skip-check.outputs.skip != 'true'
  id: cache-oasdiff
  uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830
  with:
@@ -50,20 +76,69 @@
  # Install oasdiff: https://github.com/oasdiff/oasdiff, a tool for detecting breaking changes in OpenAPI specs.
  - name: Install oasdiff
- if: steps.cache-oasdiff.outputs.cache-hit != 'true'
+ if: steps.skip-check.outputs.skip != 'true' && steps.cache-oasdiff.outputs.cache-hit != 'true'
  run: |
  curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh
  cp /usr/local/bin/oasdiff ~/oasdiff

  # Setup cached oasdiff
  - name: Setup cached oasdiff
- if: steps.cache-oasdiff.outputs.cache-hit == 'true'
+ if: steps.skip-check.outputs.skip != 'true' && steps.cache-oasdiff.outputs.cache-hit == 'true'
  run: |
  sudo cp ~/oasdiff /usr/local/bin/oasdiff
  sudo chmod +x /usr/local/bin/oasdiff

  # Install yq for YAML processing
  - name: Install yq
  run: |
  sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64
  sudo chmod +x /usr/local/bin/yq

+ # Verify API specs exist for conformance testing
+ - name: Check API Specs
+ if: steps.skip-check.outputs.skip != 'true'
+ run: |
+ echo "Checking for API specification files..."
+
+ # Check current branch
+ if [ -f "docs/static/stable-llama-stack-spec.yaml" ]; then
+ echo "✓ Found stable API spec in current branch"
+ CURRENT_SPEC="docs/static/stable-llama-stack-spec.yaml"
+ elif [ -f "docs/static/llama-stack-spec.yaml" ]; then
+ echo "✓ Found monolithic API spec in current branch"
+ CURRENT_SPEC="docs/static/llama-stack-spec.yaml"
+ else
+ echo "❌ No API specs found in current branch"
+ exit 1
+ fi
+
+ # Check base branch
+ if [ -f "base/docs/static/stable-llama-stack-spec.yaml" ]; then
+ echo "✓ Found stable API spec in base branch"
+ BASE_SPEC="base/docs/static/stable-llama-stack-spec.yaml"
+ elif [ -f "base/docs/static/llama-stack-spec.yaml" ]; then
+ echo "✓ Found monolithic API spec in base branch"
+ BASE_SPEC="base/docs/static/llama-stack-spec.yaml"
+ else
+ echo "❌ No API specs found in base branch"
+ exit 1
+ fi
+
+ # Export for next step
+ echo "BASE_SPEC=${BASE_SPEC}" >> $GITHUB_ENV
+ echo "CURRENT_SPEC=${CURRENT_SPEC}" >> $GITHUB_ENV
+
+ echo "Will compare: ${BASE_SPEC} -> ${CURRENT_SPEC}"

  # Run oasdiff to detect breaking changes in the API specification
  # This step will fail if incompatible changes are detected, preventing breaking changes from being merged
  - name: Run OpenAPI Breaking Change Diff
+ if: steps.skip-check.outputs.skip != 'true'
  run: |
- oasdiff breaking --fail-on ERR base/docs/static/llama-stack-spec.yaml docs/static/llama-stack-spec.yaml --match-path '^/v1/'
+ oasdiff breaking --fail-on ERR $BASE_SPEC $CURRENT_SPEC --match-path '^/v1/'

+ # Report when test is skipped
+ - name: Report skip reason
+ if: steps.skip-check.outputs.skip == 'true'
+ run: |
+ echo "Conformance test skipped due to breaking change indicator"
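For reviewers who want to try the same breaking-change check outside CI, a rough local sketch follows. It assumes oasdiff is installed via the same install script the workflow uses and that the base branch is `main` checked out into a `base/` worktree; those details are assumptions for illustration, not part of the workflow.

```bash
# Hypothetical local run of the conformance check (sketch only).
curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh

# Materialize the base branch under base/, mirroring the workflow's second checkout.
git worktree add base main

# Prefer the split stable spec when present, otherwise fall back to the monolithic spec.
BASE_SPEC=base/docs/static/stable-llama-stack-spec.yaml
[ -f "$BASE_SPEC" ] || BASE_SPEC=base/docs/static/llama-stack-spec.yaml
CURRENT_SPEC=docs/static/stable-llama-stack-spec.yaml
[ -f "$CURRENT_SPEC" ] || CURRENT_SPEC=docs/static/llama-stack-spec.yaml

# Fail on breaking changes to /v1/ paths, exactly as the CI step does.
oasdiff breaking --fail-on ERR "$BASE_SPEC" "$CURRENT_SPEC" --match-path '^/v1/'
```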
.github/workflows/integration-auth-tests.yml (2 changes)

@@ -84,6 +84,8 @@ jobs:
  yq eval '.server.auth.provider_config.jwks.token = "${{ env.TOKEN }}"' -i $run_dir/run.yaml
  cat $run_dir/run.yaml

+ # avoid line breaks in the server log, especially because we grep it below.
+ export COLUMNS=1984
  nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &

  - name: Wait for Llama Stack server to be ready
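The "Wait for Llama Stack server to be ready" step itself falls outside this hunk; a minimal sketch of one way to poll the server is shown below. The `/v1/health` path and port 8321 are assumed defaults taken from other files in this PR, not from the workflow step.

```bash
# Hypothetical readiness poll (sketch only); endpoint and port are assumptions.
for _ in $(seq 1 60); do
  if curl -sf http://localhost:8321/v1/health > /dev/null; then
    echo "Llama Stack server is ready"
    break
  fi
  sleep 2
done
```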
.github/workflows/integration-tests.yml (25 changes)

@@ -42,18 +42,27 @@ jobs:
  run-replay-mode-tests:
  runs-on: ubuntu-latest
- name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.setup, matrix.python-version, matrix.client-version, matrix.suite) }}
+ name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.config.setup, matrix.python-version, matrix.client-version, matrix.config.suite) }}

  strategy:
  fail-fast: false
  matrix:
  client-type: [library, server]
- # Use vllm on weekly schedule, otherwise use test-setup input (defaults to ollama)
- setup: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-setup || 'ollama')) }}
  # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
  python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
  client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
- suite: [base, vision]
+ # Define (setup, suite) pairs - they are always matched and cannot be independent
+ # Weekly schedule (Sun 1 AM): vllm+base
+ # Input test-setup=ollama-vision: ollama-vision+vision
+ # Default (including test-setup=ollama): both ollama+base and ollama-vision+vision
+ config: >-
+   ${{
+     github.event.schedule == '1 0 * * 0'
+     && fromJSON('[{"setup": "vllm", "suite": "base"}]')
+     || github.event.inputs.test-setup == 'ollama-vision'
+     && fromJSON('[{"setup": "ollama-vision", "suite": "vision"}]')
+     || fromJSON('[{"setup": "ollama", "suite": "base"}, {"setup": "ollama-vision", "suite": "vision"}]')
+   }}

  steps:
  - name: Checkout repository

@@ -64,14 +73,14 @@ jobs:
  with:
  python-version: ${{ matrix.python-version }}
  client-version: ${{ matrix.client-version }}
- setup: ${{ matrix.setup }}
- suite: ${{ matrix.suite }}
+ setup: ${{ matrix.config.setup }}
+ suite: ${{ matrix.config.suite }}
  inference-mode: 'replay'

  - name: Run tests
  uses: ./.github/actions/run-and-record-tests
  with:
  stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
- setup: ${{ matrix.setup }}
+ setup: ${{ matrix.config.setup }}
  inference-mode: 'replay'
- suite: ${{ matrix.suite }}
+ suite: ${{ matrix.config.suite }}
.github/workflows/precommit-trigger.yml (new file, 227 lines)

@@ -0,0 +1,227 @@
name: Pre-commit Bot

run-name: Pre-commit bot for PR #${{ github.event.issue.number }}

on:
issue_comment:
types: [created]

jobs:
pre-commit:
# Only run on pull request comments
if: github.event.issue.pull_request && contains(github.event.comment.body, '@github-actions run precommit')
runs-on: ubuntu-latest
permissions:
contents: write
pull-requests: write

steps:
- name: Check comment author and get PR details
id: check_author
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
// Get PR details
const pr = await github.rest.pulls.get({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: context.issue.number
});

// Check if commenter has write access or is the PR author
const commenter = context.payload.comment.user.login;
const prAuthor = pr.data.user.login;

let hasPermission = false;

// Check if commenter is PR author
if (commenter === prAuthor) {
hasPermission = true;
console.log(`Comment author ${commenter} is the PR author`);
} else {
// Check if commenter has write/admin access
try {
const permission = await github.rest.repos.getCollaboratorPermissionLevel({
owner: context.repo.owner,
repo: context.repo.repo,
username: commenter
});

const level = permission.data.permission;
hasPermission = ['write', 'admin', 'maintain'].includes(level);
console.log(`Comment author ${commenter} has permission: ${level}`);
} catch (error) {
console.log(`Could not check permissions for ${commenter}: ${error.message}`);
}
}

if (!hasPermission) {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: `❌ @${commenter} You don't have permission to trigger pre-commit. Only PR authors or repository collaborators can run this command.`
});
core.setFailed(`User ${commenter} does not have permission`);
return;
}

// Save PR info for later steps
core.setOutput('pr_number', context.issue.number);
core.setOutput('pr_head_ref', pr.data.head.ref);
core.setOutput('pr_head_sha', pr.data.head.sha);
core.setOutput('pr_head_repo', pr.data.head.repo.full_name);
core.setOutput('pr_base_ref', pr.data.base.ref);
core.setOutput('is_fork', pr.data.head.repo.full_name !== context.payload.repository.full_name);
core.setOutput('authorized', 'true');
- name: React to comment
if: steps.check_author.outputs.authorized == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.reactions.createForIssueComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: context.payload.comment.id,
content: 'rocket'
});

- name: Comment starting
if: steps.check_author.outputs.authorized == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `⏳ Running pre-commit hooks on PR #${{ steps.check_author.outputs.pr_number }}...`
});

- name: Checkout PR branch (same-repo)
if: steps.check_author.outputs.authorized == 'true' && steps.check_author.outputs.is_fork == 'false'
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
ref: ${{ steps.check_author.outputs.pr_head_ref }}
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}

- name: Checkout PR branch (fork)
if: steps.check_author.outputs.authorized == 'true' && steps.check_author.outputs.is_fork == 'true'
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: ${{ steps.check_author.outputs.pr_head_repo }}
ref: ${{ steps.check_author.outputs.pr_head_ref }}
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}

- name: Verify checkout
if: steps.check_author.outputs.authorized == 'true'
run: |
echo "Current SHA: $(git rev-parse HEAD)"
echo "Expected SHA: ${{ steps.check_author.outputs.pr_head_sha }}"
if [[ "$(git rev-parse HEAD)" != "${{ steps.check_author.outputs.pr_head_sha }}" ]]; then
echo "::error::Checked out SHA does not match expected SHA"
exit 1
fi

- name: Set up Python
if: steps.check_author.outputs.authorized == 'true'
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: '3.12'
cache: pip
cache-dependency-path: |
**/requirements*.txt
.pre-commit-config.yaml

- name: Set up Node.js
if: steps.check_author.outputs.authorized == 'true'
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
with:
node-version: '20'
cache: 'npm'
cache-dependency-path: 'llama_stack/ui/'

- name: Install npm dependencies
if: steps.check_author.outputs.authorized == 'true'
run: npm ci
working-directory: llama_stack/ui

- name: Run pre-commit
if: steps.check_author.outputs.authorized == 'true'
id: precommit
uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
continue-on-error: true
env:
SKIP: no-commit-to-branch
RUFF_OUTPUT_FORMAT: github

- name: Check for changes
if: steps.check_author.outputs.authorized == 'true'
id: changes
run: |
if ! git diff --exit-code || [ -n "$(git ls-files --others --exclude-standard)" ]; then
echo "has_changes=true" >> $GITHUB_OUTPUT
echo "Changes detected after pre-commit"
else
echo "has_changes=false" >> $GITHUB_OUTPUT
echo "No changes after pre-commit"
fi
- name: Commit and push changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'true'
run: |
git config --local user.email "github-actions[bot]@users.noreply.github.com"
git config --local user.name "github-actions[bot]"

git add -A
git commit -m "style: apply pre-commit fixes

🤖 Applied by @github-actions bot via pre-commit workflow"

# Push changes
git push origin HEAD:${{ steps.check_author.outputs.pr_head_ref }}

- name: Comment success with changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `✅ Pre-commit hooks completed successfully!\n\n🔧 Changes have been committed and pushed to the PR branch.`
});

- name: Comment success without changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'false' && steps.precommit.outcome == 'success'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `✅ Pre-commit hooks passed!\n\n✨ No changes needed - your code is already formatted correctly.`
});

- name: Comment failure
if: failure()
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `❌ Pre-commit workflow failed!\n\nPlease check the [workflow logs](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}) for details.`
});
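For context on how this new workflow is invoked: a PR author or collaborator comments `@github-actions run precommit` on the pull request, and the job reacts, runs the hooks, and pushes any fixes back to the PR branch. A hedged sketch of posting that comment with the GitHub CLI (the PR number is a placeholder):

```bash
# Hypothetical trigger via the GitHub CLI; 1234 is a placeholder PR number.
gh pr comment 1234 --repo llamastack/llama-stack --body "@github-actions run precommit"
```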
.github/workflows/providers-build.yml (4 changes)

@@ -112,7 +112,7 @@ jobs:
  fi
  entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
  echo "Entrypoint: $entrypoint"
- if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
+ if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then
  echo "Entrypoint is not correct"
  exit 1
  fi

@@ -150,7 +150,7 @@ jobs:
  fi
  entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
  echo "Entrypoint: $entrypoint"
- if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
+ if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then
  echo "Entrypoint is not correct"
  exit 1
  fi
.github/workflows/python-build-test.yml (2 changes)

@@ -24,7 +24,7 @@ jobs:
  uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

  - name: Install uv
- uses: astral-sh/setup-uv@b75a909f75acd358c2196fb9a5f1299a9a8868a4 # v6.7.0
+ uses: astral-sh/setup-uv@d0cc045d04ccac9d8b7881df0226f9e82c39688e # v6.8.0
  with:
  python-version: ${{ matrix.python-version }}
  activate-environment: true
CONTRIBUTING.md

@@ -61,7 +61,7 @@ Before pushing your changes, make sure that the pre-commit hooks have passed suc…
  We actively welcome your pull requests. However, please read the following. This is heavily inspired by [Ghostty](https://github.com/ghostty-org/ghostty/blob/main/CONTRIBUTING.md).

- If in doubt, please open a [discussion](https://github.com/meta-llama/llama-stack/discussions); we can always convert that to an issue later.
+ If in doubt, please open a [discussion](https://github.com/llamastack/llama-stack/discussions); we can always convert that to an issue later.

  ### Issues
  We use GitHub issues to track public bugs. Please ensure your description is

@@ -165,8 +165,8 @@ Building a stack image will use the production version of the `llama-stack` and…
  Example:
  ```bash
  cd work/
- git clone https://github.com/meta-llama/llama-stack.git
- git clone https://github.com/meta-llama/llama-stack-client-python.git
+ git clone https://github.com/llamastack/llama-stack.git
+ git clone https://github.com/llamastack/llama-stack-client-python.git
  cd llama-stack
  LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...>
  ```
README.md

@@ -7,7 +7,7 @@
  [](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
  [](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)

- [**Quick Start**](https://llamastack.github.io/latest/getting_started/index.html) | [**Documentation**](https://llamastack.github.io/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
+ [**Quick Start**](https://llamastack.github.io/docs/getting_started/quickstart) | [**Documentation**](https://llamastack.github.io/docs) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)

  ### ✨🎉 Llama 4 Support 🎉✨

@@ -120,7 +120,7 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on…
  ### API Providers
  Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.
- Please checkout for [full list](https://llamastack.github.io/latest/providers/index.html)
+ Please checkout for [full list](https://llamastack.github.io/docs/providers)

  | API Provider Builder | Environments | Agents | Inference | VectorIO | Safety | Telemetry | Post Training | Eval | DatasetIO |
  |:--------------------:|:------------:|:------:|:---------:|:--------:|:------:|:---------:|:-------------:|:----:|:--------:|

@@ -151,7 +151,7 @@ Please checkout for [full list](https://llamastack.github.io/latest/providers/in…
  | NVIDIA NEMO | Hosted | | ✅ | ✅ | | | ✅ | ✅ | ✅ |
  | NVIDIA | Hosted | | | | | | ✅ | ✅ | ✅ |

- > **Note**: Additional providers are available through external packages. See [External Providers](https://llamastack.github.io/latest/providers/external/index.html) documentation.
+ > **Note**: Additional providers are available through external packages. See [External Providers](https://llamastack.github.io/docs/providers/external) documentation.

  ### Distributions
docs/docs/api-overview.md (new file, 49 lines)

@@ -0,0 +1,49 @@
# API Reference Overview

The Llama Stack provides a comprehensive set of APIs organized by stability level to help you choose the right endpoints for your use case.

## 🟢 Stable APIs

**Production-ready APIs with backward compatibility guarantees.**

These APIs are fully tested, documented, and stable. They follow semantic versioning principles and maintain backward compatibility within major versions. Recommended for production applications.

[**Browse Stable APIs →**](./api/llama-stack-specification)

**Key Features:**
- ✅ Backward compatibility guaranteed
- ✅ Comprehensive testing and validation
- ✅ Production-ready reliability
- ✅ Long-term support

---

## 🟡 Experimental APIs

**Preview APIs that may change before becoming stable.**

These APIs include v1alpha and v1beta endpoints that are feature-complete but may undergo changes based on feedback. Great for exploring new capabilities and providing feedback.

[**Browse Experimental APIs →**](./api-experimental/llama-stack-specification-experimental-apis)

**Key Features:**
- 🧪 Latest features and capabilities
- 🧪 May change based on user feedback
- 🧪 Active development and iteration
- 🧪 Opportunity to influence final design

---

## 🔴 Deprecated APIs

**Legacy APIs for migration reference.**

These APIs are deprecated and will be removed in future versions. They are provided for migration purposes and to help transition to newer, stable alternatives.

[**Browse Deprecated APIs →**](./api-deprecated/llama-stack-specification-deprecated-apis)

**Key Features:**
- ⚠️ Will be removed in future versions
- ⚠️ Migration guidance provided
- ⚠️ Use for compatibility during transition
- ⚠️ Not recommended for new projects
@@ -187,21 +187,21 @@ Configure telemetry behavior using environment variables:
  - **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string)
  - **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `console,sqlite`)

- ## Visualization with Jaeger
+ ### Quick Setup: Complete Telemetry Stack

  The `otel_trace` sink works with any service compatible with the OpenTelemetry collector. Traces and metrics use separate endpoints but can share the same collector.

- ### Starting Jaeger
-
- Start a Jaeger instance with OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686:
+ Use the automated setup script to launch the complete telemetry stack (Jaeger, OpenTelemetry Collector, Prometheus, and Grafana):

  ```bash
- docker run --pull always --rm --name jaeger \
-   -p 16686:16686 -p 4318:4318 \
-   jaegertracing/jaeger:2.1.0
+ ./scripts/telemetry/setup_telemetry.sh
  ```

- Once running, you can visualize traces by navigating to [http://localhost:16686/](http://localhost:16686/).
+ This sets up:
+ - **Jaeger UI**: http://localhost:16686 (traces visualization)
+ - **Prometheus**: http://localhost:9090 (metrics)
+ - **Grafana**: http://localhost:3000 (dashboards with auto-configured data sources)
+ - **OTEL Collector**: http://localhost:4318 (OTLP endpoint)
+
+ Once running, you can visualize traces by navigating to [Grafana](http://localhost:3000/) and log in with username `admin` and password `admin`.

  ## Querying Metrics
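Pointing a running stack at the collector is not shown in this hunk; a minimal sketch using the environment variables documented above follows. The sink list, endpoint, and run config path are assumptions for illustration.

```bash
# Hypothetical wiring of telemetry to the local collector started by the setup script.
export OTEL_SERVICE_NAME=llama-stack
export TELEMETRY_SINKS=console,sqlite,otel_trace
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318   # assumed OTLP endpoint of the collector
llama stack run ~/.llama/run-byoa.yaml                     # placeholder run config
```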
@@ -181,7 +181,7 @@ Once defined, simply pass the tool to the agent config. `Agent` will take care o…
  agent = Agent(client, ..., tools=[my_tool])
  ```

- Refer to [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/blob/main/examples/agents/e2e_loop_with_client_tools.py) for an example of how to use client provided tools.
+ Refer to [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/) for an example of how to use client provided tools.

  ## Tool Invocation
@@ -152,7 +152,6 @@ __all__ = ["WeatherAPI", "available_providers"]
  from typing import Protocol

  from llama_stack.providers.datatypes import (
- AdapterSpec,
  Api,
  ProviderSpec,
  RemoteProviderSpec,

@@ -166,12 +165,10 @@ def available_providers() -> list[ProviderSpec]:
  api=Api.weather,
  provider_type="remote::kaze",
  config_class="llama_stack_provider_kaze.KazeProviderConfig",
- adapter=AdapterSpec(
-   adapter_type="kaze",
-   module="llama_stack_provider_kaze",
-   pip_packages=["llama_stack_provider_kaze"],
-   config_class="llama_stack_provider_kaze.KazeProviderConfig",
- ),
+ adapter_type="kaze",
+ module="llama_stack_provider_kaze",
+ pip_packages=["llama_stack_provider_kaze"],
+ config_class="llama_stack_provider_kaze.KazeProviderConfig",
  ),
  ]

@@ -325,11 +322,10 @@ class WeatherKazeAdapter(WeatherProvider):
  ```yaml
  # ~/.llama/providers.d/remote/weather/kaze.yaml
- adapter:
-   adapter_type: kaze
-   pip_packages: ["llama_stack_provider_kaze"]
-   config_class: llama_stack_provider_kaze.config.KazeProviderConfig
-   module: llama_stack_provider_kaze
+ adapter_type: kaze
+ pip_packages: ["llama_stack_provider_kaze"]
+ config_class: llama_stack_provider_kaze.config.KazeProviderConfig
+ module: llama_stack_provider_kaze
  optional_api_dependencies: []
  ```

@@ -361,7 +357,7 @@ server:
  8. Run the server:

  ```bash
- python -m llama_stack.core.server.server --yaml-config ~/.llama/run-byoa.yaml
+ llama stack run ~/.llama/run-byoa.yaml
  ```

  9. Test the API:
@@ -170,7 +170,7 @@ spec:
  - name: llama-stack
  image: localhost/llama-stack-run-k8s:latest
  imagePullPolicy: IfNotPresent
- command: ["python", "-m", "llama_stack.core.server.server", "--config", "/app/config.yaml"]
+ command: ["llama", "stack", "run", "/app/config.yaml"]
  ports:
  - containerPort: 5000
  volumeMounts:
@@ -509,16 +509,16 @@ server:
  provider_config:
  type: "github_token"
  github_api_base_url: "https://api.github.com"
- access_policy:
- - permit:
-     principal: user-1
-     actions: [create, read, delete]
-     description: user-1 has full access to all resources
- - permit:
-     principal: user-2
-     actions: [read]
-     resource: model::model-1
-     description: user-2 has read access to model-1 only
+ access_policy:
+ - permit:
+     principal: user-1
+     actions: [create, read, delete]
+     description: user-1 has full access to all resources
+ - permit:
+     principal: user-2
+     actions: [read]
+     resource: model::model-1
+     description: user-2 has read access to model-1 only
  ```

  Similarly, the following restricts access to particular kubernetes
@@ -52,7 +52,7 @@ spec:
  value: "${SAFETY_MODEL}"
  - name: TAVILY_SEARCH_API_KEY
  value: "${TAVILY_SEARCH_API_KEY}"
- command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8321"]
+ command: ["llama", "stack", "run", "/etc/config/stack_run_config.yaml", "--port", "8321"]
  ports:
  - containerPort: 8321
  volumeMounts:
@@ -131,4 +131,4 @@ graph TD
  3. **Configure your providers** with API keys or local models
  4. **Start building** with Llama Stack!

- For help choosing or troubleshooting, check our [Getting Started Guide](/docs/getting_started/quickstart) or [Community Support](https://github.com/llama-stack/llama-stack/discussions).
+ For help choosing or troubleshooting, check our [Getting Started Guide](/docs/getting_started/quickstart) or [Community Support](https://github.com/llamastack/llama-stack/discussions).
@@ -102,7 +102,7 @@ You can start a chroma-db easily using docker.
  # This is where the indices are persisted
  mkdir -p $HOME/chromadb

- podman run --rm -it \
+ docker run --rm -it \
  --network host \
  --name chromadb \
  -v $HOME/chromadb:/chroma/chroma \

@@ -127,7 +127,7 @@ docker run -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v $HOME/.llama:/root/.llama \
  # NOTE: mount the llama-stack / llama-model directories if testing local changes else not needed
- -v /home/hjshah/git/llama-stack:/app/llama-stack-source -v /home/hjshah/git/llama-models:/app/llama-models-source \
+ -v $HOME/git/llama-stack:/app/llama-stack-source -v $HOME/git/llama-models:/app/llama-models-source \
  # localhost/distribution-dell:dev if building / testing locally
  llamastack/distribution-dell\
  --port $LLAMA_STACK_PORT \
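The `docker run` command in the first hunk above is cut off by the diff view; for orientation only, a typical Chroma invocation might look like the sketch below. The image name and the continuation of the flags are assumptions, not text from the diff.

```bash
# Hypothetical completion of the truncated command (image name is an assumption).
docker run --rm -it \
  --network host \
  --name chromadb \
  -v $HOME/chromadb:/chroma/chroma \
  chromadb/chroma
```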
@@ -14,13 +14,13 @@ Llama Stack is the open-source framework for building generative AI applications
  :::tip Llama 4 is here!

- Check out [Getting Started with Llama 4](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started_llama4.ipynb)
+ Check out [Getting Started with Llama 4](https://colab.research.google.com/github/llamastack/llama-stack/blob/main/docs/getting_started_llama4.ipynb)

  :::

  :::tip News

- Llama Stack is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases) for more details.
+ Llama Stack is now available! See the [release notes](https://github.com/llamastack/llama-stack/releases) for more details.

  :::

@@ -45,7 +45,7 @@ Llama Stack consists of a server (with multiple pluggable API providers) and Cli…
  ## Quick Links

- - Ready to build? Check out the [Getting Started Guide](https://llama-stack.github.io/getting_started/quickstart) to get started.
+ - Ready to build? Check out the [Getting Started Guide](/docs/getting_started/quickstart) to get started.
  - Want to contribute? See the [Contributing Guide](https://github.com/llamastack/llama-stack/blob/main/CONTRIBUTING.md).
  - Explore [Example Applications](https://github.com/llamastack/llama-stack-apps) built with Llama Stack.

@@ -59,13 +59,13 @@ Llama Stack provides adapters for popular providers across all API categories:
  - **Training & Evaluation**: HuggingFace, TorchTune, NVIDIA NEMO

  :::info Provider Details
- For complete provider compatibility and setup instructions, see our [Providers Documentation](https://llamastack.github.io/providers/).
+ For complete provider compatibility and setup instructions, see our [Providers Documentation](https://llamastack.github.io/docs/providers/).
  :::

  ## Get Started Today

  <div style={{display: 'flex', gap: '1rem', flexWrap: 'wrap', margin: '2rem 0'}}>
- <a href="https://llama-stack.github.io/getting_started/quickstart"
+ <a href="/docs/getting_started/quickstart"
     style={{
       background: 'var(--ifm-color-primary)',
       color: 'white',
@@ -1,12 +1,7 @@
  ---
- description: "Agents API for creating and interacting with agentic systems.
+ description: "Agents

- Main functionalities provided by this API:
- - Create agents with specific instructions and ability to use tools.
- - Interactions with agents are grouped into sessions (\"threads\"), and each interaction is called a \"turn\".
- - Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
- - Agents can be provided with various shields (see the Safety API for more details).
- - Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details."
+ APIs for creating and interacting with agentic systems."
  sidebar_label: Agents
  title: Agents
  ---

@@ -15,13 +10,8 @@ title: Agents
  ## Overview

- Agents API for creating and interacting with agentic systems.
+ Agents

- Main functionalities provided by this API:
- - Create agents with specific instructions and ability to use tools.
- - Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
- - Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
- - Agents can be provided with various shields (see the Safety API for more details).
- - Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.
+ APIs for creating and interacting with agentic systems.

  This section contains documentation for all available providers for the **agents** API.
@@ -11,38 +11,6 @@ an example entry in your build.yaml should look like:
  module: ramalama_stack
  ```

- Additionally you can configure the `external_providers_dir` in your Llama Stack configuration. This method is in the process of being deprecated in favor of the `module` method. If using this method, the external provider directory should contain your external provider specifications:
-
- ```yaml
- external_providers_dir: ~/.llama/providers.d/
- ```
-
- ## Directory Structure
-
- The external providers directory should follow this structure:
-
- ```
- providers.d/
-   remote/
-     inference/
-       custom_ollama.yaml
-       vllm.yaml
-     vector_io/
-       qdrant.yaml
-     safety/
-       llama-guard.yaml
-   inline/
-     inference/
-       custom_ollama.yaml
-       vllm.yaml
-     vector_io/
-       qdrant.yaml
-     safety/
-       llama-guard.yaml
- ```
-
- Each YAML file in these directories defines a provider specification for that particular API.

  ## Provider Types

  Llama Stack supports two types of external providers:
@@ -50,30 +18,37 @@ Llama Stack supports two types of external providers:
  1. **Remote Providers**: Providers that communicate with external services (e.g., cloud APIs)
  2. **Inline Providers**: Providers that run locally within the Llama Stack process

+ ### Provider Specification (Common between inline and remote providers)
+
+ - `provider_type`: The type of the provider to be installed (remote or inline). eg. `remote::ollama`
+ - `api`: The API for this provider, eg. `inference`
+ - `config_class`: The full path to the configuration class
+ - `module`: The Python module containing the provider implementation
+ - `optional_api_dependencies`: List of optional Llama Stack APIs that this provider can use
+ - `api_dependencies`: List of Llama Stack APIs that this provider depends on
+ - `provider_data_validator`: Optional validator for provider data.
+ - `pip_packages`: List of Python packages required by the provider

  ### Remote Provider Specification

  Remote providers are used when you need to communicate with external services. Here's an example for a custom Ollama provider:

  ```yaml
- adapter:
-   adapter_type: custom_ollama
-   pip_packages:
-   - ollama
-   - aiohttp
-   config_class: llama_stack_ollama_provider.config.OllamaImplConfig
-   module: llama_stack_ollama_provider
+ adapter_type: custom_ollama
+ provider_type: "remote::ollama"
+ pip_packages:
+ - ollama
+ - aiohttp
+ config_class: llama_stack_ollama_provider.config.OllamaImplConfig
+ module: llama_stack_ollama_provider
  api_dependencies: []
  optional_api_dependencies: []
  ```

- #### Adapter Configuration
+ #### Remote Provider Configuration

- The `adapter` section defines how to load and configure the provider:
-
- - `adapter_type`: A unique identifier for this adapter
- - `pip_packages`: List of Python packages required by the provider
- - `config_class`: The full path to the configuration class
- - `module`: The Python module containing the provider implementation
+ - `adapter_type`: A unique identifier for this adapter, eg. `ollama`

  ### Inline Provider Specification
@@ -81,6 +56,7 @@ Inline providers run locally within the Llama Stack process. Here's an example f…
  ```yaml
  module: llama_stack_vector_provider
+ provider_type: inline::llama_stack_vector_provider
  config_class: llama_stack_vector_provider.config.VectorStoreConfig
  pip_packages:
  - faiss-cpu

@@ -95,12 +71,6 @@ container_image: custom-vector-store:latest # optional
  #### Inline Provider Fields

- - `module`: The Python module containing the provider implementation
- - `config_class`: The full path to the configuration class
- - `pip_packages`: List of Python packages required by the provider
- - `api_dependencies`: List of Llama Stack APIs that this provider depends on
- - `optional_api_dependencies`: List of optional Llama Stack APIs that this provider can use
- - `provider_data_validator`: Optional validator for provider data
  - `container_image`: Optional container image to use instead of pip packages

  ## Required Fields
@@ -113,20 +83,17 @@ All providers must contain a `get_provider_spec` function in their `provider` mo…
  from llama_stack.providers.datatypes import (
  ProviderSpec,
  Api,
- AdapterSpec,
- remote_provider_spec,
+ RemoteProviderSpec,
  )


  def get_provider_spec() -> ProviderSpec:
- return remote_provider_spec(
+ return RemoteProviderSpec(
  api=Api.inference,
- adapter=AdapterSpec(
-   adapter_type="ramalama",
-   pip_packages=["ramalama>=0.8.5", "pymilvus"],
-   config_class="ramalama_stack.config.RamalamaImplConfig",
-   module="ramalama_stack",
- ),
+ adapter_type="ramalama",
+ pip_packages=["ramalama>=0.8.5", "pymilvus"],
+ config_class="ramalama_stack.config.RamalamaImplConfig",
+ module="ramalama_stack",
  )
  ```

@@ -197,18 +164,16 @@ information. Execute the test for the Provider type you are developing.
  If your external provider isn't being loaded:

  1. Check that `module` points to a published pip package with a top level `provider` module including `get_provider_spec`.
  1. Check that the `external_providers_dir` path is correct and accessible.
  2. Verify that the YAML files are properly formatted.
  3. Ensure all required Python packages are installed.
  4. Check the Llama Stack server logs for any error messages - turn on debug logging to get more
  information using `LLAMA_STACK_LOGGING=all=debug`.
  5. Verify that the provider package is installed in your Python environment if using `external_providers_dir`.
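Item 4 of the checklist above can be combined with a normal server start; a small sketch (the run config path is a placeholder):

```bash
# Hypothetical: start the stack with verbose logging while debugging provider loading.
LLAMA_STACK_LOGGING=all=debug llama stack run ~/.llama/run-byoa.yaml
```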
  ## Examples

- ### Example using `external_providers_dir`: Custom Ollama Provider
+ ### How to create an external provider module

- Here's a complete example of creating and using a custom Ollama provider:
+ If you are creating a new external provider called `llama-stack-provider-ollama` here is how you would set up the package properly:

  1. First, create the provider package:

@@ -230,33 +195,28 @@ requires-python = ">=3.12"
  dependencies = ["llama-stack", "pydantic", "ollama", "aiohttp"]
  ```

- 3. Create the provider specification:
-
- ```yaml
- # ~/.llama/providers.d/remote/inference/custom_ollama.yaml
- adapter:
-   adapter_type: custom_ollama
-   pip_packages: ["ollama", "aiohttp"]
-   config_class: llama_stack_provider_ollama.config.OllamaImplConfig
-   module: llama_stack_provider_ollama
- api_dependencies: []
- optional_api_dependencies: []
- ```
-
- 4. Install the provider:
+ 3. Install the provider:

  ```bash
  uv pip install -e .
  ```

- 5. Configure Llama Stack to use external providers:
+ 4. Edit `provider.py`

- ```yaml
- external_providers_dir: ~/.llama/providers.d/
- ```
+ provider.py must be updated to contain `get_provider_spec`. This is used by llama stack to install the provider.
+
+ ```python
+ def get_provider_spec() -> ProviderSpec:
+     return RemoteProviderSpec(
+         api=Api.inference,
+         adapter_type="llama-stack-provider-ollama",
+         pip_packages=["ollama", "aiohttp"],
+         config_class="llama_stack_provider_ollama.config.OllamaImplConfig",
+         module="llama_stack_provider_ollama",
+     )
+ ```

  The provider will now be available in Llama Stack with the type `remote::custom_ollama`.

+ 5. Implement the provider as outlined above with `get_provider_impl` or `get_adapter_impl`, etc.

  ### Example using `module`: ramalama-stack

@@ -275,7 +235,6 @@ distribution_spec:
  module: ramalama_stack==0.3.0a0
  image_type: venv
  image_name: null
- external_providers_dir: null
  additional_pip_packages:
  - aiosqlite
  - sqlalchemy[asyncio]
@@ -14,6 +14,7 @@ Anthropic inference provider for accessing Claude models and Anthropic's AI serv…
  | Field | Type | Required | Default | Description |
  |-------|------|----------|---------|-------------|
+ | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
  | `api_key` | `str \| None` | No | | API key for Anthropic models |

  ## Sample Configuration

@@ -21,6 +21,7 @@ https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
  | Field | Type | Required | Default | Description |
  |-------|------|----------|---------|-------------|
+ | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
  | `api_key` | `<class 'pydantic.types.SecretStr'>` | No | | Azure API key for Azure |
  | `api_base` | `<class 'pydantic.networks.HttpUrl'>` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) |
  | `api_version` | `str \| None` | No | | Azure API version for Azure (e.g., 2024-12-01-preview) |

@@ -14,6 +14,7 @@ AWS Bedrock inference provider for accessing various AI models through AWS's man…
  | Field | Type | Required | Default | Description |
  |-------|------|----------|---------|-------------|
+ | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
  | `aws_access_key_id` | `str \| None` | No | | The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID |
  | `aws_secret_access_key` | `str \| None` | No | | The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY |
  | `aws_session_token` | `str \| None` | No | | The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN |

@@ -14,6 +14,7 @@ Cerebras inference provider for running models on Cerebras Cloud platform.
  | Field | Type | Required | Default | Description |
  |-------|------|----------|---------|-------------|
+ | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
  | `base_url` | `<class 'str'>` | No | https://api.cerebras.ai | Base URL for the Cerebras API |
  | `api_key` | `<class 'pydantic.types.SecretStr'>` | No | | Cerebras API Key |

@@ -14,7 +14,8 @@ Databricks inference provider for running models on Databricks' unified analytic…
  | Field | Type | Required | Default | Description |
  |-------|------|----------|---------|-------------|
- | `url` | `<class 'str'>` | No | | The URL for the Databricks model serving endpoint |
+ | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
+ | `url` | `str \| None` | No | | The URL for the Databricks model serving endpoint |
  | `api_token` | `<class 'pydantic.types.SecretStr'>` | No | | The Databricks API token |

  ## Sample Configuration

@@ -14,6 +14,7 @@ Google Gemini inference provider for accessing Gemini models and Google's AI ser…
  | Field | Type | Required | Default | Description |
  |-------|------|----------|---------|-------------|
+ | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
  | `api_key` | `str \| None` | No | | API key for Gemini models |

  ## Sample Configuration

@@ -14,6 +14,7 @@ Groq inference provider for ultra-fast inference using Groq's LPU technology.
  | Field | Type | Required | Default | Description |
  |-------|------|----------|---------|-------------|
+ | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
  | `api_key` | `str \| None` | No | | The Groq API key |
  | `url` | `<class 'str'>` | No | https://api.groq.com | The URL for the Groq AI server |

@@ -14,6 +14,7 @@ Llama OpenAI-compatible provider for using Llama models with OpenAI API format.
  | Field | Type | Required | Default | Description |
  |-------|------|----------|---------|-------------|
+ | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
  | `api_key` | `str \| None` | No | | The Llama API key |
  | `openai_compat_api_base` | `<class 'str'>` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |

@@ -14,6 +14,7 @@ NVIDIA inference provider for accessing NVIDIA NIM models and AI services.
  | Field | Type | Required | Default | Description |
  |-------|------|----------|---------|-------------|
+ | `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
  | `url` | `<class 'str'>` | No | https://integrate.api.nvidia.com | A base url for accessing the NVIDIA NIM |
  | `api_key` | `pydantic.types.SecretStr \| None` | No | | The NVIDIA API key, only needed if using the hosted service |
  | `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |
@ -14,6 +14,7 @@ Ollama inference provider for running local models through the Ollama runtime.
|
|||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
|
||||
| `url` | `<class 'str'>` | No | http://localhost:11434 | |
|
||||
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically |
|
||||
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ OpenAI inference provider for accessing GPT models and other OpenAI services.
|
|||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
|
||||
| `api_key` | `str \| None` | No | | API key for OpenAI models |
|
||||
| `base_url` | `<class 'str'>` | No | https://api.openai.com/v1 | Base URL for OpenAI API |
|
||||
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ Passthrough inference provider for connecting to any external inference service
|
|||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
|
||||
| `url` | `<class 'str'>` | No | | The URL for the passthrough endpoint |
|
||||
| `api_key` | `pydantic.types.SecretStr \| None` | No | | API Key for the passthrouth endpoint |
|
||||
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ RunPod inference provider for running models on RunPod's cloud GPU platform.
|
|||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
|
||||
| `url` | `str \| None` | No | | The URL for the Runpod model serving endpoint |
|
||||
| `api_token` | `str \| None` | No | | The API token |
|
||||
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ SambaNova inference provider for running models on SambaNova's dataflow architec
|
|||
|
||||
| Field | Type | Required | Default | Description |
|
||||
|-------|------|----------|---------|-------------|
|
||||
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
|
||||
| `url` | `<class 'str'>` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
|
||||
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The SambaNova cloud API Key |
|
||||
|
||||
|
|
|
|||
|
|
@@ -14,6 +14,7 @@ Text Generation Inference (TGI) provider for HuggingFace model serving.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `url` | `<class 'str'>` | No | | The URL for the TGI serving endpoint |

## Sample Configuration
@@ -53,6 +53,7 @@ Available Models:

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `project` | `<class 'str'>` | No | | Google Cloud project ID for Vertex AI |
| `location` | `<class 'str'>` | No | us-central1 | Google Cloud location for Vertex AI |
@@ -14,6 +14,7 @@ Remote vLLM inference provider for connecting to vLLM servers.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `url` | `str \| None` | No | | The URL for the vLLM model serving endpoint |
| `max_tokens` | `<class 'int'>` | No | 4096 | Maximum number of tokens to generate. |
| `api_token` | `str \| None` | No | fake | The API token |
@@ -14,6 +14,7 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `url` | `<class 'str'>` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing watsonx.ai |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The watsonx API key |
| `project_id` | `str \| None` | No | | The Project ID key |
@@ -14,6 +14,7 @@ AWS Bedrock safety provider for content moderation using AWS's safety services.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `aws_access_key_id` | `str \| None` | No | | The AWS access key to use. Defaults to the environment variable AWS_ACCESS_KEY_ID |
| `aws_secret_access_key` | `str \| None` | No | | The AWS secret access key to use. Defaults to the environment variable AWS_SECRET_ACCESS_KEY |
| `aws_session_token` | `str \| None` | No | | The AWS session token to use. Defaults to the environment variable AWS_SESSION_TOKEN |
@@ -16,14 +16,14 @@ Meta's reference implementation of telemetry and observability using OpenTelemetry.

|-------|------|----------|---------|-------------|
| `otel_exporter_otlp_endpoint` | `str \| None` | No | | The OpenTelemetry collector endpoint URL (base URL for traces, metrics, and logs). If not set, the SDK will use OTEL_EXPORTER_OTLP_ENDPOINT environment variable. |
| `service_name` | `<class 'str'>` | No | | The service name to use for telemetry |
| `sinks` | `list[inline.telemetry.meta_reference.config.TelemetrySink]` | No | [<TelemetrySink.CONSOLE: 'console'>, <TelemetrySink.SQLITE: 'sqlite'>] | List of telemetry sinks to enable (possible values: otel_trace, otel_metric, sqlite, console) |
| `sinks` | `list[inline.telemetry.meta_reference.config.TelemetrySink]` | No | [<TelemetrySink.SQLITE: 'sqlite'>] | List of telemetry sinks to enable (possible values: otel_trace, otel_metric, sqlite, console) |
| `sqlite_db_path` | `<class 'str'>` | No | ~/.llama/runtime/trace_store.db | The path to the SQLite database to use for storing traces |

## Sample Configuration

```yaml
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
sinks: ${env.TELEMETRY_SINKS:=sqlite}
sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/trace_store.db
otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
```
@@ -15,6 +15,50 @@ const config: Config = {
  onBrokenMarkdownLinks: "warn",
  favicon: "img/favicon.ico",

  // Enhanced favicon and meta configuration
  headTags: [
    {
      tagName: 'link',
      attributes: {
        rel: 'icon',
        type: 'image/png',
        sizes: '32x32',
        href: '/img/favicon-32x32.png',
      },
    },
    {
      tagName: 'link',
      attributes: {
        rel: 'icon',
        type: 'image/png',
        sizes: '16x16',
        href: '/img/favicon-16x16.png',
      },
    },
    {
      tagName: 'link',
      attributes: {
        rel: 'apple-touch-icon',
        sizes: '180x180',
        href: '/img/llama-stack-logo.png',
      },
    },
    {
      tagName: 'meta',
      attributes: {
        name: 'theme-color',
        content: '#7C3AED', // Purple color from your logo
      },
    },
    {
      tagName: 'link',
      attributes: {
        rel: 'manifest',
        href: '/site.webmanifest',
      },
    },
  ],

  // GitHub pages deployment config.
  organizationName: 'reluctantfuturist',
  projectName: 'llama-stack',
@@ -26,9 +70,6 @@ const config: Config = {
    {
      docs: {
        sidebarPath: require.resolve("./sidebars.ts"),
        // Please change this to your repo.
        // Remove this to remove the "edit this page" links.
        editUrl: 'https://github.com/meta-llama/llama-stack/tree/main/docs/',
        docItemComponent: "@theme/ApiItem", // Derived from docusaurus-theme-openapi
      },
      blog: false,
@@ -55,10 +96,27 @@ const config: Config = {
          label: 'Docs',
        },
        {
          type: 'docSidebar',
          sidebarId: 'apiSidebar',
          position: 'left',
          type: 'dropdown',
          label: 'API Reference',
          position: 'left',
          to: '/docs/api-overview',
          items: [
            {
              type: 'docSidebar',
              sidebarId: 'stableApiSidebar',
              label: '🟢 Stable APIs',
            },
            {
              type: 'docSidebar',
              sidebarId: 'experimentalApiSidebar',
              label: '🟡 Experimental APIs',
            },
            {
              type: 'docSidebar',
              sidebarId: 'deprecatedApiSidebar',
              label: '🔴 Deprecated APIs',
            },
          ],
        },
        {
          href: 'https://github.com/llamastack/llama-stack',
@@ -83,7 +141,7 @@ const config: Config = {
            },
            {
              label: 'API Reference',
              to: '/docs/api/llama-stack-specification',
              to: '/docs/api-overview',
            },
          ],
        },
@@ -170,7 +228,7 @@ const config: Config = {
        id: "openapi",
        docsPluginId: "classic",
        config: {
          llamastack: {
          stable: {
            specPath: "static/llama-stack-spec.yaml",
            outputDir: "docs/api",
            downloadUrl: "https://raw.githubusercontent.com/meta-llama/llama-stack/main/docs/static/llama-stack-spec.yaml",
@@ -179,6 +237,24 @@ const config: Config = {
              categoryLinkSource: "tag",
            },
          } satisfies OpenApiPlugin.Options,
          experimental: {
            specPath: "static/experimental-llama-stack-spec.yaml",
            outputDir: "docs/api-experimental",
            downloadUrl: "https://raw.githubusercontent.com/meta-llama/llama-stack/main/docs/static/experimental-llama-stack-spec.yaml",
            sidebarOptions: {
              groupPathsBy: "tag",
              categoryLinkSource: "tag",
            },
          } satisfies OpenApiPlugin.Options,
          deprecated: {
            specPath: "static/deprecated-llama-stack-spec.yaml",
            outputDir: "docs/api-deprecated",
            downloadUrl: "https://raw.githubusercontent.com/meta-llama/llama-stack/main/docs/static/deprecated-llama-stack-spec.yaml",
            sidebarOptions: {
              groupPathsBy: "tag",
              categoryLinkSource: "tag",
            },
          } satisfies OpenApiPlugin.Options,
        } satisfies Plugin.PluginOptions,
      },
    ],
@@ -34,40 +34,59 @@ def str_presenter(dumper, data):
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)


def main(output_dir: str):
    output_dir = Path(output_dir)
    if not output_dir.exists():
        raise ValueError(f"Directory {output_dir} does not exist")
def generate_spec(output_dir: Path, stability_filter: str = None, main_spec: bool = False, combined_spec: bool = False):
    """Generate OpenAPI spec with optional stability filtering."""

    # Validate API protocols before generating spec
    return_type_errors = validate_api()
    if return_type_errors:
        print("\nAPI Method Return Type Validation Errors:\n")
        for error in return_type_errors:
            print(error, file=sys.stderr)
        sys.exit(1)
    now = str(datetime.now())
    print(
        "Converting the spec to YAML (openapi.yaml) and HTML (openapi.html) at " + now
    )
    print("")
    if combined_spec:
        # Special case for combined stable + experimental APIs
        title_suffix = " - Stable & Experimental APIs"
        filename_prefix = "stainless-"
        description_suffix = "\n\n**🔗 COMBINED**: This specification includes both stable production-ready APIs and experimental pre-release APIs. Use stable APIs for production deployments and experimental APIs for testing new features."
        # Use the special "stainless" filter to include stable + experimental APIs
        stability_filter = "stainless"
    elif stability_filter:
        title_suffix = {
            "stable": " - Stable APIs" if not main_spec else "",
            "experimental": " - Experimental APIs",
            "deprecated": " - Deprecated APIs"
        }.get(stability_filter, f" - {stability_filter.title()} APIs")

        # Use main spec filename for stable when main_spec=True
        if main_spec and stability_filter == "stable":
            filename_prefix = ""
        else:
            filename_prefix = f"{stability_filter}-"

        description_suffix = {
            "stable": "\n\n**✅ STABLE**: Production-ready APIs with backward compatibility guarantees.",
            "experimental": "\n\n**🧪 EXPERIMENTAL**: Pre-release APIs (v1alpha, v1beta) that may change before becoming stable.",
            "deprecated": "\n\n**⚠️ DEPRECATED**: Legacy APIs that may be removed in future versions. Use for migration reference only."
        }.get(stability_filter, "")
    else:
        title_suffix = ""
        filename_prefix = ""
        description_suffix = ""

    spec = Specification(
        LlamaStack,
        Options(
            server=Server(url="http://any-hosted-llama-stack.com"),
            info=Info(
                title="Llama Stack Specification",
                title=f"Llama Stack Specification{title_suffix}",
                version=LLAMA_STACK_API_V1,
                description="""This is the specification of the Llama Stack that provides
                description=f"""This is the specification of the Llama Stack that provides
                a set of endpoints and their corresponding interfaces that are tailored to
                best leverage Llama Models.""",
                best leverage Llama Models.{description_suffix}""",
            ),
            include_standard_error_responses=True,
            stability_filter=stability_filter,  # Pass the filter to the generator
        ),
    )

    with open(output_dir / "llama-stack-spec.yaml", "w", encoding="utf-8") as fp:
    yaml_filename = f"{filename_prefix}llama-stack-spec.yaml"
    html_filename = f"{filename_prefix}llama-stack-spec.html"

    with open(output_dir / yaml_filename, "w", encoding="utf-8") as fp:
        y = yaml.YAML()
        y.default_flow_style = False
        y.block_seq_indent = 2
@@ -83,9 +102,39 @@ def main(output_dir: str):
            fp,
        )

    with open(output_dir / "llama-stack-spec.html", "w") as fp:
    with open(output_dir / html_filename, "w") as fp:
        spec.write_html(fp, pretty_print=True)

    print(f"Generated {yaml_filename} and {html_filename}")


def main(output_dir: str):
    output_dir = Path(output_dir)
    if not output_dir.exists():
        raise ValueError(f"Directory {output_dir} does not exist")

    # Validate API protocols before generating spec
    return_type_errors = validate_api()
    if return_type_errors:
        print("\nAPI Method Return Type Validation Errors:\n")
        for error in return_type_errors:
            print(error, file=sys.stderr)
        sys.exit(1)

    now = str(datetime.now())
    print(f"Converting the spec to YAML (openapi.yaml) and HTML (openapi.html) at {now}")
    print("")

    # Generate main spec as stable APIs (llama-stack-spec.yaml)
    print("Generating main specification (stable APIs)...")
    generate_spec(output_dir, "stable", main_spec=True)

    print("Generating other stability-filtered specifications...")
    generate_spec(output_dir, "experimental")
    generate_spec(output_dir, "deprecated")

    print("Generating combined stable + experimental specification...")
    generate_spec(output_dir, combined_spec=True)


if __name__ == "__main__":
    fire.Fire(main)
@@ -7,13 +7,14 @@
import hashlib
import inspect
import ipaddress
import os
import types
import typing
from dataclasses import make_dataclass
from pathlib import Path
from typing import Annotated, Any, Dict, get_args, get_origin, Set, Union

from fastapi import UploadFile
from pydantic import BaseModel

from llama_stack.apis.datatypes import Error
from llama_stack.strong_typing.core import JsonType

@@ -35,6 +36,7 @@ from llama_stack.strong_typing.schema import (
    SchemaOptions,
)
from llama_stack.strong_typing.serialization import json_dump_string, object_to_json
from pydantic import BaseModel

from .operations import (
    EndpointOperation,

@@ -48,6 +50,7 @@ from .specification import (
    Document,
    Example,
    ExampleRef,
    ExtraBodyParameter,
    MediaType,
    Operation,
    Parameter,
@@ -546,6 +549,84 @@ class Generator:

        return extra_tags

    def _get_api_group_for_operation(self, op) -> str | None:
        """
        Determine the API group for an operation based on its route path.

        Args:
            op: The endpoint operation

        Returns:
            The API group name derived from the route, or None if unable to determine
        """
        if not hasattr(op, 'webmethod') or not op.webmethod or not hasattr(op.webmethod, 'route'):
            return None

        route = op.webmethod.route
        if not route or not route.startswith('/'):
            return None

        # Extract API group from route path
        # Examples: /v1/agents/list -> agents-api
        #           /v1/responses   -> responses-api
        #           /v1/models      -> models-api
        path_parts = route.strip('/').split('/')

        if len(path_parts) < 2:
            return None

        # Skip version prefix (v1, v1alpha, v1beta, etc.)
        if path_parts[0].startswith('v1'):
            if len(path_parts) < 2:
                return None
            api_segment = path_parts[1]
        else:
            api_segment = path_parts[0]

        # Convert to supplementary file naming convention
        # agents -> agents-api, responses -> responses-api, etc.
        return f"{api_segment}-api"

    def _load_supplemental_content(self, api_group: str | None) -> str:
        """
        Load supplemental content for an API group based on stability level.

        Follows this resolution order:
        1. docs/supplementary/{stability}/{api_group}.md
        2. docs/supplementary/shared/{api_group}.md (fallback)
        3. Empty string if no files found

        Args:
            api_group: The API group name (e.g., "agents-responses-api"), or None if no mapping exists

        Returns:
            The supplemental content as markdown string, or empty string if not found
        """
        if not api_group:
            return ""

        base_path = Path(__file__).parent.parent.parent / "supplementary"

        # Try stability-specific content first if stability filter is set
        if self.options.stability_filter:
            stability_path = base_path / self.options.stability_filter / f"{api_group}.md"
            if stability_path.exists():
                try:
                    return stability_path.read_text(encoding="utf-8")
                except Exception as e:
                    print(f"Warning: Could not read stability-specific supplemental content from {stability_path}: {e}")

        # Fall back to shared content
        shared_path = base_path / "shared" / f"{api_group}.md"
        if shared_path.exists():
            try:
                return shared_path.read_text(encoding="utf-8")
            except Exception as e:
                print(f"Warning: Could not read shared supplemental content from {shared_path}: {e}")

        # No supplemental content found
        return ""

    def _build_operation(self, op: EndpointOperation) -> Operation:
        if op.defining_class.__name__ in [
            "SyntheticDataGeneration",
@@ -597,6 +678,27 @@ class Generator:
        # parameters passed anywhere
        parameters = path_parameters + query_parameters

        # Build extra body parameters documentation
        extra_body_parameters = []
        for param_name, param_type, description in op.extra_body_params:
            if is_type_optional(param_type):
                inner_type: type = unwrap_optional_type(param_type)
                required = False
            else:
                inner_type = param_type
                required = True

            # Use description from ExtraBodyField if available, otherwise from docstring
            param_description = description or doc_params.get(param_name)

            extra_body_param = ExtraBodyParameter(
                name=param_name,
                schema=self.schema_builder.classdef_to_ref(inner_type),
                description=param_description,
                required=required,
            )
            extra_body_parameters.append(extra_body_param)

        webmethod = getattr(op.func_ref, "__webmethod__", None)
        raw_bytes_request_body = False
        if webmethod:
@@ -797,10 +899,14 @@ class Generator:
        else:
            callbacks = None

        description = "\n".join(
        # Build base description from docstring
        base_description = "\n".join(
            filter(None, [doc_string.short_description, doc_string.long_description])
        )

        # Individual endpoints get clean descriptions only
        description = base_description

        return Operation(
            tags=[
                getattr(op.defining_class, "API_NAMESPACE", op.defining_class.__name__)
@@ -811,16 +917,126 @@ class Generator:
            requestBody=requestBody,
            responses=responses,
            callbacks=callbacks,
            deprecated=True if "DEPRECATED" in op.func_name else None,
            deprecated=getattr(op.webmethod, "deprecated", False)
            or "DEPRECATED" in op.func_name,
            security=[] if op.public else None,
            extraBodyParameters=extra_body_parameters if extra_body_parameters else None,
        )

    def _get_api_stability_priority(self, api_level: str) -> int:
        """
        Return sorting priority for API stability levels.
        Lower numbers = higher priority (appear first)

        :param api_level: The API level (e.g., "v1", "v1beta", "v1alpha")
        :return: Priority number for sorting
        """
        stability_order = {
            "v1": 0,       # Stable - highest priority
            "v1beta": 1,   # Beta - medium priority
            "v1alpha": 2,  # Alpha - lowest priority
        }
        return stability_order.get(api_level, 999)  # Unknown levels go last

    def generate(self) -> Document:
        paths: Dict[str, PathItem] = {}
        endpoint_classes: Set[type] = set()
        for op in get_endpoint_operations(
            self.endpoint, use_examples=self.options.use_examples
        ):

        # Collect all operations and filter by stability if specified
        operations = list(
            get_endpoint_operations(
                self.endpoint, use_examples=self.options.use_examples
            )
        )

        # Filter operations by stability level if requested
        if self.options.stability_filter:
            filtered_operations = []
            for op in operations:
                deprecated = (
                    getattr(op.webmethod, "deprecated", False)
                    or "DEPRECATED" in op.func_name
                )
                stability_level = op.webmethod.level

                if self.options.stability_filter == "stable":
                    # Include v1 non-deprecated endpoints
                    if stability_level == "v1" and not deprecated:
                        filtered_operations.append(op)
                elif self.options.stability_filter == "experimental":
                    # Include v1alpha and v1beta endpoints (deprecated or not)
                    if stability_level in ["v1alpha", "v1beta"]:
                        filtered_operations.append(op)
                elif self.options.stability_filter == "deprecated":
                    # Include only deprecated endpoints
                    if deprecated:
                        filtered_operations.append(op)
                elif self.options.stability_filter == "stainless":
                    # Include both stable (v1 non-deprecated) and experimental (v1alpha, v1beta) endpoints
                    if (stability_level == "v1" and not deprecated) or stability_level in ["v1alpha", "v1beta"]:
                        filtered_operations.append(op)

            operations = filtered_operations
            print(
                f"Filtered to {len(operations)} operations for stability level: {self.options.stability_filter}"
            )

        # Sort operations by multiple criteria for consistent ordering:
        # 1. Stability level with deprecation handling (global priority):
        #    - Active stable (v1) comes first
        #    - Beta (v1beta) comes next
        #    - Alpha (v1alpha) comes next
        #    - Deprecated stable (v1 deprecated) comes last
        # 2. Route path (group related endpoints within same stability level)
        # 3. HTTP method (GET, POST, PUT, DELETE, PATCH)
        # 4. Operation name (alphabetical)
        def sort_key(op):
            http_method_order = {
                HTTPMethod.GET: 0,
                HTTPMethod.POST: 1,
                HTTPMethod.PUT: 2,
                HTTPMethod.DELETE: 3,
                HTTPMethod.PATCH: 4,
            }

            # Enhanced stability priority for migration pattern support
            deprecated = getattr(op.webmethod, "deprecated", False)
            stability_priority = self._get_api_stability_priority(op.webmethod.level)

            # Deprecated versions should appear after everything else
            # This ensures deprecated stable endpoints come last globally
            if deprecated:
                stability_priority += 10  # Push deprecated endpoints to the end

            return (
                stability_priority,  # Global stability handling comes first
                op.get_route(
                    op.webmethod
                ),  # Group by route path within stability level
                http_method_order.get(op.http_method, 999),
                op.func_name,
            )

        operations.sort(key=sort_key)

        # Debug output for migration pattern tracking
        migration_routes = {}
        for op in operations:
            route_key = (op.get_route(op.webmethod), op.http_method)
            if route_key not in migration_routes:
                migration_routes[route_key] = []
            migration_routes[route_key].append(
                (op.webmethod.level, getattr(op.webmethod, "deprecated", False))
            )

        for route_key, versions in migration_routes.items():
            if len(versions) > 1:
                print(f"Migration pattern detected for {route_key[1]} {route_key[0]}:")
                for level, deprecated in versions:
                    status = "DEPRECATED" if deprecated else "ACTIVE"
                    print(f"  - {level} ({status})")

        for op in operations:
            endpoint_classes.add(op.defining_class)

            operation = self._build_operation(op)
@@ -851,10 +1067,22 @@ class Generator:
            doc_string = parse_type(cls)
            if hasattr(cls, "API_NAMESPACE") and cls.API_NAMESPACE != cls.__name__:
                continue

            # Add supplemental content to tag pages
            api_group = f"{cls.__name__.lower()}-api"
            supplemental_content = self._load_supplemental_content(api_group)

            tag_description = doc_string.long_description or ""
            if supplemental_content:
                if tag_description:
                    tag_description = f"{tag_description}\n\n{supplemental_content}"
                else:
                    tag_description = supplemental_content

            operation_tags.append(
                Tag(
                    name=cls.__name__,
                    description=doc_string.long_description,
                    description=tag_description,
                    displayName=doc_string.short_description,
                )
            )
@@ -19,10 +19,12 @@ from llama_stack.strong_typing.inspection import get_signature

from typing import get_origin, get_args

from fastapi import UploadFile
from fastapi import UploadFile
from fastapi.params import File, Form
from typing import Annotated

from llama_stack.schema_utils import ExtraBodyField


def split_prefix(
    s: str, sep: str, prefix: Union[str, Iterable[str]]

@@ -89,6 +91,7 @@ class EndpointOperation:
    :param query_params: Parameters of the operation signature that are passed in the query string as `key=value` pairs.
    :param request_params: The parameter that corresponds to the data transmitted in the request body.
    :param multipart_params: Parameters that indicate multipart/form-data request body.
    :param extra_body_params: Parameters that arrive via extra_body and are documented but not in SDK.
    :param event_type: The Python type of the data that is transmitted out-of-band (e.g. via websockets) while the operation is in progress.
    :param response_type: The Python type of the data that is transmitted in the response body.
    :param http_method: The HTTP method used to invoke the endpoint such as POST, GET or PUT.

@@ -106,6 +109,7 @@ class EndpointOperation:
    query_params: List[OperationParameter]
    request_params: Optional[OperationParameter]
    multipart_params: List[OperationParameter]
    extra_body_params: List[tuple[str, type, str | None]]
    event_type: Optional[type]
    response_type: type
    http_method: HTTPMethod

@@ -265,6 +269,7 @@ def get_endpoint_operations(
            query_params = []
            request_params = []
            multipart_params = []
            extra_body_params = []

            for param_name, parameter in signature.parameters.items():
                param_type = _get_annotation_type(parameter.annotation, func_ref)

@@ -279,6 +284,13 @@ def get_endpoint_operations(
                        f"parameter '{param_name}' in function '{func_name}' has no type annotation"
                    )

                # Check if this is an extra_body parameter
                is_extra_body, extra_body_desc = _is_extra_body_param(param_type)
                if is_extra_body:
                    # Store in a separate list for documentation
                    extra_body_params.append((param_name, param_type, extra_body_desc))
                    continue  # Skip adding to request_params

                is_multipart = _is_multipart_param(param_type)

                if prefix in ["get", "delete"]:

@@ -351,6 +363,7 @@ def get_endpoint_operations(
                    query_params=query_params,
                    request_params=request_params,
                    multipart_params=multipart_params,
                    extra_body_params=extra_body_params,
                    event_type=event_type,
                    response_type=response_type,
                    http_method=http_method,
@@ -403,7 +416,7 @@ def get_endpoint_events(endpoint: type) -> Dict[str, type]:
def _is_multipart_param(param_type: type) -> bool:
    """
    Check if a parameter type indicates multipart form data.

    Returns True if the type is:
    - UploadFile
    - Annotated[UploadFile, File()]

@@ -413,19 +426,38 @@ def _is_multipart_param(param_type: type) -> bool:
    """
    if param_type is UploadFile:
        return True

    # Check for Annotated types
    origin = get_origin(param_type)
    if origin is None:
        return False

    if origin is Annotated:
        args = get_args(param_type)
        if len(args) < 2:
            return False

        # Check the annotations for File() or Form()
        for annotation in args[1:]:
            if isinstance(annotation, (File, Form)):
                return True
    return False


def _is_extra_body_param(param_type: type) -> tuple[bool, str | None]:
    """
    Check if parameter is marked as coming from extra_body.

    Returns:
        (is_extra_body, description): Tuple of boolean and optional description
    """
    origin = get_origin(param_type)
    if origin is Annotated:
        args = get_args(param_type)
        for annotation in args[1:]:
            if isinstance(annotation, ExtraBodyField):
                return True, annotation.description
            # Also check by type name for cases where import matters
            if type(annotation).__name__ == 'ExtraBodyField':
                return True, getattr(annotation, 'description', None)
    return False, None
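The pattern that `_is_extra_body_param` looks for is an `Annotated` parameter carrying an `ExtraBodyField` marker. The sketch below is illustrative only (the endpoint class, route, and parameter names are made up); it shows the annotation shape that gets collected into `extra_body_params` and skipped when building the request-body schema.

```python
# Illustrative sketch: a parameter annotated with ExtraBodyField is documented as an
# extra_body parameter rather than as part of the JSON request body.
from typing import Annotated

from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.schema_utils import ExtraBodyField, webmethod


class ExampleAPI:
    @webmethod(route="/example", method="POST", level=LLAMA_STACK_API_V1)
    async def create_example(
        self,
        prompt: str,
        shields: Annotated[
            list[str] | None,
            ExtraBodyField("List of shields to apply during generation."),
        ] = None,  # picked up by _is_extra_body_param(), not by request_params
    ) -> dict: ...
```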
@@ -54,6 +54,7 @@ class Options:
    property_description_fun: Optional[Callable[[type, str, str], str]] = None
    captions: Optional[Dict[str, str]] = None
    include_standard_error_responses: bool = True
    stability_filter: Optional[str] = None

    default_captions: ClassVar[Dict[str, str]] = {
        "Operations": "Operations",
@@ -106,6 +106,15 @@ class Parameter:
    example: Optional[Any] = None


@dataclass
class ExtraBodyParameter:
    """Represents a parameter that arrives via extra_body in the request."""
    name: str
    schema: SchemaOrRef
    description: Optional[str] = None
    required: Optional[bool] = None


@dataclass
class Operation:
    responses: Dict[str, Union[Response, ResponseRef]]

@@ -118,6 +127,7 @@ class Operation:
    callbacks: Optional[Dict[str, "Callback"]] = None
    security: Optional[List["SecurityRequirement"]] = None
    deprecated: Optional[bool] = None
    extraBodyParameters: Optional[List[ExtraBodyParameter]] = None


@dataclass
@@ -52,6 +52,17 @@ class Specification:
            if display_name:
                tag["x-displayName"] = display_name

        # Handle operations to rename extraBodyParameters -> x-llama-stack-extra-body-params
        paths = json_doc.get("paths", {})
        for path_item in paths.values():
            if isinstance(path_item, dict):
                for method in ["get", "post", "put", "delete", "patch"]:
                    operation = path_item.get(method)
                    if operation and isinstance(operation, dict):
                        extra_body_params = operation.pop("extraBodyParameters", None)
                        if extra_body_params:
                            operation["x-llama-stack-extra-body-params"] = extra_body_params

        return json_doc

    def get_json_string(self, pretty_print: bool = False) -> str:
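The net effect of this rename is that documented extra-body parameters surface as a vendor extension on each operation in the emitted spec. A rough sketch of the resulting shape, with placeholder values; only the structure mirrors `ExtraBodyParameter(name, schema, description, required)`:

```python
# Placeholder values for illustration; this is what an operation dict in json_doc
# might look like after the rename above.
operation = {
    "summary": "Create a new OpenAI response.",
    "x-llama-stack-extra-body-params": [
        {
            "name": "shields",
            "schema": {"$ref": "#/components/schemas/ResponseShield"},
            "description": "List of shields to apply during response generation.",
            "required": False,
        }
    ],
}
```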
@@ -16,7 +16,7 @@ const sidebars: SidebarsConfig = {
    {
      type: 'category',
      label: 'Getting Started',
      collapsed: false,
      collapsed: true,
      items: [
        'getting_started/quickstart',
        'getting_started/detailed_tutorial',

@@ -26,7 +26,7 @@ const sidebars: SidebarsConfig = {
    {
      type: 'category',
      label: 'Concepts',
      collapsed: false,
      collapsed: true,
      items: [
        'concepts/index',
        'concepts/architecture',

@@ -48,7 +48,7 @@ const sidebars: SidebarsConfig = {
    {
      type: 'category',
      label: 'Distributions',
      collapsed: false,
      collapsed: true,
      items: [
        'distributions/index',
        'distributions/list_of_distributions',

@@ -93,7 +93,7 @@ const sidebars: SidebarsConfig = {
    {
      type: 'category',
      label: 'Providers',
      collapsed: false,
      collapsed: true,
      items: [
        'providers/index',
        {

@@ -276,7 +276,7 @@ const sidebars: SidebarsConfig = {
    {
      type: 'category',
      label: 'Building Applications',
      collapsed: false,
      collapsed: true,
      items: [
        'building_applications/index',
        'building_applications/rag',

@@ -293,7 +293,7 @@ const sidebars: SidebarsConfig = {
    {
      type: 'category',
      label: 'Advanced APIs',
      collapsed: false,
      collapsed: true,
      items: [
        'advanced_apis/post_training',
        'advanced_apis/evaluation',

@@ -303,7 +303,7 @@ const sidebars: SidebarsConfig = {
    {
      type: 'category',
      label: 'Deploying',
      collapsed: false,
      collapsed: true,
      items: [
        'deploying/index',
        'deploying/kubernetes_deployment',

@@ -313,7 +313,7 @@ const sidebars: SidebarsConfig = {
    {
      type: 'category',
      label: 'Contributing',
      collapsed: false,
      collapsed: true,
      items: [
        'contributing/index',
        'contributing/new_api_provider',

@@ -324,7 +324,7 @@ const sidebars: SidebarsConfig = {
    {
      type: 'category',
      label: 'References',
      collapsed: false,
      collapsed: true,
      items: [
        'references/index',
        'references/llama_cli_reference/index',

@@ -335,8 +335,10 @@ const sidebars: SidebarsConfig = {
    },
  ],

  // API Reference sidebar - use plugin-generated sidebar
  apiSidebar: require('./docs/api/sidebar.ts').default,
  // API Reference sidebars - use plugin-generated sidebars
  stableApiSidebar: require('./docs/api/sidebar.ts').default,
  experimentalApiSidebar: require('./docs/api-experimental/sidebar.ts').default,
  deprecatedApiSidebar: require('./docs/api-deprecated/sidebar.ts').default,
};

export default sidebars;
@@ -189,3 +189,29 @@ button[class*="button"]:hover,
.pagination-nav__link--prev:hover {
  background-color: #f3f4f6 !important;
}

/* Deprecated endpoint styling */
.menu__list-item--deprecated .menu__link {
  text-decoration: line-through !important;
  opacity: 0.7;
  font-style: italic;
}

.menu__list-item--deprecated .menu__link:hover {
  opacity: 0.9;
}

/* Deprecated endpoint badges - slightly muted */
.menu__list-item--deprecated.api-method > .menu__link::before {
  opacity: 0.7;
  border-style: dashed !important;
}

/* Dark theme adjustments for deprecated endpoints */
[data-theme='dark'] .menu__list-item--deprecated .menu__link {
  opacity: 0.6;
}

[data-theme='dark'] .menu__list-item--deprecated .menu__link:hover {
  opacity: 0.8;
}
13427  docs/static/deprecated-llama-stack-spec.html (vendored, new file)
10051  docs/static/deprecated-llama-stack-spec.yaml (vendored, new file)
6450   docs/static/experimental-llama-stack-spec.html (vendored, new file)
4798   docs/static/experimental-llama-stack-spec.yaml (vendored, new file)
BIN    docs/static/img/favicon-16x16.png (vendored, new file, 657 B)
BIN    docs/static/img/favicon-32x32.png (vendored, new file, 1.9 KiB)
BIN    docs/static/img/favicon-48x48.png (vendored, new file, 3.3 KiB)
BIN    docs/static/img/favicon-64x64.png (vendored, new file, 4.9 KiB)
BIN    docs/static/img/favicon.ico (vendored, new file, 679 B)
BIN    docs/static/img/favicon.png (vendored, new file, 1.9 KiB)
BIN    docs/static/img/llama-stack.png (vendored; 71 KiB before, 604 KiB after)
22102  docs/static/llama-stack-spec.html (vendored)
16877  docs/static/llama-stack-spec.yaml (vendored)
BIN    docs/static/llama-stack.png (vendored, removed; 196 KiB before)
36     docs/static/site.webmanifest (vendored, new file)
@@ -0,0 +1,36 @@
{
  "name": "Llama Stack",
  "short_name": "Llama Stack",
  "description": "The open-source framework for building generative AI applications",
  "start_url": "/",
  "display": "standalone",
  "theme_color": "#7C3AED",
  "background_color": "#ffffff",
  "icons": [
    {
      "src": "/img/favicon-16x16.png",
      "sizes": "16x16",
      "type": "image/png"
    },
    {
      "src": "/img/favicon-32x32.png",
      "sizes": "32x32",
      "type": "image/png"
    },
    {
      "src": "/img/favicon-48x48.png",
      "sizes": "48x48",
      "type": "image/png"
    },
    {
      "src": "/img/favicon-64x64.png",
      "sizes": "64x64",
      "type": "image/png"
    },
    {
      "src": "/img/llama-stack-logo.png",
      "sizes": "200x200",
      "type": "image/png"
    }
  ]
}
18601  docs/static/stainless-llama-stack-spec.html (vendored, new file)
13870  docs/static/stainless-llama-stack-spec.yaml (vendored, new file)
9      docs/supplementary/deprecated/agents-api.md (new file)

@@ -0,0 +1,9 @@
## Deprecated APIs

> **⚠️ DEPRECATED**: These APIs are provided for migration reference and will be removed in future versions. Not recommended for new projects.

### Migration Guidance

If you are using deprecated versions of the Agents or Responses APIs, please migrate to:

- **Responses API**: Use the stable v1 Responses API endpoints (a minimal request sketch follows below)
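A minimal migration sketch, assuming a locally running Llama Stack server and an OpenAI-compatible client; the base URL, port, and model id are placeholders. The only point it illustrates is targeting the stable `/v1/responses` route rather than the deprecated `/openai/v1/...` prefix.

```python
from openai import OpenAI

# Placeholder server address and model id; adjust for your deployment.
client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

response = client.responses.create(
    model="llama3.2:3b",
    input="Summarize the Responses API migration in one sentence.",
)
print(response.output_text)
```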
21  docs/supplementary/experimental/agents-api.md (new file)

@@ -0,0 +1,21 @@
## Agents API (Experimental)

> **🧪 EXPERIMENTAL**: This API is in preview and may change based on user feedback. Great for exploring new capabilities and providing feedback to influence the final design.

Main functionalities provided by this API (a rough usage sketch follows at the end of this page):

- Create agents with specific instructions and ability to use tools.
- Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
- Agents can be provided with various shields (see the Safety API for more details).
- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.

### 🧪 Feedback Welcome

This API is actively being developed. We welcome feedback on:
- API design and usability
- Performance characteristics
- Missing features or capabilities
- Integration patterns

**Provide Feedback**: [GitHub Discussions](https://github.com/llamastack/llama-stack/discussions) or [GitHub Issues](https://github.com/llamastack/llama-stack/issues)
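A rough usage sketch under stated assumptions: the server address, the `v1alpha` URL prefix for experimental routes, the `agent_config` fields shown, and the response field names are illustrative; consult the generated experimental API reference for the exact schema.

```python
import requests

BASE = "http://localhost:8321"  # assumption: local Llama Stack server

# Create an agent on the experimental (v1alpha) route, then inspect it and list its sessions.
agent = requests.post(
    f"{BASE}/v1alpha/agents",
    json={"agent_config": {"model": "llama3.2:3b", "instructions": "You are a helpful assistant."}},
).json()

agent_id = agent["agent_id"]  # field name assumed for illustration
details = requests.get(f"{BASE}/v1alpha/agents/{agent_id}").json()
sessions = requests.get(f"{BASE}/v1alpha/agents/{agent_id}/sessions").json()
print(details, sessions)
```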
40  docs/supplementary/stable/agents-api.md (new file)

@@ -0,0 +1,40 @@
## Responses API

The Responses API provides OpenAI-compatible functionality with enhanced capabilities for dynamic, stateful interactions.

> **✅ STABLE**: This API is production-ready with backward compatibility guarantees. Recommended for production applications.

### ✅ Supported Tools

The Responses API supports the following tool types (a usage sketch follows at the end of this section):

- **`web_search`**: Search the web for current information and real-time data
- **`file_search`**: Search through uploaded files and vector stores
  - Supports dynamic `vector_store_ids` per call
  - Compatible with OpenAI file search patterns
- **`function`**: Call custom functions with JSON schema validation
- **`mcp_tool`**: Model Context Protocol integration

### ✅ Supported Fields & Features

**Core Capabilities:**
- **Dynamic Configuration**: Switch models, vector stores, and tools per request without pre-configuration
- **Conversation Branching**: Use `previous_response_id` to branch conversations and explore different paths
- **Rich Annotations**: Automatic file citations, URL citations, and container file citations
- **Status Tracking**: Monitor tool call execution status and handle failures gracefully

### 🚧 Work in Progress

- Full real-time response streaming support
- `tool_choice` parameter
- `max_tool_calls` parameter
- Built-in tools (code interpreter, containers API)
- Safety & guardrails
- `reasoning` capabilities
- `service_tier`
- `logprobs`
- `max_output_tokens`
- `metadata` handling
- `instructions`
- `incomplete_details`
- `background`
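A usage sketch for the tool support and branching described above, assuming an OpenAI-compatible client pointed at a Llama Stack deployment; the server URL, vector store id, and model name are placeholders.

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")  # placeholder URL

# file_search with a per-call vector store id
first = client.responses.create(
    model="llama3.2:3b",
    input="What does our onboarding doc say about API keys?",
    tools=[{"type": "file_search", "vector_store_ids": ["vs_demo"]}],
)

# Branch the conversation from the first response via previous_response_id
follow_up = client.responses.create(
    model="llama3.2:3b",
    input="Condense that into two bullet points.",
    previous_response_id=first.id,
)
print(follow_up.output_text)
```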
@ -28,7 +28,7 @@ from llama_stack.apis.inference import (
|
|||
from llama_stack.apis.safety import SafetyViolation
|
||||
from llama_stack.apis.tools import ToolDef
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
|
||||
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
|
||||
from llama_stack.schema_utils import ExtraBodyField, json_schema_type, register_schema, webmethod
|
||||
|
||||
from .openai_responses import (
|
||||
ListOpenAIResponseInputItem,
|
||||
|
|
@ -42,6 +42,20 @@ from .openai_responses import (
|
|||
)
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class ResponseShieldSpec(BaseModel):
|
||||
"""Specification for a shield to apply during response generation.
|
||||
|
||||
:param type: The type/identifier of the shield.
|
||||
"""
|
||||
|
||||
type: str
|
||||
# TODO: more fields to be added for shield configuration
|
||||
|
||||
|
||||
ResponseShield = str | ResponseShieldSpec
|
||||
|
||||
|
||||
class Attachment(BaseModel):
|
||||
"""An attachment to an agent turn.
|
||||
|
||||
|
|
@ -472,20 +486,23 @@ class AgentStepResponse(BaseModel):
|
|||
|
||||
@runtime_checkable
|
||||
class Agents(Protocol):
|
||||
"""Agents API for creating and interacting with agentic systems.
|
||||
"""Agents
|
||||
|
||||
Main functionalities provided by this API:
|
||||
- Create agents with specific instructions and ability to use tools.
|
||||
- Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
|
||||
- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
|
||||
- Agents can be provided with various shields (see the Safety API for more details).
|
||||
- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.
|
||||
"""
|
||||
APIs for creating and interacting with agentic systems."""
|
||||
|
||||
@webmethod(
|
||||
route="/agents", method="POST", descriptive_name="create_agent", deprecated=True, level=LLAMA_STACK_API_V1
|
||||
route="/agents",
|
||||
method="POST",
|
||||
descriptive_name="create_agent",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/agents",
|
||||
method="POST",
|
||||
descriptive_name="create_agent",
|
||||
level=LLAMA_STACK_API_V1ALPHA,
|
||||
)
|
||||
@webmethod(route="/agents", method="POST", descriptive_name="create_agent", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def create_agent(
|
||||
self,
|
||||
agent_config: AgentConfig,
|
||||
|
|
@ -648,8 +665,17 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/agents/{agent_id}/session/{session_id}", method="GET", deprecated=True, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/agents/{agent_id}/session/{session_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session/{session_id}",
|
||||
method="GET",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session/{session_id}",
|
||||
method="GET",
|
||||
level=LLAMA_STACK_API_V1ALPHA,
|
||||
)
|
||||
async def get_agents_session(
|
||||
self,
|
||||
session_id: str,
|
||||
|
|
@ -666,9 +692,16 @@ class Agents(Protocol):
|
|||
...
|
||||
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session/{session_id}", method="DELETE", deprecated=True, level=LLAMA_STACK_API_V1
|
||||
route="/agents/{agent_id}/session/{session_id}",
|
||||
method="DELETE",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/session/{session_id}",
|
||||
method="DELETE",
|
||||
level=LLAMA_STACK_API_V1ALPHA,
|
||||
)
|
||||
@webmethod(route="/agents/{agent_id}/session/{session_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def delete_agents_session(
|
||||
self,
|
||||
session_id: str,
|
||||
|
|
@ -681,7 +714,12 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/agents/{agent_id}", method="DELETE", deprecated=True, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}",
|
||||
method="DELETE",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(route="/agents/{agent_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def delete_agent(
|
||||
self,
|
||||
|
|
@ -704,7 +742,12 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/agents/{agent_id}", method="GET", deprecated=True, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}",
|
||||
method="GET",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(route="/agents/{agent_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def get_agent(self, agent_id: str) -> Agent:
|
||||
"""Describe an agent by its ID.
|
||||
|
|
@ -714,7 +757,12 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/agents/{agent_id}/sessions", method="GET", deprecated=True, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(
|
||||
route="/agents/{agent_id}/sessions",
|
||||
method="GET",
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(route="/agents/{agent_id}/sessions", method="GET", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def list_agent_sessions(
|
||||
self,
|
||||
|
|
@ -738,6 +786,12 @@ class Agents(Protocol):
|
|||
#
|
||||
# Both of these APIs are inherently stateful.
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/responses/{response_id}",
|
||||
method="GET",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(route="/responses/{response_id}", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def get_openai_response(
|
||||
self,
|
||||
|
|
@ -750,6 +804,7 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/responses", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/responses", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def create_openai_response(
|
||||
self,
|
||||
|
|
@ -764,6 +819,12 @@ class Agents(Protocol):
|
|||
tools: list[OpenAIResponseInputTool] | None = None,
|
||||
include: list[str] | None = None,
|
||||
max_infer_iters: int | None = 10, # this is an extension to the OpenAI API
|
||||
shields: Annotated[
|
||||
list[ResponseShield] | None,
|
||||
ExtraBodyField(
|
||||
"List of shields to apply during response generation. Shields provide safety and content moderation."
|
||||
),
|
||||
] = None,
|
||||
) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
|
||||
"""Create a new OpenAI response.
|
||||
|
||||
|
|
@ -771,10 +832,12 @@ class Agents(Protocol):
|
|||
:param model: The underlying LLM used for completions.
|
||||
:param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
|
||||
:param include: (Optional) Additional fields to include in the response.
|
||||
:param shields: (Optional) List of shields to apply during response generation. Can be shield IDs (strings) or shield specifications.
|
||||
:returns: An OpenAIResponseObject.
|
||||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/responses", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/responses", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def list_openai_responses(
|
||||
self,
|
||||
|
|
@ -793,6 +856,9 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/responses/{response_id}/input_items", method="GET", level=LLAMA_STACK_API_V1, deprecated=True
|
||||
)
|
||||
@webmethod(route="/responses/{response_id}/input_items", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def list_openai_response_input_items(
|
||||
self,
|
||||
|
|
@ -815,6 +881,7 @@ class Agents(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1)
|
||||
async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
|
||||
"""Delete an OpenAI response by its ID.
|
||||
|
|
|
|||
|
|
@ -888,6 +888,10 @@ class OpenAIResponseObjectWithInput(OpenAIResponseObject):
|
|||
|
||||
input: list[OpenAIResponseInput]
|
||||
|
||||
def to_response_object(self) -> OpenAIResponseObject:
|
||||
"""Convert to OpenAIResponseObject by excluding input field."""
|
||||
return OpenAIResponseObject(**{k: v for k, v in self.model_dump().items() if k != "input"})
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class ListOpenAIResponseObject(BaseModel):
|
||||
|
|
|
|||
|
|
@ -43,6 +43,7 @@ class Batches(Protocol):
|
|||
Note: This API is currently under active development and may undergo changes.
|
||||
"""
|
||||
|
||||
@webmethod(route="/openai/v1/batches", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/batches", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def create_batch(
|
||||
self,
|
||||
|
|
@ -63,6 +64,7 @@ class Batches(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/batches/{batch_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/batches/{batch_id}", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def retrieve_batch(self, batch_id: str) -> BatchObject:
|
||||
"""Retrieve information about a specific batch.
|
||||
|
|
@ -72,6 +74,7 @@ class Batches(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/batches/{batch_id}/cancel", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/batches/{batch_id}/cancel", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def cancel_batch(self, batch_id: str) -> BatchObject:
|
||||
"""Cancel a batch that is in progress.
|
||||
|
|
@ -81,6 +84,7 @@ class Batches(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/batches", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/batches", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def list_batches(
|
||||
self,
|
||||
|
|
|
|||
31
llama_stack/apis/conversations/__init__.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .conversations import (
|
||||
Conversation,
|
||||
ConversationCreateRequest,
|
||||
ConversationDeletedResource,
|
||||
ConversationItem,
|
||||
ConversationItemCreateRequest,
|
||||
ConversationItemDeletedResource,
|
||||
ConversationItemList,
|
||||
Conversations,
|
||||
ConversationUpdateRequest,
|
||||
Metadata,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"Conversation",
|
||||
"ConversationCreateRequest",
|
||||
"ConversationDeletedResource",
|
||||
"ConversationItem",
|
||||
"ConversationItemCreateRequest",
|
||||
"ConversationItemDeletedResource",
|
||||
"ConversationItemList",
|
||||
"Conversations",
|
||||
"ConversationUpdateRequest",
|
||||
"Metadata",
|
||||
]
|
||||
260
llama_stack/apis/conversations/conversations.py
Normal file
|
|
@ -0,0 +1,260 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import Annotated, Literal, Protocol, runtime_checkable
|
||||
|
||||
from openai import NOT_GIVEN
|
||||
from openai._types import NotGiven
|
||||
from openai.types.responses.response_includable import ResponseIncludable
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from llama_stack.apis.agents.openai_responses import (
|
||||
OpenAIResponseMessage,
|
||||
OpenAIResponseOutputMessageFileSearchToolCall,
|
||||
OpenAIResponseOutputMessageFunctionToolCall,
|
||||
OpenAIResponseOutputMessageMCPCall,
|
||||
OpenAIResponseOutputMessageMCPListTools,
|
||||
OpenAIResponseOutputMessageWebSearchToolCall,
|
||||
)
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1
|
||||
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
|
||||
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
|
||||
|
||||
Metadata = dict[str, str]
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class Conversation(BaseModel):
|
||||
"""OpenAI-compatible conversation object."""
|
||||
|
||||
id: str = Field(..., description="The unique ID of the conversation.")
|
||||
object: Literal["conversation"] = Field(
|
||||
default="conversation", description="The object type, which is always conversation."
|
||||
)
|
||||
created_at: int = Field(
|
||||
..., description="The time at which the conversation was created, measured in seconds since the Unix epoch."
|
||||
)
|
||||
metadata: Metadata | None = Field(
|
||||
default=None,
|
||||
description="Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard.",
|
||||
)
|
||||
items: list[dict] | None = Field(
|
||||
default=None,
|
||||
description="Initial items to include in the conversation context. You may add up to 20 items at a time.",
|
||||
)
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class ConversationMessage(BaseModel):
|
||||
"""OpenAI-compatible message item for conversations."""
|
||||
|
||||
id: str = Field(..., description="unique identifier for this message")
|
||||
content: list[dict] = Field(..., description="message content")
|
||||
role: str = Field(..., description="message role")
|
||||
status: str = Field(..., description="message status")
|
||||
type: Literal["message"] = "message"
|
||||
object: Literal["message"] = "message"
|
||||
|
||||
|
||||
ConversationItem = Annotated[
|
||||
OpenAIResponseMessage
|
||||
| OpenAIResponseOutputMessageFunctionToolCall
|
||||
| OpenAIResponseOutputMessageFileSearchToolCall
|
||||
| OpenAIResponseOutputMessageWebSearchToolCall
|
||||
| OpenAIResponseOutputMessageMCPCall
|
||||
| OpenAIResponseOutputMessageMCPListTools,
|
||||
Field(discriminator="type"),
|
||||
]
|
||||
register_schema(ConversationItem, name="ConversationItem")
|
||||
|
||||
# Using the OpenAI types directly caused issues, but some notes for reference:
# Note that ConversationItem is an Annotated union of the types below:
# from openai.types.responses import *
# from openai.types.responses.response_item import *
# from openai.types.conversations import ConversationItem
# f = [
#     ResponseFunctionToolCallItem,
#     ResponseFunctionToolCallOutputItem,
#     ResponseFileSearchToolCall,
#     ResponseFunctionWebSearch,
#     ImageGenerationCall,
#     ResponseComputerToolCall,
#     ResponseComputerToolCallOutputItem,
#     ResponseReasoningItem,
#     ResponseCodeInterpreterToolCall,
#     LocalShellCall,
#     LocalShellCallOutput,
#     McpListTools,
#     McpApprovalRequest,
#     McpApprovalResponse,
#     McpCall,
#     ResponseCustomToolCall,
#     ResponseCustomToolCallOutput
# ]

@json_schema_type
class ConversationCreateRequest(BaseModel):
    """Request body for creating a conversation."""

    items: list[ConversationItem] | None = Field(
        default=[],
        description="Initial items to include in the conversation context. You may add up to 20 items at a time.",
        max_length=20,
    )
    metadata: Metadata | None = Field(
        default={},
        description="Set of 16 key-value pairs that can be attached to an object. Useful for storing additional information",
        max_length=16,
    )


@json_schema_type
class ConversationUpdateRequest(BaseModel):
    """Request body for updating a conversation."""

    metadata: Metadata = Field(
        ...,
        description="Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard. Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters.",
    )


@json_schema_type
class ConversationDeletedResource(BaseModel):
    """Response for deleted conversation."""

    id: str = Field(..., description="The deleted conversation identifier")
    object: str = Field(default="conversation.deleted", description="Object type")
    deleted: bool = Field(default=True, description="Whether the object was deleted")


@json_schema_type
class ConversationItemCreateRequest(BaseModel):
    """Request body for creating conversation items."""

    items: list[ConversationItem] = Field(
        ...,
        description="Items to include in the conversation context. You may add up to 20 items at a time.",
        max_length=20,
    )


@json_schema_type
class ConversationItemList(BaseModel):
    """List of conversation items with pagination."""

    object: str = Field(default="list", description="Object type")
    data: list[ConversationItem] = Field(..., description="List of conversation items")
    first_id: str | None = Field(default=None, description="The ID of the first item in the list")
    last_id: str | None = Field(default=None, description="The ID of the last item in the list")
    has_more: bool = Field(default=False, description="Whether there are more items available")


@json_schema_type
class ConversationItemDeletedResource(BaseModel):
    """Response for deleted conversation item."""

    id: str = Field(..., description="The deleted item identifier")
    object: str = Field(default="conversation.item.deleted", description="Object type")
    deleted: bool = Field(default=True, description="Whether the object was deleted")

@runtime_checkable
@trace_protocol
class Conversations(Protocol):
    """Protocol for conversation management operations."""

    @webmethod(route="/conversations", method="POST", level=LLAMA_STACK_API_V1)
    async def create_conversation(
        self, items: list[ConversationItem] | None = None, metadata: Metadata | None = None
    ) -> Conversation:
        """Create a conversation.

        :param items: Initial items to include in the conversation context.
        :param metadata: Set of key-value pairs that can be attached to an object.
        :returns: The created conversation object.
        """
        ...

    @webmethod(route="/conversations/{conversation_id}", method="GET", level=LLAMA_STACK_API_V1)
    async def get_conversation(self, conversation_id: str) -> Conversation:
        """Get a conversation with the given ID.

        :param conversation_id: The conversation identifier.
        :returns: The conversation object.
        """
        ...

    @webmethod(route="/conversations/{conversation_id}", method="POST", level=LLAMA_STACK_API_V1)
    async def update_conversation(self, conversation_id: str, metadata: Metadata) -> Conversation:
        """Update a conversation's metadata with the given ID.

        :param conversation_id: The conversation identifier.
        :param metadata: Set of key-value pairs that can be attached to an object.
        :returns: The updated conversation object.
        """
        ...

    @webmethod(route="/conversations/{conversation_id}", method="DELETE", level=LLAMA_STACK_API_V1)
    async def openai_delete_conversation(self, conversation_id: str) -> ConversationDeletedResource:
        """Delete a conversation with the given ID.

        :param conversation_id: The conversation identifier.
        :returns: The deleted conversation resource.
        """
        ...

    @webmethod(route="/conversations/{conversation_id}/items", method="POST", level=LLAMA_STACK_API_V1)
    async def add_items(self, conversation_id: str, items: list[ConversationItem]) -> ConversationItemList:
        """Create items in the conversation.

        :param conversation_id: The conversation identifier.
        :param items: Items to include in the conversation context.
        :returns: List of created items.
        """
        ...

    @webmethod(route="/conversations/{conversation_id}/items/{item_id}", method="GET", level=LLAMA_STACK_API_V1)
    async def retrieve(self, conversation_id: str, item_id: str) -> ConversationItem:
        """Retrieve a conversation item.

        :param conversation_id: The conversation identifier.
        :param item_id: The item identifier.
        :returns: The conversation item.
        """
        ...

    @webmethod(route="/conversations/{conversation_id}/items", method="GET", level=LLAMA_STACK_API_V1)
    async def list(
        self,
        conversation_id: str,
        after: str | NotGiven = NOT_GIVEN,
        include: list[ResponseIncludable] | NotGiven = NOT_GIVEN,
        limit: int | NotGiven = NOT_GIVEN,
        order: Literal["asc", "desc"] | NotGiven = NOT_GIVEN,
    ) -> ConversationItemList:
        """List items in the conversation.

        :param conversation_id: The conversation identifier.
        :param after: An item ID to list items after, used in pagination.
        :param include: Specify additional output data to include in the response.
        :param limit: A limit on the number of objects to be returned (1-100, default 20).
        :param order: The order to return items in (asc or desc, default desc).
        :returns: List of conversation items.
        """
        ...

    @webmethod(route="/conversations/{conversation_id}/items/{item_id}", method="DELETE", level=LLAMA_STACK_API_V1)
    async def openai_delete_conversation_item(
        self, conversation_id: str, item_id: str
    ) -> ConversationItemDeletedResource:
        """Delete a conversation item.

        :param conversation_id: The conversation identifier.
        :param item_id: The item identifier.
        :returns: The deleted item resource.
        """
        ...

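# Minimal usage sketch (illustrative, not part of this change). It assumes an
# object implementing the Conversations protocol above, e.g. the built-in
# service added later in this diff; the message payload shape is an assumption.
async def _conversations_demo(conversations: Conversations) -> None:
    conv = await conversations.create_conversation(metadata={"topic": "support"})

    await conversations.add_items(
        conv.id,
        items=[OpenAIResponseMessage(role="user", content="How do I reset my password?")],
    )

    page = await conversations.list(conv.id, limit=10, order="asc")
    for item in page.data:
        print(item.type, getattr(item, "id", None))

    await conversations.openai_delete_conversation(conv.id)
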
@@ -8,7 +8,7 @@ from typing import Any, Protocol, runtime_checkable

from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.apis.datasets import Dataset
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1BETA
from llama_stack.schema_utils import webmethod


@@ -21,7 +21,8 @@ class DatasetIO(Protocol):
    # keeping for aligning with inference/safety, but this is not used
    dataset_store: DatasetStore

    @webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET", level=LLAMA_STACK_API_V1)
    @webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET", deprecated=True, level=LLAMA_STACK_API_V1)
    @webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET", level=LLAMA_STACK_API_V1BETA)
    async def iterrows(
        self,
        dataset_id: str,

@@ -45,7 +46,10 @@ class DatasetIO(Protocol):
        """
        ...

    @webmethod(route="/datasetio/append-rows/{dataset_id:path}", method="POST", level=LLAMA_STACK_API_V1)
    @webmethod(
        route="/datasetio/append-rows/{dataset_id:path}", method="POST", deprecated=True, level=LLAMA_STACK_API_V1
    )
    @webmethod(route="/datasetio/append-rows/{dataset_id:path}", method="POST", level=LLAMA_STACK_API_V1BETA)
    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
        """Append rows to a dataset.

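# Hedged sketch of the route-migration pattern applied in this diff: stacking one
# @webmethod per API level keeps a single handler reachable at the old v1 path
# (now marked deprecated) while the canonical route moves to v1beta. The protocol
# name and route below are hypothetical, not part of this change.
from typing import Protocol

from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1BETA
from llama_stack.schema_utils import webmethod


class ExampleAPI(Protocol):
    @webmethod(route="/example/{item_id}", method="GET", deprecated=True, level=LLAMA_STACK_API_V1)
    @webmethod(route="/example/{item_id}", method="GET", level=LLAMA_STACK_API_V1BETA)
    async def get_example(self, item_id: str) -> dict:
        """Served at both the deprecated v1 route and the new v1beta route."""
        ...
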
@ -10,7 +10,7 @@ from typing import Annotated, Any, Literal, Protocol
|
|||
from pydantic import BaseModel, Field
|
||||
|
||||
from llama_stack.apis.resource import Resource, ResourceType
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1BETA
|
||||
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
|
||||
|
||||
|
||||
|
|
@ -146,7 +146,8 @@ class ListDatasetsResponse(BaseModel):
|
|||
|
||||
|
||||
class Datasets(Protocol):
|
||||
@webmethod(route="/datasets", method="POST", level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/datasets", method="POST", deprecated=True, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/datasets", method="POST", level=LLAMA_STACK_API_V1BETA)
|
||||
async def register_dataset(
|
||||
self,
|
||||
purpose: DatasetPurpose,
|
||||
|
|
@ -215,7 +216,8 @@ class Datasets(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/datasets/{dataset_id:path}", method="GET", level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/datasets/{dataset_id:path}", method="GET", deprecated=True, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/datasets/{dataset_id:path}", method="GET", level=LLAMA_STACK_API_V1BETA)
|
||||
async def get_dataset(
|
||||
self,
|
||||
dataset_id: str,
|
||||
|
|
@ -227,7 +229,8 @@ class Datasets(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/datasets", method="GET", level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/datasets", method="GET", deprecated=True, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/datasets", method="GET", level=LLAMA_STACK_API_V1BETA)
|
||||
async def list_datasets(self) -> ListDatasetsResponse:
|
||||
"""List all datasets.
|
||||
|
||||
|
|
@ -235,7 +238,8 @@ class Datasets(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/datasets/{dataset_id:path}", method="DELETE", level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/datasets/{dataset_id:path}", method="DELETE", deprecated=True, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/datasets/{dataset_id:path}", method="DELETE", level=LLAMA_STACK_API_V1BETA)
|
||||
async def unregister_dataset(
|
||||
self,
|
||||
dataset_id: str,
|
||||
|
|
|
|||
|
|
@@ -129,6 +129,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
    tool_groups = "tool_groups"
    files = "files"
    prompts = "prompts"
    conversations = "conversations"

    # built-in API
    inspect = "inspect"

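# Tiny illustration (not part of this change): the new member behaves like any
# other Api enum value, so lookups by value resolve to it.
assert Api("conversations") is Api.conversations
assert Api.conversations.value == "conversations"
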
@ -105,6 +105,7 @@ class OpenAIFileDeleteResponse(BaseModel):
|
|||
@trace_protocol
|
||||
class Files(Protocol):
|
||||
# OpenAI Files API Endpoints
|
||||
@webmethod(route="/openai/v1/files", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/files", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def openai_upload_file(
|
||||
self,
|
||||
|
|
@ -127,6 +128,7 @@ class Files(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/files", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/files", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def openai_list_files(
|
||||
self,
|
||||
|
|
@ -146,6 +148,7 @@ class Files(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/files/{file_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/files/{file_id}", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def openai_retrieve_file(
|
||||
self,
|
||||
|
|
@ -159,6 +162,7 @@ class Files(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/files/{file_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/files/{file_id}", method="DELETE", level=LLAMA_STACK_API_V1)
|
||||
async def openai_delete_file(
|
||||
self,
|
||||
|
|
@ -172,6 +176,7 @@ class Files(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/files/{file_id}/content", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/files/{file_id}/content", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def openai_retrieve_file_content(
|
||||
self,
|
||||
|
|
|
|||
|
|
@ -27,14 +27,12 @@ from llama_stack.models.llama.datatypes import (
|
|||
StopReason,
|
||||
ToolCall,
|
||||
ToolDefinition,
|
||||
ToolParamDefinition,
|
||||
ToolPromptFormat,
|
||||
)
|
||||
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
|
||||
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
|
||||
|
||||
register_schema(ToolCall)
|
||||
register_schema(ToolParamDefinition)
|
||||
register_schema(ToolDefinition)
|
||||
|
||||
from enum import StrEnum
|
||||
|
|
@ -1008,67 +1006,6 @@ class InferenceProvider(Protocol):
|
|||
|
||||
model_store: ModelStore | None = None
|
||||
|
||||
async def completion(
|
||||
self,
|
||||
model_id: str,
|
||||
content: InterleavedContent,
|
||||
sampling_params: SamplingParams | None = None,
|
||||
response_format: ResponseFormat | None = None,
|
||||
stream: bool | None = False,
|
||||
logprobs: LogProbConfig | None = None,
|
||||
) -> CompletionResponse | AsyncIterator[CompletionResponseStreamChunk]:
|
||||
"""Generate a completion for the given content using the specified model.
|
||||
|
||||
:param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
|
||||
:param content: The content to generate a completion for.
|
||||
:param sampling_params: (Optional) Parameters to control the sampling strategy.
|
||||
:param response_format: (Optional) Grammar specification for guided (structured) decoding.
|
||||
:param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
|
||||
:param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
|
||||
:returns: If stream=False, returns a CompletionResponse with the full completion.
|
||||
If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.
|
||||
"""
|
||||
...
|
||||
|
||||
async def chat_completion(
|
||||
self,
|
||||
model_id: str,
|
||||
messages: list[Message],
|
||||
sampling_params: SamplingParams | None = None,
|
||||
tools: list[ToolDefinition] | None = None,
|
||||
tool_choice: ToolChoice | None = ToolChoice.auto,
|
||||
tool_prompt_format: ToolPromptFormat | None = None,
|
||||
response_format: ResponseFormat | None = None,
|
||||
stream: bool | None = False,
|
||||
logprobs: LogProbConfig | None = None,
|
||||
tool_config: ToolConfig | None = None,
|
||||
) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
|
||||
"""Generate a chat completion for the given messages using the specified model.
|
||||
|
||||
:param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
|
||||
:param messages: List of messages in the conversation.
|
||||
:param sampling_params: Parameters to control the sampling strategy.
|
||||
:param tools: (Optional) List of tool definitions available to the model.
|
||||
:param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
|
||||
.. deprecated::
|
||||
Use tool_config instead.
|
||||
:param tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model.
|
||||
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
|
||||
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag.
|
||||
- `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls.
|
||||
.. deprecated::
|
||||
Use tool_config instead.
|
||||
:param response_format: (Optional) Grammar specification for guided (structured) decoding. There are two options:
|
||||
- `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format.
|
||||
- `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it.
|
||||
:param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
|
||||
:param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
|
||||
:param tool_config: (Optional) Configuration for tool use.
|
||||
:returns: If stream=False, returns a ChatCompletionResponse with the full completion.
|
||||
If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk.
|
||||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/inference/rerank", method="POST", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def rerank(
|
||||
self,
|
||||
|
|
@ -1088,6 +1025,7 @@ class InferenceProvider(Protocol):
|
|||
raise NotImplementedError("Reranking is not implemented")
|
||||
return # this is so mypy's safe-super rule will consider the method concrete
|
||||
|
||||
@webmethod(route="/openai/v1/completions", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/completions", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def openai_completion(
|
||||
self,
|
||||
|
|
@ -1139,6 +1077,7 @@ class InferenceProvider(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/chat/completions", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/chat/completions", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def openai_chat_completion(
|
||||
self,
|
||||
|
|
@ -1195,6 +1134,7 @@ class InferenceProvider(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/embeddings", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/embeddings", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def openai_embeddings(
|
||||
self,
|
||||
|
|
@ -1224,6 +1164,7 @@ class Inference(InferenceProvider):
|
|||
- Embedding models: these models generate embeddings to be used for semantic search.
|
||||
"""
|
||||
|
||||
@webmethod(route="/openai/v1/chat/completions", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/chat/completions", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def list_chat_completions(
|
||||
self,
|
||||
|
|
@ -1242,6 +1183,9 @@ class Inference(InferenceProvider):
|
|||
"""
|
||||
raise NotImplementedError("List chat completions is not implemented")
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/chat/completions/{completion_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True
|
||||
)
|
||||
@webmethod(route="/chat/completions/{completion_id}", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
|
||||
"""Describe a chat completion by its ID.
|
||||
|
|
|
|||
|
|
@ -111,6 +111,14 @@ class Models(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/models", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
async def openai_list_models(self) -> OpenAIListModelsResponse:
|
||||
"""List models using the OpenAI API.
|
||||
|
||||
:returns: A OpenAIListModelsResponse.
|
||||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/models/{model_id:path}", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def get_model(
|
||||
self,
|
||||
|
|
|
|||
|
|
@ -114,6 +114,7 @@ class Safety(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/moderations", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/moderations", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
|
||||
"""Classifies if text and/or image inputs are potentially harmful.
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ from typing import (
|
|||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1
|
||||
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
|
||||
from llama_stack.models.llama.datatypes import Primitive
|
||||
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
|
||||
|
||||
|
|
@ -426,7 +426,14 @@ class Telemetry(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/telemetry/traces", method="POST", required_scope=REQUIRED_SCOPE, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(
|
||||
route="/telemetry/traces",
|
||||
method="POST",
|
||||
required_scope=REQUIRED_SCOPE,
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(route="/telemetry/traces", method="POST", required_scope=REQUIRED_SCOPE, level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def query_traces(
|
||||
self,
|
||||
attribute_filters: list[QueryCondition] | None = None,
|
||||
|
|
@ -445,7 +452,17 @@ class Telemetry(Protocol):
|
|||
...
|
||||
|
||||
@webmethod(
|
||||
route="/telemetry/traces/{trace_id:path}", method="GET", required_scope=REQUIRED_SCOPE, level=LLAMA_STACK_API_V1
|
||||
route="/telemetry/traces/{trace_id:path}",
|
||||
method="GET",
|
||||
required_scope=REQUIRED_SCOPE,
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/telemetry/traces/{trace_id:path}",
|
||||
method="GET",
|
||||
required_scope=REQUIRED_SCOPE,
|
||||
level=LLAMA_STACK_API_V1ALPHA,
|
||||
)
|
||||
async def get_trace(self, trace_id: str) -> Trace:
|
||||
"""Get a trace by its ID.
|
||||
|
|
@ -459,8 +476,15 @@ class Telemetry(Protocol):
|
|||
route="/telemetry/traces/{trace_id:path}/spans/{span_id:path}",
|
||||
method="GET",
|
||||
required_scope=REQUIRED_SCOPE,
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/telemetry/traces/{trace_id:path}/spans/{span_id:path}",
|
||||
method="GET",
|
||||
required_scope=REQUIRED_SCOPE,
|
||||
level=LLAMA_STACK_API_V1ALPHA,
|
||||
)
|
||||
async def get_span(self, trace_id: str, span_id: str) -> Span:
|
||||
"""Get a span by its ID.
|
||||
|
||||
|
|
@ -473,9 +497,16 @@ class Telemetry(Protocol):
|
|||
@webmethod(
|
||||
route="/telemetry/spans/{span_id:path}/tree",
|
||||
method="POST",
|
||||
deprecated=True,
|
||||
required_scope=REQUIRED_SCOPE,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/telemetry/spans/{span_id:path}/tree",
|
||||
method="POST",
|
||||
required_scope=REQUIRED_SCOPE,
|
||||
level=LLAMA_STACK_API_V1ALPHA,
|
||||
)
|
||||
async def get_span_tree(
|
||||
self,
|
||||
span_id: str,
|
||||
|
|
@ -491,7 +522,14 @@ class Telemetry(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/telemetry/spans", method="POST", required_scope=REQUIRED_SCOPE, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(
|
||||
route="/telemetry/spans",
|
||||
method="POST",
|
||||
required_scope=REQUIRED_SCOPE,
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(route="/telemetry/spans", method="POST", required_scope=REQUIRED_SCOPE, level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def query_spans(
|
||||
self,
|
||||
attribute_filters: list[QueryCondition],
|
||||
|
|
@ -507,7 +545,8 @@ class Telemetry(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/telemetry/spans/export", method="POST", level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/telemetry/spans/export", method="POST", deprecated=True, level=LLAMA_STACK_API_V1)
|
||||
@webmethod(route="/telemetry/spans/export", method="POST", level=LLAMA_STACK_API_V1ALPHA)
|
||||
async def save_spans_to_dataset(
|
||||
self,
|
||||
attribute_filters: list[QueryCondition],
|
||||
|
|
@ -525,7 +564,17 @@ class Telemetry(Protocol):
|
|||
...
|
||||
|
||||
@webmethod(
|
||||
route="/telemetry/metrics/{metric_name}", method="POST", required_scope=REQUIRED_SCOPE, level=LLAMA_STACK_API_V1
|
||||
route="/telemetry/metrics/{metric_name}",
|
||||
method="POST",
|
||||
required_scope=REQUIRED_SCOPE,
|
||||
deprecated=True,
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/telemetry/metrics/{metric_name}",
|
||||
method="POST",
|
||||
required_scope=REQUIRED_SCOPE,
|
||||
level=LLAMA_STACK_API_V1ALPHA,
|
||||
)
|
||||
async def query_metrics(
|
||||
self,
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@
|
|||
from enum import Enum
|
||||
from typing import Any, Literal, Protocol
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import runtime_checkable
|
||||
|
||||
from llama_stack.apis.common.content_types import URL, InterleavedContent
|
||||
|
|
@ -19,59 +19,23 @@ from llama_stack.schema_utils import json_schema_type, webmethod
|
|||
from .rag_tool import RAGToolRuntime
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class ToolParameter(BaseModel):
|
||||
"""Parameter definition for a tool.
|
||||
|
||||
:param name: Name of the parameter
|
||||
:param parameter_type: Type of the parameter (e.g., string, integer)
|
||||
:param description: Human-readable description of what the parameter does
|
||||
:param required: Whether this parameter is required for tool invocation
|
||||
:param items: Type of the elements when parameter_type is array
|
||||
:param title: (Optional) Title of the parameter
|
||||
:param default: (Optional) Default value for the parameter if not provided
|
||||
"""
|
||||
|
||||
name: str
|
||||
parameter_type: str
|
||||
description: str
|
||||
required: bool = Field(default=True)
|
||||
items: dict | None = None
|
||||
title: str | None = None
|
||||
default: Any | None = None
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class Tool(Resource):
|
||||
"""A tool that can be invoked by agents.
|
||||
|
||||
:param type: Type of resource, always 'tool'
|
||||
:param toolgroup_id: ID of the tool group this tool belongs to
|
||||
:param description: Human-readable description of what the tool does
|
||||
:param parameters: List of parameters this tool accepts
|
||||
:param metadata: (Optional) Additional metadata about the tool
|
||||
"""
|
||||
|
||||
type: Literal[ResourceType.tool] = ResourceType.tool
|
||||
toolgroup_id: str
|
||||
description: str
|
||||
parameters: list[ToolParameter]
|
||||
metadata: dict[str, Any] | None = None
|
||||
|
||||
|
||||
@json_schema_type
|
||||
class ToolDef(BaseModel):
|
||||
"""Tool definition used in runtime contexts.
|
||||
|
||||
:param name: Name of the tool
|
||||
:param description: (Optional) Human-readable description of what the tool does
|
||||
:param parameters: (Optional) List of parameters this tool accepts
|
||||
:param input_schema: (Optional) JSON Schema for tool inputs (MCP inputSchema)
|
||||
:param output_schema: (Optional) JSON Schema for tool outputs (MCP outputSchema)
|
||||
:param metadata: (Optional) Additional metadata about the tool
|
||||
:param toolgroup_id: (Optional) ID of the tool group this tool belongs to
|
||||
"""
|
||||
|
||||
toolgroup_id: str | None = None
|
||||
name: str
|
||||
description: str | None = None
|
||||
parameters: list[ToolParameter] | None = None
|
||||
input_schema: dict[str, Any] | None = None
|
||||
output_schema: dict[str, Any] | None = None
|
||||
metadata: dict[str, Any] | None = None
|
||||
|
||||
|
||||
|
|
@ -122,7 +86,7 @@ class ToolInvocationResult(BaseModel):
|
|||
|
||||
|
||||
class ToolStore(Protocol):
|
||||
async def get_tool(self, tool_name: str) -> Tool: ...
|
||||
async def get_tool(self, tool_name: str) -> ToolDef: ...
|
||||
async def get_tool_group(self, toolgroup_id: str) -> ToolGroup: ...
|
||||
|
||||
|
||||
|
|
@ -135,15 +99,6 @@ class ListToolGroupsResponse(BaseModel):
|
|||
data: list[ToolGroup]
|
||||
|
||||
|
||||
class ListToolsResponse(BaseModel):
|
||||
"""Response containing a list of tools.
|
||||
|
||||
:param data: List of tools
|
||||
"""
|
||||
|
||||
data: list[Tool]
|
||||
|
||||
|
||||
class ListToolDefsResponse(BaseModel):
|
||||
"""Response containing a list of tool definitions.
|
||||
|
||||
|
|
@ -194,11 +149,11 @@ class ToolGroups(Protocol):
|
|||
...
|
||||
|
||||
@webmethod(route="/tools", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def list_tools(self, toolgroup_id: str | None = None) -> ListToolsResponse:
|
||||
async def list_tools(self, toolgroup_id: str | None = None) -> ListToolDefsResponse:
|
||||
"""List tools with optional tool group.
|
||||
|
||||
:param toolgroup_id: The ID of the tool group to list tools for.
|
||||
:returns: A ListToolsResponse.
|
||||
:returns: A ListToolDefsResponse.
|
||||
"""
|
||||
...
|
||||
|
||||
|
|
@ -206,11 +161,11 @@ class ToolGroups(Protocol):
|
|||
async def get_tool(
|
||||
self,
|
||||
tool_name: str,
|
||||
) -> Tool:
|
||||
) -> ToolDef:
|
||||
"""Get a tool by its name.
|
||||
|
||||
:param tool_name: The name of the tool to get.
|
||||
:returns: A Tool.
|
||||
:returns: A ToolDef.
|
||||
"""
|
||||
...
|
||||
|
||||
|
|
|
|||
|
|
@ -512,6 +512,7 @@ class VectorIO(Protocol):
|
|||
...
|
||||
|
||||
# OpenAI Vector Stores API endpoints
|
||||
@webmethod(route="/openai/v1/vector_stores", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/vector_stores", method="POST", level=LLAMA_STACK_API_V1)
|
||||
async def openai_create_vector_store(
|
||||
self,
|
||||
|
|
@ -538,6 +539,7 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(route="/openai/v1/vector_stores", method="GET", level=LLAMA_STACK_API_V1, deprecated=True)
|
||||
@webmethod(route="/vector_stores", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def openai_list_vector_stores(
|
||||
self,
|
||||
|
|
@ -556,6 +558,9 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}", method="GET", level=LLAMA_STACK_API_V1, deprecated=True
|
||||
)
|
||||
@webmethod(route="/vector_stores/{vector_store_id}", method="GET", level=LLAMA_STACK_API_V1)
|
||||
async def openai_retrieve_vector_store(
|
||||
self,
|
||||
|
|
@ -568,6 +573,9 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}", method="POST", level=LLAMA_STACK_API_V1, deprecated=True
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}",
|
||||
method="POST",
|
||||
|
|
@ -590,6 +598,9 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}",
|
||||
method="DELETE",
|
||||
|
|
@ -606,6 +617,12 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/search",
|
||||
method="POST",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/search",
|
||||
method="POST",
|
||||
|
|
@ -638,6 +655,12 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/files",
|
||||
method="POST",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/files",
|
||||
method="POST",
|
||||
|
|
@ -660,6 +683,12 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/files",
|
||||
method="GET",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/files",
|
||||
method="GET",
|
||||
|
|
@ -686,6 +715,12 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
|
||||
method="GET",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/files/{file_id}",
|
||||
method="GET",
|
||||
|
|
@ -704,6 +739,12 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content",
|
||||
method="GET",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/files/{file_id}/content",
|
||||
method="GET",
|
||||
|
|
@ -722,6 +763,12 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
|
||||
method="POST",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/files/{file_id}",
|
||||
method="POST",
|
||||
|
|
@ -742,6 +789,12 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/files/{file_id}",
|
||||
method="DELETE",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/files/{file_id}",
|
||||
method="DELETE",
|
||||
|
|
@ -765,6 +818,12 @@ class VectorIO(Protocol):
|
|||
method="POST",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/file_batches",
|
||||
method="POST",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
async def openai_create_vector_store_file_batch(
|
||||
self,
|
||||
vector_store_id: str,
|
||||
|
|
@ -787,6 +846,12 @@ class VectorIO(Protocol):
|
|||
method="GET",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
)
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/file_batches/{batch_id}",
|
||||
method="GET",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
async def openai_retrieve_vector_store_file_batch(
|
||||
self,
|
||||
batch_id: str,
|
||||
|
|
@ -800,6 +865,12 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files",
|
||||
method="GET",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/file_batches/{batch_id}/files",
|
||||
method="GET",
|
||||
|
|
@ -828,6 +899,12 @@ class VectorIO(Protocol):
|
|||
"""
|
||||
...
|
||||
|
||||
@webmethod(
|
||||
route="/openai/v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel",
|
||||
method="POST",
|
||||
level=LLAMA_STACK_API_V1,
|
||||
deprecated=True,
|
||||
)
|
||||
@webmethod(
|
||||
route="/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel",
|
||||
method="POST",
|
||||
|
|
|
|||
|
|
@ -6,11 +6,18 @@
|
|||
|
||||
import argparse
|
||||
import os
|
||||
import ssl
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import uvicorn
|
||||
import yaml
|
||||
|
||||
from llama_stack.cli.stack.utils import ImageType
|
||||
from llama_stack.cli.subcommand import Subcommand
|
||||
from llama_stack.core.datatypes import LoggingConfig, StackRunConfig
|
||||
from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars, validate_env_pair
|
||||
from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
|
||||
from llama_stack.log import get_logger
|
||||
|
||||
REPO_ROOT = Path(__file__).parent.parent.parent.parent
|
||||
|
|
@ -146,23 +153,7 @@ class StackRun(Subcommand):
|
|||
# using the current environment packages.
|
||||
if not image_type and not image_name:
|
||||
logger.info("No image type or image name provided. Assuming environment packages.")
|
||||
from llama_stack.core.server.server import main as server_main
|
||||
|
||||
# Build the server args from the current args passed to the CLI
|
||||
server_args = argparse.Namespace()
|
||||
for arg in vars(args):
|
||||
# If this is a function, avoid passing it
|
||||
# "args" contains:
|
||||
# func=<bound method StackRun._run_stack_run_cmd of <llama_stack.cli.stack.run.StackRun object at 0x10484b010>>
|
||||
if callable(getattr(args, arg)):
|
||||
continue
|
||||
if arg == "config":
|
||||
server_args.config = str(config_file)
|
||||
else:
|
||||
setattr(server_args, arg, getattr(args, arg))
|
||||
|
||||
# Run the server
|
||||
server_main(server_args)
|
||||
self._uvicorn_run(config_file, args)
|
||||
else:
|
||||
run_args = formulate_run_args(image_type, image_name)
|
||||
|
||||
|
|
@ -184,6 +175,76 @@ class StackRun(Subcommand):
|
|||
|
||||
run_command(run_args)
|
||||
|
||||
def _uvicorn_run(self, config_file: Path | None, args: argparse.Namespace) -> None:
|
||||
if not config_file:
|
||||
self.parser.error("Config file is required")
|
||||
|
||||
# Set environment variables if provided
|
||||
if args.env:
|
||||
for env_pair in args.env:
|
||||
try:
|
||||
key, value = validate_env_pair(env_pair)
|
||||
logger.info(f"Setting environment variable {key} => {value}")
|
||||
os.environ[key] = value
|
||||
except ValueError as e:
|
||||
logger.error(f"Error: {str(e)}")
|
||||
self.parser.error(f"Invalid environment variable format: {env_pair}")
|
||||
|
||||
config_file = resolve_config_or_distro(str(config_file), Mode.RUN)
|
||||
with open(config_file) as fp:
|
||||
config_contents = yaml.safe_load(fp)
|
||||
if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")):
|
||||
logger_config = LoggingConfig(**cfg)
|
||||
else:
|
||||
logger_config = None
|
||||
config = StackRunConfig(**cast_image_name_to_string(replace_env_vars(config_contents)))
|
||||
|
||||
port = args.port or config.server.port
|
||||
host = config.server.host or ["::", "0.0.0.0"]
|
||||
|
||||
# Set the config file in environment so create_app can find it
|
||||
os.environ["LLAMA_STACK_CONFIG"] = str(config_file)
|
||||
|
||||
uvicorn_config = {
|
||||
"factory": True,
|
||||
"host": host,
|
||||
"port": port,
|
||||
"lifespan": "on",
|
||||
"log_level": logger.getEffectiveLevel(),
|
||||
"log_config": logger_config,
|
||||
}
|
||||
|
||||
keyfile = config.server.tls_keyfile
|
||||
certfile = config.server.tls_certfile
|
||||
if keyfile and certfile:
|
||||
uvicorn_config["ssl_keyfile"] = config.server.tls_keyfile
|
||||
uvicorn_config["ssl_certfile"] = config.server.tls_certfile
|
||||
if config.server.tls_cafile:
|
||||
uvicorn_config["ssl_ca_certs"] = config.server.tls_cafile
|
||||
uvicorn_config["ssl_cert_reqs"] = ssl.CERT_REQUIRED
|
||||
|
||||
logger.info(
|
||||
f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}\n CA: {config.server.tls_cafile}"
|
||||
)
|
||||
else:
|
||||
logger.info(f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}")
|
||||
|
||||
logger.info(f"Listening on {host}:{port}")
|
||||
|
||||
# We need to catch KeyboardInterrupt because uvicorn's signal handling
|
||||
# re-raises SIGINT signals using signal.raise_signal(), which Python
|
||||
# converts to KeyboardInterrupt. Without this catch, we'd get a confusing
|
||||
# stack trace when using Ctrl+C or kill -2 (SIGINT).
|
||||
# SIGTERM (kill -15) works fine without this because Python doesn't
|
||||
# have a default handler for it.
|
||||
#
|
||||
# Another approach would be to ignore SIGINT entirely - let uvicorn handle it through its own
|
||||
# signal handling but this is quite intrusive and not worth the effort.
|
||||
try:
|
||||
uvicorn.run("llama_stack.core.server.server:create_app", **uvicorn_config)
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
logger.info("Received interrupt signal, shutting down gracefully...")
|
||||
|
||||
def _start_ui_development_server(self, stack_server_port: int):
|
||||
logger.info("Attempting to start UI development server...")
|
||||
# Check if npm is available
|
||||
|
|
|
|||
|
|
@@ -324,14 +324,14 @@ fi
RUN pip uninstall -y uv
EOF

# If a run config is provided, we use the --config flag
# If a run config is provided, we use the llama stack CLI
if [[ -n "$run_config" ]]; then
  add_to_container << EOF
ENTRYPOINT ["python", "-m", "llama_stack.core.server.server", "$RUN_CONFIG_PATH"]
ENTRYPOINT ["llama", "stack", "run", "$RUN_CONFIG_PATH"]
EOF
elif [[ "$distro_or_config" != *.yaml ]]; then
  add_to_container << EOF
ENTRYPOINT ["python", "-m", "llama_stack.core.server.server", "$distro_or_config"]
ENTRYPOINT ["llama", "stack", "run", "$distro_or_config"]
EOF
fi

llama_stack/core/conversations/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

llama_stack/core/conversations/conversations.py (new file, 306 lines)
@@ -0,0 +1,306 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
import os
|
||||
import secrets
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
from openai import NOT_GIVEN
|
||||
from pydantic import BaseModel, TypeAdapter
|
||||
|
||||
from llama_stack.apis.conversations.conversations import (
|
||||
Conversation,
|
||||
ConversationDeletedResource,
|
||||
ConversationItem,
|
||||
ConversationItemDeletedResource,
|
||||
ConversationItemList,
|
||||
Conversations,
|
||||
Metadata,
|
||||
)
|
||||
from llama_stack.core.datatypes import AccessRule
|
||||
from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR
|
||||
from llama_stack.log import get_logger
|
||||
from llama_stack.providers.utils.sqlstore.api import ColumnDefinition, ColumnType
|
||||
from llama_stack.providers.utils.sqlstore.authorized_sqlstore import AuthorizedSqlStore
|
||||
from llama_stack.providers.utils.sqlstore.sqlstore import (
|
||||
SqliteSqlStoreConfig,
|
||||
SqlStoreConfig,
|
||||
sqlstore_impl,
|
||||
)
|
||||
|
||||
logger = get_logger(name=__name__, category="openai::conversations")
|
||||
|
||||
|
||||
class ConversationServiceConfig(BaseModel):
|
||||
"""Configuration for the built-in conversation service.
|
||||
|
||||
:param conversations_store: SQL store configuration for conversations (defaults to SQLite)
|
||||
:param policy: Access control rules
|
||||
"""
|
||||
|
||||
conversations_store: SqlStoreConfig = SqliteSqlStoreConfig(
|
||||
db_path=(DISTRIBS_BASE_DIR / "conversations.db").as_posix()
|
||||
)
|
||||
policy: list[AccessRule] = []
|
||||
|
||||
|
||||
async def get_provider_impl(config: ConversationServiceConfig, deps: dict[Any, Any]):
|
||||
"""Get the conversation service implementation."""
|
||||
impl = ConversationServiceImpl(config, deps)
|
||||
await impl.initialize()
|
||||
return impl
|
||||
|
||||
|
||||
class ConversationServiceImpl(Conversations):
|
||||
"""Built-in conversation service implementation using AuthorizedSqlStore."""
|
||||
|
||||
def __init__(self, config: ConversationServiceConfig, deps: dict[Any, Any]):
|
||||
self.config = config
|
||||
self.deps = deps
|
||||
self.policy = config.policy
|
||||
|
||||
base_sql_store = sqlstore_impl(config.conversations_store)
|
||||
self.sql_store = AuthorizedSqlStore(base_sql_store, self.policy)
|
||||
|
||||
async def initialize(self) -> None:
|
||||
"""Initialize the store and create tables."""
|
||||
if isinstance(self.config.conversations_store, SqliteSqlStoreConfig):
|
||||
os.makedirs(os.path.dirname(self.config.conversations_store.db_path), exist_ok=True)
|
||||
|
||||
await self.sql_store.create_table(
|
||||
"openai_conversations",
|
||||
{
|
||||
"id": ColumnDefinition(type=ColumnType.STRING, primary_key=True),
|
||||
"created_at": ColumnType.INTEGER,
|
||||
"items": ColumnType.JSON,
|
||||
"metadata": ColumnType.JSON,
|
||||
},
|
||||
)
|
||||
|
||||
await self.sql_store.create_table(
|
||||
"conversation_items",
|
||||
{
|
||||
"id": ColumnDefinition(type=ColumnType.STRING, primary_key=True),
|
||||
"conversation_id": ColumnType.STRING,
|
||||
"created_at": ColumnType.INTEGER,
|
||||
"item_data": ColumnType.JSON,
|
||||
},
|
||||
)
|
||||
|
||||
async def create_conversation(
|
||||
self, items: list[ConversationItem] | None = None, metadata: Metadata | None = None
|
||||
) -> Conversation:
|
||||
"""Create a conversation."""
|
||||
random_bytes = secrets.token_bytes(24)
|
||||
conversation_id = f"conv_{random_bytes.hex()}"
|
||||
created_at = int(time.time())
|
||||
|
||||
record_data = {
|
||||
"id": conversation_id,
|
||||
"created_at": created_at,
|
||||
"items": [],
|
||||
"metadata": metadata,
|
||||
}
|
||||
|
||||
await self.sql_store.insert(
|
||||
table="openai_conversations",
|
||||
data=record_data,
|
||||
)
|
||||
|
||||
if items:
|
||||
item_records = []
|
||||
for item in items:
|
||||
item_dict = item.model_dump()
|
||||
item_id = self._get_or_generate_item_id(item, item_dict)
|
||||
|
||||
item_record = {
|
||||
"id": item_id,
|
||||
"conversation_id": conversation_id,
|
||||
"created_at": created_at,
|
||||
"item_data": item_dict,
|
||||
}
|
||||
|
||||
item_records.append(item_record)
|
||||
|
||||
await self.sql_store.insert(table="conversation_items", data=item_records)
|
||||
|
||||
conversation = Conversation(
|
||||
id=conversation_id,
|
||||
created_at=created_at,
|
||||
metadata=metadata,
|
||||
object="conversation",
|
||||
)
|
||||
|
||||
logger.info(f"Created conversation {conversation_id}")
|
||||
return conversation
|
||||
|
||||
async def get_conversation(self, conversation_id: str) -> Conversation:
|
||||
"""Get a conversation with the given ID."""
|
||||
record = await self.sql_store.fetch_one(table="openai_conversations", where={"id": conversation_id})
|
||||
|
||||
if record is None:
|
||||
raise ValueError(f"Conversation {conversation_id} not found")
|
||||
|
||||
return Conversation(
|
||||
id=record["id"], created_at=record["created_at"], metadata=record.get("metadata"), object="conversation"
|
||||
)
|
||||
|
||||
async def update_conversation(self, conversation_id: str, metadata: Metadata) -> Conversation:
|
||||
"""Update a conversation's metadata with the given ID"""
|
||||
await self.sql_store.update(
|
||||
table="openai_conversations", data={"metadata": metadata}, where={"id": conversation_id}
|
||||
)
|
||||
|
||||
return await self.get_conversation(conversation_id)
|
||||
|
||||
async def openai_delete_conversation(self, conversation_id: str) -> ConversationDeletedResource:
|
||||
"""Delete a conversation with the given ID."""
|
||||
await self.sql_store.delete(table="openai_conversations", where={"id": conversation_id})
|
||||
|
||||
logger.info(f"Deleted conversation {conversation_id}")
|
||||
return ConversationDeletedResource(id=conversation_id)
|
||||
|
||||
def _validate_conversation_id(self, conversation_id: str) -> None:
|
||||
"""Validate conversation ID format."""
|
||||
if not conversation_id.startswith("conv_"):
|
||||
raise ValueError(
|
||||
f"Invalid 'conversation_id': '{conversation_id}'. Expected an ID that begins with 'conv_'."
|
||||
)
|
||||
|
||||
def _get_or_generate_item_id(self, item: ConversationItem, item_dict: dict) -> str:
|
||||
"""Get existing item ID or generate one if missing."""
|
||||
if item.id is None:
|
||||
random_bytes = secrets.token_bytes(24)
|
||||
if item.type == "message":
|
||||
item_id = f"msg_{random_bytes.hex()}"
|
||||
else:
|
||||
item_id = f"item_{random_bytes.hex()}"
|
||||
item_dict["id"] = item_id
|
||||
return item_id
|
||||
return item.id
|
||||
|
||||
async def _get_validated_conversation(self, conversation_id: str) -> Conversation:
|
||||
"""Validate conversation ID and return the conversation if it exists."""
|
||||
self._validate_conversation_id(conversation_id)
|
||||
return await self.get_conversation(conversation_id)
|
||||
|
||||
async def add_items(self, conversation_id: str, items: list[ConversationItem]) -> ConversationItemList:
|
||||
"""Create (add) items to a conversation."""
|
||||
await self._get_validated_conversation(conversation_id)
|
||||
|
||||
created_items = []
|
||||
created_at = int(time.time())
|
||||
|
||||
for item in items:
|
||||
item_dict = item.model_dump()
|
||||
item_id = self._get_or_generate_item_id(item, item_dict)
|
||||
|
||||
item_record = {
|
||||
"id": item_id,
|
||||
"conversation_id": conversation_id,
|
||||
"created_at": created_at,
|
||||
"item_data": item_dict,
|
||||
}
|
||||
|
||||
# TODO: Add support for upsert in sql_store, this will fail first if ID exists and then update
|
||||
try:
|
||||
await self.sql_store.insert(table="conversation_items", data=item_record)
|
||||
except Exception:
|
||||
# If insert fails due to ID conflict, update existing record
|
||||
await self.sql_store.update(
|
||||
table="conversation_items",
|
||||
data={"created_at": created_at, "item_data": item_dict},
|
||||
where={"id": item_id},
|
||||
)
|
||||
|
||||
created_items.append(item_dict)
|
||||
|
||||
logger.info(f"Created {len(created_items)} items in conversation {conversation_id}")
|
||||
|
||||
# Convert created items (dicts) to proper ConversationItem types
|
||||
adapter: TypeAdapter[ConversationItem] = TypeAdapter(ConversationItem)
|
||||
response_items: list[ConversationItem] = [adapter.validate_python(item_dict) for item_dict in created_items]
|
||||
|
||||
return ConversationItemList(
|
||||
data=response_items,
|
||||
first_id=created_items[0]["id"] if created_items else None,
|
||||
last_id=created_items[-1]["id"] if created_items else None,
|
||||
has_more=False,
|
||||
)
|
||||
|
||||
async def retrieve(self, conversation_id: str, item_id: str) -> ConversationItem:
|
||||
"""Retrieve a conversation item."""
|
||||
if not conversation_id:
|
||||
raise ValueError(f"Expected a non-empty value for `conversation_id` but received {conversation_id!r}")
|
||||
if not item_id:
|
||||
raise ValueError(f"Expected a non-empty value for `item_id` but received {item_id!r}")
|
||||
|
||||
# Get item from conversation_items table
|
||||
record = await self.sql_store.fetch_one(
|
||||
table="conversation_items", where={"id": item_id, "conversation_id": conversation_id}
|
||||
)
|
||||
|
||||
if record is None:
|
||||
raise ValueError(f"Item {item_id} not found in conversation {conversation_id}")
|
||||
|
||||
adapter: TypeAdapter[ConversationItem] = TypeAdapter(ConversationItem)
|
||||
return adapter.validate_python(record["item_data"])
|
||||
|
||||
async def list(self, conversation_id: str, after=NOT_GIVEN, include=NOT_GIVEN, limit=NOT_GIVEN, order=NOT_GIVEN):
|
||||
"""List items in the conversation."""
|
||||
result = await self.sql_store.fetch_all(table="conversation_items", where={"conversation_id": conversation_id})
|
||||
records = result.data
|
||||
|
||||
if order != NOT_GIVEN and order == "asc":
|
||||
records.sort(key=lambda x: x["created_at"])
|
||||
else:
|
||||
records.sort(key=lambda x: x["created_at"], reverse=True)
|
||||
|
||||
actual_limit = 20
|
||||
if limit != NOT_GIVEN and isinstance(limit, int):
|
||||
actual_limit = limit
|
||||
|
||||
records = records[:actual_limit]
|
||||
items = [record["item_data"] for record in records]
|
||||
|
||||
adapter: TypeAdapter[ConversationItem] = TypeAdapter(ConversationItem)
|
||||
response_items: list[ConversationItem] = [adapter.validate_python(item) for item in items]
|
||||
|
||||
first_id = response_items[0].id if response_items else None
|
||||
last_id = response_items[-1].id if response_items else None
|
||||
|
||||
return ConversationItemList(
|
||||
data=response_items,
|
||||
first_id=first_id,
|
||||
last_id=last_id,
|
||||
has_more=False,
|
||||
)
|
||||
|
||||
async def openai_delete_conversation_item(
|
||||
self, conversation_id: str, item_id: str
|
||||
) -> ConversationItemDeletedResource:
|
||||
"""Delete a conversation item."""
|
||||
if not conversation_id:
|
||||
raise ValueError(f"Expected a non-empty value for `conversation_id` but received {conversation_id!r}")
|
||||
if not item_id:
|
||||
raise ValueError(f"Expected a non-empty value for `item_id` but received {item_id!r}")
|
||||
|
||||
_ = await self._get_validated_conversation(conversation_id)
|
||||
|
||||
record = await self.sql_store.fetch_one(
|
||||
table="conversation_items", where={"id": item_id, "conversation_id": conversation_id}
|
||||
)
|
||||
|
||||
if record is None:
|
||||
raise ValueError(f"Item {item_id} not found in conversation {conversation_id}")
|
||||
|
||||
await self.sql_store.delete(
|
||||
table="conversation_items", where={"id": item_id, "conversation_id": conversation_id}
|
||||
)
|
||||
|
||||
logger.info(f"Deleted item {item_id} from conversation {conversation_id}")
|
||||
return ConversationItemDeletedResource(id=item_id)
|
||||
|
|
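# Hedged end-to-end sketch (not part of this change): exercising the built-in
# service above with its default SQLite-backed store. The message payload shape
# is an assumption for illustration.
import asyncio

from llama_stack.apis.agents.openai_responses import OpenAIResponseMessage
from llama_stack.core.conversations.conversations import (
    ConversationServiceConfig,
    get_provider_impl,
)


async def _demo() -> None:
    impl = await get_provider_impl(ConversationServiceConfig(), deps={})

    conv = await impl.create_conversation(metadata={"topic": "demo"})
    await impl.add_items(conv.id, [OpenAIResponseMessage(role="user", content="hello")])

    page = await impl.list(conv.id, order="asc")
    print([item.id for item in page.data])


asyncio.run(_demo())
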
@ -22,7 +22,7 @@ from llama_stack.apis.safety import Safety
|
|||
from llama_stack.apis.scoring import Scoring
|
||||
from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnInput
|
||||
from llama_stack.apis.shields import Shield, ShieldInput
|
||||
from llama_stack.apis.tools import Tool, ToolGroup, ToolGroupInput, ToolRuntime
|
||||
from llama_stack.apis.tools import ToolGroup, ToolGroupInput, ToolRuntime
|
||||
from llama_stack.apis.vector_dbs import VectorDB, VectorDBInput
|
||||
from llama_stack.apis.vector_io import VectorIO
|
||||
from llama_stack.core.access_control.datatypes import AccessRule
|
||||
|
|
@ -84,15 +84,11 @@ class BenchmarkWithOwner(Benchmark, ResourceWithOwner):
|
|||
pass
|
||||
|
||||
|
||||
class ToolWithOwner(Tool, ResourceWithOwner):
|
||||
pass
|
||||
|
||||
|
||||
class ToolGroupWithOwner(ToolGroup, ResourceWithOwner):
|
||||
pass
|
||||
|
||||
|
||||
RoutableObject = Model | Shield | VectorDB | Dataset | ScoringFn | Benchmark | Tool | ToolGroup
|
||||
RoutableObject = Model | Shield | VectorDB | Dataset | ScoringFn | Benchmark | ToolGroup
|
||||
|
||||
RoutableObjectWithProvider = Annotated[
|
||||
ModelWithOwner
|
||||
|
|
@ -101,7 +97,6 @@ RoutableObjectWithProvider = Annotated[
|
|||
| DatasetWithOwner
|
||||
| ScoringFnWithOwner
|
||||
| BenchmarkWithOwner
|
||||
| ToolWithOwner
|
||||
| ToolGroupWithOwner,
|
||||
Field(discriminator="type"),
|
||||
]
|
||||
|
|
@ -480,6 +475,13 @@ InferenceStoreConfig (with queue tuning parameters) or a SqlStoreConfig (depreca
|
|||
If not specified, a default SQLite store will be used.""",
|
||||
)
|
||||
|
||||
conversations_store: SqlStoreConfig | None = Field(
|
||||
default=None,
|
||||
description="""
|
||||
Configuration for the persistence store used by the conversations API.
|
||||
If not specified, a default SQLite store will be used.""",
|
||||
)
|
||||
|
||||
# registry of "resources" in the distribution
|
||||
models: list[ModelInput] = Field(default_factory=list)
|
||||
shields: list[ShieldInput] = Field(default_factory=list)
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ from llama_stack.providers.datatypes import (
|
|||
logger = get_logger(name=__name__, category="core")
|
||||
|
||||
|
||||
INTERNAL_APIS = {Api.inspect, Api.providers, Api.prompts}
|
||||
INTERNAL_APIS = {Api.inspect, Api.providers, Api.prompts, Api.conversations}
|
||||
|
||||
|
||||
def stack_apis() -> list[Api]:
|
||||
|
|
@@ -243,6 +243,7 @@ def get_external_providers_from_module(
            spec = module.get_provider_spec()
        else:
            # pass in a partially filled out provider spec to satisfy the registry -- knowing we will be overwriting it later upon build and run
            # in the case we are building we CANNOT import this module of course because it has not been installed.
            spec = ProviderSpec(
                api=Api(provider_api),
                provider_type=provider.provider_type,
@@ -251,9 +252,20 @@ def get_external_providers_from_module(
                config_class="",
            )
            provider_type = provider.provider_type
            # in the case we are building we CANNOT import this module of course because it has not been installed.
            # return a partially filled out spec that the build script will populate.
            registry[Api(provider_api)][provider_type] = spec
        if isinstance(spec, list):
            # optionally allow people to pass inline and remote provider specs as a returned list.
            # with the old method, users could pass in directories of specs using overlapping code
            # we want to ensure we preserve that flexibility in this method.
            logger.info(
                f"Detected a list of external provider specs from {provider.module} adding all to the registry"
            )
            for provider_spec in spec:
                if provider_spec.provider_type != provider.provider_type:
                    continue
                logger.info(f"Adding {provider.provider_type} to registry")
                registry[Api(provider_api)][provider.provider_type] = provider_spec
        else:
            registry[Api(provider_api)][provider_type] = spec
    except ModuleNotFoundError as exc:
        raise ValueError(
            "get_provider_spec not found. If specifying an external provider via `module` in the Provider spec, the Provider must have the `provider.get_provider_spec` module available"
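To make the list-handling branch above concrete, here is a minimal, self-contained sketch. The Spec dataclass is a stand-in for the real ProviderSpec, and the provider types and registry shape are invented for illustration only.

from dataclasses import dataclass

@dataclass
class Spec:
    provider_type: str

def get_provider_spec():
    # An external provider module may return a single spec or a list of specs.
    return [Spec("remote::acme"), Spec("inline::acme")]

requested = "remote::acme"
registry: dict[str, Spec] = {}
spec = get_provider_spec()
if isinstance(spec, list):
    # Keep only the entries matching the provider type being registered.
    for s in spec:
        if s.provider_type == requested:
            registry[requested] = s
else:
    registry[requested] = spec
print(registry)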
@@ -374,6 +374,10 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
        body = options.params or {}
        body |= options.json_data or {}

        # Merge extra_json parameters (extra_body from SDK is converted to extra_json)
        if hasattr(options, "extra_json") and options.extra_json:
            body |= options.extra_json

        matched_func, path_params, route_path, webmethod = find_matching_route(options.method, path, self.route_impls)
        body |= path_params
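A hedged illustration of the merge above, outside the library's actual plumbing: an SDK-style extra_body, surfaced as extra_json, is folded into the request body alongside the regular parameters. The field names in the dictionaries are made up.

# Stand-ins for options.params/json_data and options.extra_json.
body = {"model": "llama3", "stream": False}
extra_json = {"guided_choice": ["yes", "no"]}  # e.g. what an SDK converts extra_body into
if extra_json:
    body |= extra_json
print(body)  # {'model': 'llama3', 'stream': False, 'guided_choice': ['yes', 'no']}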
@@ -10,6 +10,7 @@ from typing import Any
from llama_stack.apis.agents import Agents
from llama_stack.apis.batches import Batches
from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.conversations import Conversations
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.datatypes import ExternalApiSpec

@@ -96,6 +97,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
        Api.tool_runtime: ToolRuntime,
        Api.files: Files,
        Api.prompts: Prompts,
        Api.conversations: Conversations,
    }

    if external_apis:
@@ -27,7 +27,6 @@ from llama_stack.apis.inference import (
    CompletionResponseStreamChunk,
    Inference,
    ListOpenAIChatCompletionResponse,
    LogProbConfig,
    Message,
    OpenAIAssistantMessageParam,
    OpenAIChatCompletion,

@@ -42,12 +41,7 @@ from llama_stack.apis.inference import (
    OpenAIMessageParam,
    OpenAIResponseFormatParam,
    Order,
    ResponseFormat,
    SamplingParams,
    StopReason,
    ToolChoice,
    ToolConfig,
    ToolDefinition,
    ToolPromptFormat,
)
from llama_stack.apis.models import Model, ModelType
@@ -185,129 +179,6 @@ class InferenceRouter(Inference):
            raise ModelTypeError(model_id, model.model_type, expected_model_type)
        return model

    async def chat_completion(
        self,
        model_id: str,
        messages: list[Message],
        sampling_params: SamplingParams | None = None,
        response_format: ResponseFormat | None = None,
        tools: list[ToolDefinition] | None = None,
        tool_choice: ToolChoice | None = None,
        tool_prompt_format: ToolPromptFormat | None = None,
        stream: bool | None = False,
        logprobs: LogProbConfig | None = None,
        tool_config: ToolConfig | None = None,
    ) -> ChatCompletionResponse | AsyncIterator[ChatCompletionResponseStreamChunk]:
        logger.debug(
            f"InferenceRouter.chat_completion: {model_id=}, {stream=}, {messages=}, {tools=}, {tool_config=}, {response_format=}",
        )
        if sampling_params is None:
            sampling_params = SamplingParams()
        model = await self._get_model(model_id, ModelType.llm)
        if tool_config:
            if tool_choice and tool_choice != tool_config.tool_choice:
                raise ValueError("tool_choice and tool_config.tool_choice must match")
            if tool_prompt_format and tool_prompt_format != tool_config.tool_prompt_format:
                raise ValueError("tool_prompt_format and tool_config.tool_prompt_format must match")
        else:
            params = {}
            if tool_choice:
                params["tool_choice"] = tool_choice
            if tool_prompt_format:
                params["tool_prompt_format"] = tool_prompt_format
            tool_config = ToolConfig(**params)

        tools = tools or []
        if tool_config.tool_choice == ToolChoice.none:
            tools = []
        elif tool_config.tool_choice == ToolChoice.auto:
            pass
        elif tool_config.tool_choice == ToolChoice.required:
            pass
        else:
            # verify tool_choice is one of the tools
            tool_names = [t.tool_name if isinstance(t.tool_name, str) else t.tool_name.value for t in tools]
            if tool_config.tool_choice not in tool_names:
                raise ValueError(f"Tool choice {tool_config.tool_choice} is not one of the tools: {tool_names}")

        params = dict(
            model_id=model_id,
            messages=messages,
            sampling_params=sampling_params,
            tools=tools,
            tool_choice=tool_choice,
            tool_prompt_format=tool_prompt_format,
            response_format=response_format,
            stream=stream,
            logprobs=logprobs,
            tool_config=tool_config,
        )
        provider = await self.routing_table.get_provider_impl(model_id)
        prompt_tokens = await self._count_tokens(messages, tool_config.tool_prompt_format)

        if stream:
            response_stream = await provider.chat_completion(**params)
            return self.stream_tokens_and_compute_metrics(
                response=response_stream,
                prompt_tokens=prompt_tokens,
                model=model,
                tool_prompt_format=tool_config.tool_prompt_format,
            )

        response = await provider.chat_completion(**params)
        metrics = await self.count_tokens_and_compute_metrics(
            response=response,
            prompt_tokens=prompt_tokens,
            model=model,
            tool_prompt_format=tool_config.tool_prompt_format,
        )
        # these metrics will show up in the client response.
        response.metrics = (
            metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
        )
        return response

    async def completion(
        self,
        model_id: str,
        content: InterleavedContent,
        sampling_params: SamplingParams | None = None,
        response_format: ResponseFormat | None = None,
        stream: bool | None = False,
        logprobs: LogProbConfig | None = None,
    ) -> AsyncGenerator:
        if sampling_params is None:
            sampling_params = SamplingParams()
        logger.debug(
            f"InferenceRouter.completion: {model_id=}, {stream=}, {content=}, {sampling_params=}, {response_format=}",
        )
        model = await self._get_model(model_id, ModelType.llm)
        provider = await self.routing_table.get_provider_impl(model_id)
        params = dict(
            model_id=model_id,
            content=content,
            sampling_params=sampling_params,
            response_format=response_format,
            stream=stream,
            logprobs=logprobs,
        )

        prompt_tokens = await self._count_tokens(content)
        response = await provider.completion(**params)
        if stream:
            return self.stream_tokens_and_compute_metrics(
                response=response,
                prompt_tokens=prompt_tokens,
                model=model,
            )

        metrics = await self.count_tokens_and_compute_metrics(
            response=response, prompt_tokens=prompt_tokens, model=model
        )
        response.metrics = metrics if response.metrics is None else response.metrics + metrics

        return response

    async def openai_completion(
        self,
        model: str,
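With the router's bespoke chat_completion() and completion() methods removed above, only the openai_* surface remains on the router. A hedged client-side sketch of calling that OpenAI-compatible path is below; the base URL, endpoint prefix, and model id are assumptions, not taken from this diff.

from openai import OpenAI

# Assumed local stack endpoint exposing the OpenAI-compatible API.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
resp = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Hello"}],
)
print(resp.choices[0].message.content)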
@@ -11,7 +11,7 @@ from llama_stack.apis.common.content_types import (
    InterleavedContent,
)
from llama_stack.apis.tools import (
    ListToolsResponse,
    ListToolDefsResponse,
    RAGDocument,
    RAGQueryConfig,
    RAGQueryResult,

@@ -86,6 +86,6 @@ class ToolRuntimeRouter(ToolRuntime):

    async def list_runtime_tools(
        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
    ) -> ListToolsResponse:
    ) -> ListToolDefsResponse:
        logger.debug(f"ToolRuntimeRouter.list_runtime_tools: {tool_group_id}")
        return await self.routing_table.list_tools(tool_group_id)
@@ -8,7 +8,7 @@ from typing import Any

from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.errors import ToolGroupNotFoundError
from llama_stack.apis.tools import ListToolGroupsResponse, ListToolsResponse, Tool, ToolGroup, ToolGroups
from llama_stack.apis.tools import ListToolDefsResponse, ListToolGroupsResponse, ToolDef, ToolGroup, ToolGroups
from llama_stack.core.datatypes import AuthenticationRequiredError, ToolGroupWithOwner
from llama_stack.log import get_logger

@@ -27,7 +27,7 @@ def parse_toolgroup_from_toolgroup_name_pair(toolgroup_name_with_maybe_tool_name


class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
    toolgroups_to_tools: dict[str, list[Tool]] = {}
    toolgroups_to_tools: dict[str, list[ToolDef]] = {}
    tool_to_toolgroup: dict[str, str] = {}

    # overridden

@@ -43,7 +43,7 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
            routing_key = self.tool_to_toolgroup[routing_key]
        return await super().get_provider_impl(routing_key, provider_id)

    async def list_tools(self, toolgroup_id: str | None = None) -> ListToolsResponse:
    async def list_tools(self, toolgroup_id: str | None = None) -> ListToolDefsResponse:
        if toolgroup_id:
            if group_id := parse_toolgroup_from_toolgroup_name_pair(toolgroup_id):
                toolgroup_id = group_id

@@ -68,30 +68,19 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
                continue
            all_tools.extend(self.toolgroups_to_tools[toolgroup.identifier])

        return ListToolsResponse(data=all_tools)
        return ListToolDefsResponse(data=all_tools)

    async def _index_tools(self, toolgroup: ToolGroup):
        provider_impl = await super().get_provider_impl(toolgroup.identifier, toolgroup.provider_id)
        tooldefs_response = await provider_impl.list_runtime_tools(toolgroup.identifier, toolgroup.mcp_endpoint)

        # TODO: kill this Tool vs ToolDef distinction
        tooldefs = tooldefs_response.data
        tools = []
        for t in tooldefs:
            tools.append(
                Tool(
                    identifier=t.name,
                    toolgroup_id=toolgroup.identifier,
                    description=t.description or "",
                    parameters=t.parameters or [],
                    metadata=t.metadata,
                    provider_id=toolgroup.provider_id,
                )
            )
            t.toolgroup_id = toolgroup.identifier

        self.toolgroups_to_tools[toolgroup.identifier] = tools
        for tool in tools:
            self.tool_to_toolgroup[tool.identifier] = toolgroup.identifier
        self.toolgroups_to_tools[toolgroup.identifier] = tooldefs
        for tool in tooldefs:
            self.tool_to_toolgroup[tool.name] = toolgroup.identifier

    async def list_tool_groups(self) -> ListToolGroupsResponse:
        return ListToolGroupsResponse(data=await self.get_all_with_type("tool_group"))
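A self-contained sketch of the simplified indexing above: the provider's tool definitions are stored directly and keyed back to their toolgroup by name, instead of being wrapped in separate Tool objects. ToolDefStub is a stand-in for the real ToolDef model, and the group and tool names are invented.

from dataclasses import dataclass

@dataclass
class ToolDefStub:
    name: str
    toolgroup_id: str | None = None

toolgroups_to_tools: dict[str, list[ToolDefStub]] = {}
tool_to_toolgroup: dict[str, str] = {}

group = "builtin::example"
tooldefs = [ToolDefStub("web_search"), ToolDefStub("query_db")]
for t in tooldefs:
    t.toolgroup_id = group          # tag each definition with its group
toolgroups_to_tools[group] = tooldefs
for tool in tooldefs:
    tool_to_toolgroup[tool.name] = group  # reverse index keyed on the tool name
print(tool_to_toolgroup)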
@@ -102,12 +91,12 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
            raise ToolGroupNotFoundError(toolgroup_id)
        return tool_group

    async def get_tool(self, tool_name: str) -> Tool:
    async def get_tool(self, tool_name: str) -> ToolDef:
        if tool_name in self.tool_to_toolgroup:
            toolgroup_id = self.tool_to_toolgroup[tool_name]
            tools = self.toolgroups_to_tools[toolgroup_id]
            for tool in tools:
                if tool.identifier == tool_name:
                if tool.name == tool_name:
                    return tool
        raise ValueError(f"Tool '{tool_name}' not found")

@@ -132,7 +121,6 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
        # baked in some of the code and tests right now.
        if not toolgroup.mcp_endpoint:
            await self._index_tools(toolgroup)
        return toolgroup

    async def unregister_toolgroup(self, toolgroup_id: str) -> None:
        await self.unregister_object(await self.get_tool_group(toolgroup_id))
@@ -4,7 +4,6 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

import argparse
import asyncio
import concurrent.futures
import functools

@@ -12,7 +11,6 @@ import inspect
import json
import logging # allow-direct-logging
import os
import ssl
import sys
import traceback
import warnings

@@ -35,7 +33,6 @@ from pydantic import BaseModel, ValidationError

from llama_stack.apis.common.errors import ConflictError, ResourceNotFoundError
from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.cli.utils import add_config_distro_args, get_config_from_args
from llama_stack.core.access_control.access_control import AccessDeniedError
from llama_stack.core.datatypes import (
    AuthenticationRequiredError,

@@ -55,7 +52,6 @@ from llama_stack.core.stack import (
    Stack,
    cast_image_name_to_string,
    replace_env_vars,
    validate_env_pair,
)
from llama_stack.core.utils.config import redact_sensitive_fields
from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
@@ -257,7 +253,7 @@ def create_dynamic_typed_route(func: Any, method: str, route: str) -> Callable:

            return result
        except Exception as e:
            if logger.isEnabledFor(logging.DEBUG):
            if logger.isEnabledFor(logging.INFO):
                logger.exception(f"Error executing endpoint {route=} {method=}")
            else:
                logger.error(f"Error executing endpoint {route=} {method=}: {str(e)}")
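A minimal standalone sketch of the behavior the hunk above switches on: at INFO (or more verbose) the full traceback is emitted via logger.exception, otherwise only a one-line error is logged. The logger name and route string are illustrative.

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("server")

try:
    raise RuntimeError("boom")
except Exception as e:
    if logger.isEnabledFor(logging.INFO):
        # Emits the message plus the traceback of the active exception.
        logger.exception("Error executing endpoint route='/v1/models' method='GET'")
    else:
        # Quieter levels: single-line error only.
        logger.error(f"Error executing endpoint: {e}")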
@@ -333,23 +329,18 @@ class ClientVersionMiddleware:
        return await self.app(scope, receive, send)


def create_app(
    config_file: str | None = None,
    env_vars: list[str] | None = None,
) -> StackApp:
def create_app() -> StackApp:
    """Create and configure the FastAPI application.

    Args:
        config_file: Path to config file. If None, uses LLAMA_STACK_CONFIG env var or default resolution.
        env_vars: List of environment variables in KEY=value format.
        disable_version_check: Whether to disable version checking. If None, uses LLAMA_STACK_DISABLE_VERSION_CHECK env var.
    This factory function reads configuration from environment variables:
    - LLAMA_STACK_CONFIG: Path to config file (required)

    Returns:
        Configured StackApp instance.
    """
    config_file = config_file or os.getenv("LLAMA_STACK_CONFIG")
    config_file = os.getenv("LLAMA_STACK_CONFIG")
    if config_file is None:
        raise ValueError("No config file provided and LLAMA_STACK_CONFIG env var is not set")
        raise ValueError("LLAMA_STACK_CONFIG environment variable is required")

    config_file = resolve_config_or_distro(config_file, Mode.RUN)
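A hedged usage sketch for the factory change above: with create_app() taking no arguments and reading LLAMA_STACK_CONFIG from the environment, a launcher only needs to export that variable before building the app. The import path and config name are assumptions, not taken from this diff.

import os

os.environ["LLAMA_STACK_CONFIG"] = "run.yaml"  # path or distro name (assumed example)

from llama_stack.core.server.server import create_app  # import path assumed

app = create_app()  # raises ValueError if LLAMA_STACK_CONFIG is unset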
@@ -361,16 +352,6 @@ def create_app(
        logger_config = LoggingConfig(**cfg)
        logger = get_logger(name=__name__, category="core::server", config=logger_config)

    if env_vars:
        for env_pair in env_vars:
            try:
                key, value = validate_env_pair(env_pair)
                logger.info(f"Setting environment variable {key} => {value}")
                os.environ[key] = value
            except ValueError as e:
                logger.error(f"Error: {str(e)}")
                raise ValueError(f"Invalid environment variable format: {env_pair}") from e

    config = replace_env_vars(config_contents)
    config = StackRunConfig(**cast_image_name_to_string(config))
@@ -451,6 +432,7 @@ def create_app(
    apis_to_serve.add("inspect")
    apis_to_serve.add("providers")
    apis_to_serve.add("prompts")
    apis_to_serve.add("conversations")
    for api_str in apis_to_serve:
        api = Api(api_str)
@@ -493,101 +475,6 @@ def create_app(
    return app


def main(args: argparse.Namespace | None = None):
    """Start the LlamaStack server."""
    parser = argparse.ArgumentParser(description="Start the LlamaStack server.")

    add_config_distro_args(parser)
    parser.add_argument(
        "--port",
        type=int,
        default=int(os.getenv("LLAMA_STACK_PORT", 8321)),
        help="Port to listen on",
    )
    parser.add_argument(
        "--env",
        action="append",
        help="Environment variables in KEY=value format. Can be specified multiple times.",
    )

    # Determine whether the server args are being passed by the "run" command, if this is the case
    # the args will be passed as a Namespace object to the main function, otherwise they will be
    # parsed from the command line
    if args is None:
        args = parser.parse_args()

    config_or_distro = get_config_from_args(args)

    try:
        app = create_app(
            config_file=config_or_distro,
            env_vars=args.env,
        )
    except Exception as e:
        logger.error(f"Error creating app: {str(e)}")
        sys.exit(1)

    config_file = resolve_config_or_distro(config_or_distro, Mode.RUN)
    with open(config_file) as fp:
        config_contents = yaml.safe_load(fp)
    if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")):
        logger_config = LoggingConfig(**cfg)
    else:
        logger_config = None
    config = StackRunConfig(**cast_image_name_to_string(replace_env_vars(config_contents)))

    import uvicorn

    # Configure SSL if certificates are provided
    port = args.port or config.server.port

    ssl_config = None
    keyfile = config.server.tls_keyfile
    certfile = config.server.tls_certfile

    if keyfile and certfile:
        ssl_config = {
            "ssl_keyfile": keyfile,
            "ssl_certfile": certfile,
        }
        if config.server.tls_cafile:
            ssl_config["ssl_ca_certs"] = config.server.tls_cafile
            ssl_config["ssl_cert_reqs"] = ssl.CERT_REQUIRED
            logger.info(
                f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}\n CA: {config.server.tls_cafile}"
            )
        else:
            logger.info(f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}")

    listen_host = config.server.host or ["::", "0.0.0.0"]
    logger.info(f"Listening on {listen_host}:{port}")

    uvicorn_config = {
        "app": app,
        "host": listen_host,
        "port": port,
        "lifespan": "on",
        "log_level": logger.getEffectiveLevel(),
        "log_config": logger_config,
    }
    if ssl_config:
        uvicorn_config.update(ssl_config)

    # We need to catch KeyboardInterrupt because uvicorn's signal handling
    # re-raises SIGINT signals using signal.raise_signal(), which Python
    # converts to KeyboardInterrupt. Without this catch, we'd get a confusing
    # stack trace when using Ctrl+C or kill -2 (SIGINT).
    # SIGTERM (kill -15) works fine without this because Python doesn't
    # have a default handler for it.
    #
    # Another approach would be to ignore SIGINT entirely - let uvicorn handle it through its own
    # signal handling but this is quite intrusive and not worth the effort.
    try:
        asyncio.run(uvicorn.Server(uvicorn.Config(**uvicorn_config)).serve())
    except (KeyboardInterrupt, SystemExit):
        logger.info("Received interrupt signal, shutting down gracefully...")


def _log_run_config(run_config: StackRunConfig):
    """Logs the run config with redacted fields and disabled providers removed."""
    logger.info("Run configuration:")
@@ -614,7 +501,3 @@ def remove_disabled_providers(obj):
        return [item for item in (remove_disabled_providers(i) for i in obj) if item is not None]
    else:
        return obj


if __name__ == "__main__":
    main()