RHAIENG-565: purge the midstream repo content to only host the build artifacts, so only the redhat-distribution should remain

2025-12-17 22:27:15 +00:00 · 2025-08-12 12:54:32 +01:00 · 2025-08-12 12:54:32 +01:00 · db484734b4
commit db484734b4
parent 9803329350
1126 changed files with 0 additions and 526647 deletions
--- a/.coveragerc
+++ b/.coveragerc
@ -1,6 +0,0 @@
 # coverage.py settings: paths matching any glob below are left out of measurement.
 [run]
 omit =
    .venv/*
    */llama_stack/templates/*
    */llama_stack/providers/*
    */tests/*
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -1,5 +0,0 @@
 # Each line is a file pattern followed by one or more owners.
 # The owners on the `*` line below are the default owners for everything
 # in the repo: unless a later match takes precedence, they are the ones
 # requested for review when a pull request is opened.
 * @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist
--- a/.github/ISSUE_TEMPLATE/bug.yml
+++ b/.github/ISSUE_TEMPLATE/bug.yml
@ -1,77 +0,0 @@
 # GitHub issue form for bug reports.
 name: 🐛 Bug Report
 description: Create a report to help us reproduce and fix the bug
 labels:
  - bug
 body:
  # Reminder to search existing issues first.
  - type: markdown
    attributes:
      value: >
        #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the
        existing and past issues](https://github.com/meta-llama/llama-stack/issues).
  # Environment details (required).
  - type: textarea
    id: system-info
    validations:
      required: true
    attributes:
      label: System Info
      placeholder: |
        PyTorch version, CUDA version, GPU type, #num of GPUs...
      description: |
        Please share your system info with us. You can use the following command to capture your environment information
        python -m "torch.utils.collect_env"
  # Which kind of script triggered the problem.
  - type: checkboxes
    id: information-scripts-examples
    attributes:
      label: Information
      description: "The problem arises when using:"
      options:
        - label: The official example scripts
        - label: My own modified scripts
  # Free-form description of the bug (required).
  - type: textarea
    id: bug-description
    validations:
      required: true
    attributes:
      label: 🐛 Describe the bug
      placeholder: |
        A clear and concise description of what the bug is.
        ```llama stack
        # Command that you used for running the examples
        ```
        Description of the results
      description: |
        Please provide a clear and concise description of what the bug is.
        Please also paste or describe the results you observe instead of the expected results.
  # Full traceback (required).
  - type: textarea
    validations:
      required: true
    attributes:
      label: Error logs
      placeholder: |
        ```
        The error message you got, with the full traceback.
        ```
      description: |
        If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
  # What the reporter expected instead (required).
  - type: textarea
    id: expected-behavior
    validations:
      required: true
    attributes:
      label: Expected behavior
      description: A clear and concise description of what you would expect to happen.
  # Closing note.
  - type: markdown
    attributes:
      value: >
        Thanks for contributing 🎉!
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@ -1,12 +0,0 @@
 # Issue-template chooser: blank issues are turned off and the links below
 # are offered next to the issue forms.
 blank_issues_enabled: false
 contact_links:
  - name: Have you read the docs?
    about: Much help can be found in the docs
    url: https://llama-stack.readthedocs.io/en/latest/index.html
  - name: Start a discussion
    about: Start a discussion on a topic
    url: https://github.com/meta-llama/llama-stack/discussions/new
  - name: Chat on Discord
    about: Maybe chatting with the community can help
    url: https://discord.gg/llama-stack
--- a/.github/ISSUE_TEMPLATE/feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/feature-request.yml
@ -1,28 +0,0 @@
 # GitHub issue form for feature requests.
 name: 🚀 Feature request
 description: Request a new llama-stack feature
 labels:
 - enhancement
 body:
 # What should be built (required).
 - type: textarea
  id: feature-pitch
  validations:
    required: true
  attributes:
    label: 🚀 Describe the new functionality needed
    description: >
      A clear and concise description of _what_ needs to be built.
 # Why it matters (required).
 - type: textarea
  id: feature-motivation
  validations:
    required: true
  attributes:
    label: 💡 Why is this needed? What if we don't build it?
    description: >
      A clear and concise description of _why_ this functionality is needed.
 # Optional trade-off notes.
 - type: textarea
  id: other-thoughts
  attributes:
    label: Other thoughts
    description: >
      Any thoughts about how this may result in complexity in the codebase, or other trade-offs.
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -1,8 +0,0 @@
 # What does this PR do?
 <!-- Provide a short summary of what this PR does and why. Link to relevant issues if applicable. -->
 <!-- If resolving an issue, uncomment and update the line below -->
 <!-- Closes #[issue-number] -->
 ## Test Plan
 <!-- Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.* -->
--- a/.github/TRIAGERS.md
+++ b/.github/TRIAGERS.md
@ -1,2 +0,0 @@
 # This file documents Triage members in the Llama Stack community
 @bbrowning @booxter @franciscojavierarceo @leseb
--- a/.github/actions/setup-ollama/action.yml
+++ b/.github/actions/setup-ollama/action.yml
@ -1,9 +0,0 @@
 # Composite action that launches an Ollama container for CI jobs.
 name: Setup Ollama
 description: Start Ollama
 runs:
  using: composite
  steps:
    - name: Start Ollama
      # Detached container named "ollama" with port 11434 published.
      run: |
        docker run -d --name ollama -p 11434:11434 docker.io/leseb/ollama-with-models
      shell: bash
--- a/.github/actions/setup-runner/action.yml
+++ b/.github/actions/setup-runner/action.yml
@ -1,27 +0,0 @@
 # Composite action: installs uv with the requested Python, then the project
 # and its test-time dependencies.
 name: Setup runner
 description: Prepare a runner for the tests (install uv, python, project dependencies, etc.)
 inputs:
  python-version:
    # Quoted so the value stays the string "3.10" rather than the float 3.1.
    default: "3.10"
    required: false
    description: The Python version to use
 runs:
  using: composite
  steps:
    - name: Install uv
      uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
      with:
        version: 0.7.6
        activate-environment: true
        python-version: ${{ inputs.python-version }}
    - name: Install dependencies
      run: |
        uv sync --all-groups
        uv pip install ollama faiss-cpu
        # always test against the latest version of the client
        # TODO: this is not necessarily a good idea. we need to test against both published and latest
        # to find out backwards compatibility issues.
        uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
        uv pip install -e .
      shell: bash
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -1,23 +0,0 @@
 # GitHub Dependabot configuration
 version: 2
 updates:
  # Enable version updates for GitHub Actions
  - package-ecosystem: github-actions
    directory: '/' # Will use the default workflow location of `.github/workflows`
    commit-message:
      prefix: chore(github-deps)
    schedule:
      day: saturday
      interval: weekly
  # Updates for the "uv" ecosystem
  - package-ecosystem: uv
    directory: '/'
    # ignore all non-security updates: https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#open-pull-requests-limit
    open-pull-requests-limit: 0
    commit-message:
      prefix: chore(python-deps)
    labels:
      - type/dependencies
      - python
    schedule:
      day: saturday
      interval: weekly
--- a/.github/workflows/changelog.yml
+++ b/.github/workflows/changelog.yml
@ -1,29 +0,0 @@
 # Regenerates the changelog on release events and proposes it as a pull request.
 name: Update Changelog
 on:
  release:
    types:
      - published
      - unpublished
      - created
      - edited
      - deleted
      - released
 # Read-only by default; the job below widens only what it needs.
 permissions:
  contents: read
 jobs:
  generate_changelog:
    name: Generate changelog
    runs-on: ubuntu-latest
    permissions:
      contents: write  # for peter-evans/create-pull-request to create branch
      pull-requests: write  # for peter-evans/create-pull-request to create a PR
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
          ref: main
      - run: |
          python ./scripts/gen-changelog.py
      - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
        with:
          branch: create-pull-request/changelog
          signoff: true
          title: "docs: update CHANGELOG.md for ${{ github.ref_name }}"
          commit-message: "docs: update CHANGELOG.md for ${{ github.ref_name }}"
--- a/.github/workflows/gha_workflow_llama_stack_tests.yml
+++ b/.github/workflows/gha_workflow_llama_stack_tests.yml
@ -1,355 +0,0 @@
 # Manually dispatched workflow: runs the provider tests under torchrun/pytest
 # on the runner named by the `runner` input and merges the per-test JUnit
 # reports into a single summary.
 name: "Run Llama-stack Tests"
 on:
  #### Temporarily disable PR runs until tests run as intended within mainline.
  #TODO Add this back.
  #pull_request_target:
  #  types: ["opened"]
  #  branches:
  #    - 'main'
  #  paths:
  #    - 'llama_stack/**/*.py'
  #    - 'tests/**/*.py'
  workflow_dispatch:
    inputs:
      runner:
        description: 'GHA Runner Scale Set label to run workflow on.'
        required: true
        default: "llama-stack-gha-runner-gpu"
      checkout_reference:
        description: "The branch, tag, or SHA to checkout"
        required: true
        default: "main"
      debug:
        description: 'Run debugging steps?'
        required: false
        default: "true"
      sleep_time:
        description: '[DEBUG] sleep time for debugging'
        required: true
        default: "0"
      provider_id:
        description: 'ID of your provider'
        required: true
        default: "meta_reference"
      model_id:
        description: 'Shorthand name for target model ID (llama_3b or llama_8b)'
        required: true
        default: "llama_3b"
      model_override_3b:
        description: 'Specify shorthand model for <llama_3b> '
        required: false
        default: "Llama3.2-3B-Instruct"
      model_override_8b:
        description: 'Specify shorthand model for <llama_8b> '
        required: false
        default: "Llama3.1-8B-Instruct"
 env:
  # ID used for each test's provider config
  PROVIDER_ID: "${{ inputs.provider_id || 'meta_reference' }}"
  # Path to model checkpoints within EFS volume
  MODEL_CHECKPOINT_DIR: "/data/llama"
  # Path to directory to run tests from
  TESTS_PATH: "${{ github.workspace }}/llama_stack/providers/tests"
  # Keep track of a list of model IDs that are valid to use within pytest fixture marks
  AVAILABLE_MODEL_IDs: "llama_3b llama_8b"
  # Shorthand name for model ID, used in pytest fixture marks
  MODEL_ID: "${{ inputs.model_id || 'llama_3b' }}"
  # Override the `llama_3b` / `llama_8b` models, else use the default.
  LLAMA_3B_OVERRIDE: "${{ inputs.model_override_3b || 'Llama3.2-3B-Instruct' }}"
  LLAMA_8B_OVERRIDE: "${{ inputs.model_override_8b || 'Llama3.1-8B-Instruct' }}"
  # Defines which directories in TESTS_PATH to exclude from the test loop
  EXCLUDED_DIRS: "__pycache__"
  # Defines the output xml reports generated after a test is run
  REPORTS_GEN: ""
 jobs:
  execute_workflow:
    name: Execute workload on Self-Hosted GPU k8s runner
    permissions:
      pull-requests: write
    defaults:
      run:
        shell: bash
    runs-on: ${{ inputs.runner != '' && inputs.runner || 'llama-stack-gha-runner-gpu' }}
    if: always()
    steps:
      ##############################
      #### INITIAL DEBUG CHECKS ####
      ##############################
      - name: "[DEBUG] Check content of the EFS mount"
        id: debug_efs_volume
        continue-on-error: true
        if: inputs.debug == 'true'
        run: |
            echo "========= Content of the EFS mount ============="
            # Read the workflow-level env var from the shell instead of
            # templating it into the script text.
            ls -la "${MODEL_CHECKPOINT_DIR}"
      - name: "[DEBUG] Get runner container OS information"
        id: debug_os_info
        if: ${{ inputs.debug == 'true' }}
        run: |
            cat /etc/os-release
      - name: "[DEBUG] Print environment variables"
        id: debug_env_vars
        if: ${{ inputs.debug == 'true' }}
        run: |
            echo "PROVIDER_ID = ${PROVIDER_ID}"
            echo "MODEL_CHECKPOINT_DIR = ${MODEL_CHECKPOINT_DIR}"
            echo "AVAILABLE_MODEL_IDs = ${AVAILABLE_MODEL_IDs}"
            echo "MODEL_ID = ${MODEL_ID}"
            echo "LLAMA_3B_OVERRIDE = ${LLAMA_3B_OVERRIDE}"
            echo "LLAMA_8B_OVERRIDE = ${LLAMA_8B_OVERRIDE}"
            echo "EXCLUDED_DIRS = ${EXCLUDED_DIRS}"
            echo "REPORTS_GEN = ${REPORTS_GEN}"
      ############################
      #### MODEL INPUT CHECKS ####
      ############################
      - name: "Check if env.model_id is valid"
        id: check_model_id
        run: |
          # AVAILABLE_MODEL_IDs is a space-separated string, not a bash array;
          # padding both sides with spaces gives a whole-word match.
          if [[ " ${AVAILABLE_MODEL_IDs} " =~ " ${MODEL_ID} " ]]; then
            echo "Model ID '${MODEL_ID}' is valid."
          else
            echo "Model ID '${MODEL_ID}' is invalid. Terminating workflow."
            exit 1
          fi
      #######################
      #### CODE CHECKOUT ####
      #######################
      - name: "Checkout 'meta-llama/llama-stack' repository"
        id: checkout_repo
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # `checkout_reference` is the input declared under workflow_dispatch;
          # no `branch` input exists, so the old expression was always empty.
          ref: ${{ inputs.checkout_reference }}
      - name: "[DEBUG] Content of the repository after checkout"
        id: debug_content_after_checkout
        if: ${{ inputs.debug == 'true' }}
        run: |
            ls -la ${GITHUB_WORKSPACE}
      ##########################################################
      ####              OPTIONAL SLEEP DEBUG                ####
      #                                                        #
      # Use to "exec" into the test k8s POD and run tests      #
      # manually to identify what dependencies are being used. #
      #                                                        #
      ##########################################################
      - name: "[DEBUG] sleep"
        id: debug_sleep
        if: ${{ inputs.debug == 'true' && inputs.sleep_time != '' }}
        env:
          # Hand the free-form input to the shell as data, not as script text.
          SLEEP_TIME: ${{ inputs.sleep_time }}
        run: |
            sleep "${SLEEP_TIME}"
      ############################
      #### UPDATE SYSTEM PATH ####
      ############################
      - name: "Update path: execute"
        id: path_update_exec
        run: |
          # .local/bin is needed for certain libraries installed below to be recognized
          # when calling their executable to install sub-dependencies
          mkdir -p ${HOME}/.local/bin
          echo "${HOME}/.local/bin" >> "$GITHUB_PATH"
      #####################################
      #### UPDATE CHECKPOINT DIRECTORY ####
      #####################################
      - name: "Update checkpoint directory"
        id: checkpoint_update
        run: |
          echo "Checkpoint directory: ${MODEL_CHECKPOINT_DIR}/$LLAMA_3B_OVERRIDE"
          if [ "${MODEL_ID}" = "llama_3b" ] && [ -d "${MODEL_CHECKPOINT_DIR}/${LLAMA_3B_OVERRIDE}" ]; then
            echo "MODEL_CHECKPOINT_DIR=${MODEL_CHECKPOINT_DIR}/${LLAMA_3B_OVERRIDE}" >> "$GITHUB_ENV"
          elif [ "${MODEL_ID}" = "llama_8b" ] && [ -d "${MODEL_CHECKPOINT_DIR}/${LLAMA_8B_OVERRIDE}" ]; then
            echo "MODEL_CHECKPOINT_DIR=${MODEL_CHECKPOINT_DIR}/${LLAMA_8B_OVERRIDE}" >> "$GITHUB_ENV"
          else
            echo "MODEL_ID & LLAMA_*B_OVERRIDE are not a valid pairing. Terminating workflow."
            exit 1
          fi
      - name: "[DEBUG] Checkpoint update check"
        id: debug_checkpoint_update
        if: ${{ inputs.debug == 'true' }}
        run: |
          echo "MODEL_CHECKPOINT_DIR (after update) = ${MODEL_CHECKPOINT_DIR}"
      ##################################
      #### DEPENDENCY INSTALLATIONS ####
      ##################################
      - name: "Installing 'apt' required packages"
        id: install_apt
        run: |
          echo "[STEP] Installing 'apt' required packages"
          sudo apt update -y
          sudo apt install -y python3 python3-pip npm wget
      - name: "Installing packages with 'curl'"
        id: install_curl
        run: |
          curl -fsSL https://ollama.com/install.sh | sh
      - name: "Installing packages with 'wget'"
        id: install_wget
        run: |
          wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
          chmod +x Miniconda3-latest-Linux-x86_64.sh
          # Batch-mode install into the prefix that is added to PATH below.
          # The installer is not `conda`: the former `install -c ... faiss-gpu`
          # arguments did not belong here; faiss-gpu is installed by the
          # dedicated conda step further down.
          ./Miniconda3-latest-Linux-x86_64.sh -b -p "${HOME}/miniconda3"
          # Add miniconda3 bin to system path
          echo "${HOME}/miniconda3/bin" >> "$GITHUB_PATH"
      - name: "Installing packages with 'npm'"
        id: install_npm_generic
        run: |
          sudo npm install -g junit-merge
      - name: "Installing pip dependencies"
        id: install_pip_generic
        run: |
          echo "[STEP] Installing 'llama-stack' models"
          pip install -U pip setuptools
          pip install -r requirements.txt
          pip install -e .
          pip install -U \
            torch torchvision \
            pytest pytest_asyncio \
            fairscale lm-format-enforcer \
            zmq chardet pypdf \
            pandas sentence_transformers together \
            aiosqlite
      - name: "Installing packages with conda"
        id: install_conda_generic
        run: |
          conda install -q -c pytorch -c nvidia faiss-gpu=1.9.0
      #############################################################
      #### TESTING TO BE DONE FOR BOTH PRS AND MANUAL DISPATCH ####
      #############################################################
      - name: "Run Tests: Loop"
        id: run_tests_loop
        working-directory: "${{ github.workspace }}"
        run: |
          pattern=""
          for dir in llama_stack/providers/tests/*; do
            if [ -d "$dir" ]; then
              dir_name=$(basename "$dir")
              if [[ ! " $EXCLUDED_DIRS " =~ " $dir_name " ]]; then
                for file in "$dir"/test_*.py; do
                  # An unmatched glob stays literal; skip it so no report name
                  # is recorded for a test file that does not exist.
                  [ -e "$file" ] || continue
                  test_name=$(basename "$file")
                  new_file="result-${dir_name}-${test_name}.xml"
                  if torchrun $(which pytest) -s -v ${TESTS_PATH}/${dir_name}/${test_name} -m "${PROVIDER_ID} and ${MODEL_ID}" \
                     --junitxml="${{ github.workspace }}/${new_file}"; then
                    echo "Ran test: ${test_name}"
                  else
                    echo "Did NOT run test: ${test_name}"
                  fi
                  pattern+="${new_file} "
                done
              fi
            fi
          done
          echo "REPORTS_GEN=$pattern" >> "$GITHUB_ENV"
      - name: "Test Summary: Merge"
        id: test_summary_merge
        working-directory: "${{ github.workspace }}"
        run: |
          echo "Merging the following test result files: ${REPORTS_GEN}"
          # Defaults to merging them into 'merged-test-results.xml'
          # Left unquoted on purpose: the space-separated list must split into
          # one argument per report file.
          junit-merge ${REPORTS_GEN}
      ############################################
      #### AUTOMATIC TESTING ON PULL REQUESTS ####
      ############################################
      #### Run tests ####
      - name: "PR - Run Tests"
        id: pr_run_tests
        working-directory: "${{ github.workspace }}"
        if: github.event_name == 'pull_request_target'
        run: |
          echo "[STEP] Running PyTest tests at 'GITHUB_WORKSPACE' path: ${GITHUB_WORKSPACE} | path: ${{ github.workspace }}"
          # (Optional) Add more tests here.
          # Merge test results with 'merged-test-results.xml' from above.
          # junit-merge <new-test-results> merged-test-results.xml
      #### Create test summary ####
      - name: "PR - Test Summary"
        id: pr_test_summary_create
        if: github.event_name == 'pull_request_target'
        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
        with:
          paths: "${{ github.workspace }}/merged-test-results.xml"
          output: test-summary.md
      - name: "PR - Upload Test Summary"
        id: pr_test_summary_upload
        if: github.event_name == 'pull_request_target'
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: test-summary
          path: test-summary.md
      #### Update PR request ####
      - name: "PR - Update comment"
        id: pr_update_comment
        if: github.event_name == 'pull_request_target'
        uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3.0.1
        with:
          filePath: test-summary.md
      ########################
      #### MANUAL TESTING ####
      ########################
      #### Run tests ####
      - name: "Manual - Run Tests: Prep"
        id: manual_run_tests
        working-directory: "${{ github.workspace }}"
        if: github.event_name == 'workflow_dispatch'
        run: |
          echo "[STEP] Running PyTest tests at 'GITHUB_WORKSPACE' path: ${{ github.workspace }}"
          #TODO Use this when collection errors are resolved
          # pytest -s -v -m "${PROVIDER_ID} and ${MODEL_ID}" --junitxml="${{ github.workspace }}/merged-test-results.xml"
          # (Optional) Add more tests here.
          # Merge test results with 'merged-test-results.xml' from above.
          # junit-merge <new-test-results> merged-test-results.xml
      #### Create test summary ####
      - name: "Manual - Test Summary"
        id: manual_test_summary
        if: always() && github.event_name == 'workflow_dispatch'
        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
        with:
          paths: "${{ github.workspace }}/merged-test-results.xml"
--- a/.github/workflows/install-script-ci.yml
+++ b/.github/workflows/install-script-ci.yml
@ -1,26 +0,0 @@
 # Lints install.sh with ShellCheck, then runs it end-to-end.
 name: Installer CI
 on:
  push:
    paths:
      - "install.sh"
  pull_request:
    paths:
      - "install.sh"
  schedule:
    - cron: "0 2 * * *"  # every day at 02:00 UTC
 jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Run ShellCheck on install.sh
        run: shellcheck install.sh
  # Runs only after the lint job has succeeded.
  smoke-test:
    runs-on: ubuntu-latest
    needs: lint
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Run installer end-to-end
        run: ./install.sh
--- a/.github/workflows/integration-auth-tests.yml
+++ b/.github/workflows/integration-auth-tests.yml
@ -1,132 +0,0 @@
 # Starts a Llama Stack server with token auth backed by a minikube cluster's
 # service-account tokens, then checks that an authenticated request succeeds.
 name: Integration Auth Tests
 on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    paths:
      - 'distributions/**'
      - 'llama_stack/**'
      - 'tests/integration/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'requirements.txt'
      - '.github/workflows/integration-auth-tests.yml' # This workflow
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  test-matrix:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        auth-provider: [oauth2_token]
      fail-fast: false # we want to run all tests regardless of failure
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Install dependencies
        uses: ./.github/actions/setup-runner
      - name: Build Llama Stack
        run: |
          llama stack build --template ollama --image-type venv
      - name: Install minikube
        # Gate on the same matrix value as the minikube steps below; the former
        # 'kubernetes' value is not in the matrix, so this step never ran.
        if: ${{ matrix.auth-provider == 'oauth2_token' }}
        uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052 # v0.0.19
      - name: Start minikube
        if: ${{ matrix.auth-provider == 'oauth2_token' }}
        run: |
          minikube start
          kubectl get pods -A
      - name: Configure Kube Auth
        if: ${{ matrix.auth-provider == 'oauth2_token' }}
        run: |
          kubectl create namespace llama-stack
          kubectl create serviceaccount llama-stack-auth -n llama-stack
          kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --serviceaccount=llama-stack:llama-stack-auth -n llama-stack
          kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
          cat <<EOF | kubectl apply -f -
          apiVersion: rbac.authorization.k8s.io/v1
          kind: ClusterRole
          metadata:
            name: allow-anonymous-openid
          rules:
          - nonResourceURLs: ["/openid/v1/jwks"]
            verbs: ["get"]
          ---
          apiVersion: rbac.authorization.k8s.io/v1
          kind: ClusterRoleBinding
          metadata:
            name: allow-anonymous-openid
          roleRef:
            apiGroup: rbac.authorization.k8s.io
            kind: ClusterRole
            name: allow-anonymous-openid
          subjects:
          - kind: User
            name: system:anonymous
            apiGroup: rbac.authorization.k8s.io
          EOF
      - name: Set Kubernetes Config
        if: ${{ matrix.auth-provider == 'oauth2_token' }}
        run: |
          # Values exported here are read back via the `env` context in the next step.
          echo "KUBERNETES_API_SERVER_URL=$(kubectl get --raw /.well-known/openid-configuration| jq -r .jwks_uri)" >> "$GITHUB_ENV"
          echo "KUBERNETES_CA_CERT_PATH=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}')" >> "$GITHUB_ENV"
          echo "KUBERNETES_ISSUER=$(kubectl get --raw /.well-known/openid-configuration| jq -r .issuer)" >> "$GITHUB_ENV"
          echo "KUBERNETES_AUDIENCE=$(kubectl create token llama-stack-auth -n llama-stack --duration=1h | cut -d. -f2 | base64 -d | jq -r '.aud[0]')" >> "$GITHUB_ENV"
      - name: Set Kube Auth Config and run server
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
        if: ${{ matrix.auth-provider == 'oauth2_token' }}
        run: |
          run_dir=$(mktemp -d)
          cat <<'EOF' > $run_dir/run.yaml
          version: '2'
          image_name: kube
          apis: []
          providers: {}
          server:
            port: 8321
          EOF
          yq eval '.server.auth = {"provider_type": "${{ matrix.auth-provider }}"}' -i $run_dir/run.yaml
          yq eval '.server.auth.config = {"tls_cafile": "${{ env.KUBERNETES_CA_CERT_PATH }}", "issuer": "${{ env.KUBERNETES_ISSUER }}", "audience": "${{ env.KUBERNETES_AUDIENCE }}"}' -i $run_dir/run.yaml
          yq eval '.server.auth.config.jwks = {"uri": "${{ env.KUBERNETES_API_SERVER_URL }}"}' -i $run_dir/run.yaml
          cat $run_dir/run.yaml
          nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &
      - name: Wait for Llama Stack server to be ready
        run: |
          echo "Waiting for Llama Stack server..."
          # Poll the health endpoint once per second, up to 30 attempts.
          for i in {1..30}; do
            if curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://localhost:8321/v1/health | grep -q "OK"; then
              echo "Llama Stack server is up!"
              if grep -q "Enabling authentication with provider: ${{ matrix.auth-provider }}" server.log; then
                echo "Llama Stack server is configured to use ${{ matrix.auth-provider }} auth"
                exit 0
              else
                echo "Llama Stack server is not configured to use ${{ matrix.auth-provider }} auth"
                cat server.log
                exit 1
              fi
            fi
            sleep 1
          done
          echo "Llama Stack server failed to start"
          cat server.log
          exit 1
      - name: Test auth
        run: |
          curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers|jq
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@ -1,120 +0,0 @@
 # Runs the integration suites against an Ollama-backed Llama Stack, once as
 # a library client and once over HTTP, for each listed Python version.
 name: Integration Tests
 on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
    paths:
      - "llama_stack/**"
      - "tests/integration/**"
      - "uv.lock"
      - "pyproject.toml"
      - "requirements.txt"
      - ".github/workflows/integration-tests.yml" # This workflow
 concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.ref }}
 jobs:
  test-matrix:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false # we want to run all tests regardless of failure
      matrix:
        # Listing tests manually since some of them currently fail
        # TODO: generate matrix list from tests/integration when fixed
        test-type:
          - agents
          - inference
          - datasets
          - inspect
          - scoring
          - post_training
          - providers
          - tool_runtime
        client-type:
          - library
          - http
        # Quoted so "3.10" is not read as the float 3.1.
        python-version:
          - "3.10"
          - "3.11"
          - "3.12"
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Install dependencies
        uses: ./.github/actions/setup-runner
        with:
          python-version: ${{ matrix.python-version }}
      - name: Setup ollama
        uses: ./.github/actions/setup-ollama
      - name: Build Llama Stack
        run: |
          uv run llama stack build --template ollama --image-type venv
      # The three steps below run only for the HTTP client variant.
      - name: Start Llama Stack server in background
        if: matrix.client-type == 'http'
        run: |
          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv --env OLLAMA_URL="http://0.0.0.0:11434" &
        env:
          INFERENCE_MODEL: meta-llama/Llama-3.2-3B-Instruct
      - name: Wait for Llama Stack server to be ready
        if: matrix.client-type == 'http'
        run: |
          echo "Waiting for Llama Stack server..."
          for i in {1..30}; do
            if curl -s http://localhost:8321/v1/health | grep -q "OK"; then
              echo "Llama Stack server is up!"
              exit 0
            fi
            sleep 1
          done
          echo "Llama Stack server failed to start"
          cat server.log
          exit 1
      - name: Verify Ollama status is OK
        if: matrix.client-type == 'http'
        run: |
          echo "Verifying Ollama status..."
          ollama_status=$(curl -s -L http://127.0.0.1:8321/v1/providers/ollama|jq --raw-output .health.status)
          echo "Ollama status: $ollama_status"
          if [ "$ollama_status" != "OK" ]; then
            echo "Ollama health check failed"
            exit 1
          fi
      - name: Check Storage and Memory Available Before Tests
        if: ${{ always() }}
        run: |
          free -h
          df -h
      - name: Run Integration Tests
        run: |
          if [ "${{ matrix.client-type }}" == "library" ]; then
            stack_config="ollama"
          else
            stack_config="http://localhost:8321"
          fi
          uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
            -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
            --text-model="meta-llama/Llama-3.2-3B-Instruct" \
            --embedding-model=all-MiniLM-L6-v2
        env:
          OLLAMA_URL: http://0.0.0.0:11434
          INFERENCE_MODEL: meta-llama/Llama-3.2-3B-Instruct
      - name: Check Storage and Memory Available After Tests
        if: ${{ always() }}
        run: |
          free -h
          df -h
      # Log collection runs even when earlier steps failed.
      - name: Write ollama logs to file
        if: ${{ always() }}
        run: |
          sudo docker logs ollama > ollama.log
      - name: Upload all logs to artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          retention-days: 1
          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}
          path: |
            *.log
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -1,45 +0,0 @@
 name: Pre-commit
 on:
  pull_request:
  push:
    branches: [main]
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Set up Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
          python-version: '3.11'
          cache: pip
          cache-dependency-path: |
            **/requirements*.txt
            .pre-commit-config.yaml
      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
        env:
          SKIP: no-commit-to-branch
          RUFF_OUTPUT_FORMAT: github
      - name: Verify if there are any diff files after pre-commit
        run: |
          git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
      - name: Verify if there are any new files after pre-commit
        run: |
          unstaged_files=$(git ls-files --others --exclude-standard)
          if [ -n "$unstaged_files" ]; then
            echo "There are uncommitted new files, run pre-commit locally and commit again"
            echo "$unstaged_files"
            exit 1
          fi
--- a/.github/workflows/providers-build.yml
+++ b/.github/workflows/providers-build.yml
@ -1,149 +0,0 @@
 name: Test Llama Stack Build
 on:
  push:
    branches:
      - main
    paths:
      - 'llama_stack/cli/stack/build.py'
      - 'llama_stack/cli/stack/_build.py'
      - 'llama_stack/distribution/build.*'
      - 'llama_stack/distribution/*.sh'
      - '.github/workflows/providers-build.yml'
      - 'llama_stack/templates/**'
  pull_request:
    paths:
      - 'llama_stack/cli/stack/build.py'
      - 'llama_stack/cli/stack/_build.py'
      - 'llama_stack/distribution/build.*'
      - 'llama_stack/distribution/*.sh'
      - '.github/workflows/providers-build.yml'
      - 'llama_stack/templates/**'
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
      templates: ${{ steps.set-matrix.outputs.templates }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Generate Template List
        id: set-matrix
        # Builds the list of template names for the build matrix:
        #   1. `ls` lists every file matching *build.yaml one directory below llama_stack/templates/.
        #   2. `awk` keeps the second-to-last path component, i.e. the template directory name.
        #   3. `jq` reads the raw lines as one string, splits on newlines, drops the trailing
        #      empty element left by the final newline, and prints a compact JSON array.
        # The array is written to the step output `templates`.
        run: |
          templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
          echo "templates=$templates" >> "$GITHUB_OUTPUT"
  build:
    needs: generate-matrix
    runs-on: ubuntu-latest
    strategy:
      matrix:
        template: ${{ fromJson(needs.generate-matrix.outputs.templates) }}
        image-type: [venv, container]
      fail-fast: false # We want to run all jobs even if some fail
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Install dependencies
        uses: ./.github/actions/setup-runner
      - name: Print build dependencies
        run: |
          uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only
      - name: Run Llama Stack Build
        run: |
          # USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
          # LLAMA_STACK_DIR is set to the current directory so we are building from the source
          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test
      - name: Print dependencies in the image
        if: matrix.image-type == 'venv'
        run: |
          uv pip list
  build-single-provider:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Install dependencies
        uses: ./.github/actions/setup-runner
      - name: Build a single provider
        run: |
          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --image-type venv --image-name test --providers inference=remote::ollama
  build-custom-container-distribution:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Install dependencies
        uses: ./.github/actions/setup-runner
      - name: Build a custom container distribution
        # Rewrites the starter build config in place so it produces a container image
        # named "test", then builds from that edited config file.
        run: |
          yq -i '.image_type = "container"' llama_stack/templates/starter/build.yaml
          yq -i '.image_name = "test"' llama_stack/templates/starter/build.yaml
          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/starter/build.yaml
      - name: Inspect the container image entrypoint
        run: |
          IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
          entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
          echo "Entrypoint: $entrypoint"
          if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
            echo "Entrypoint is not correct"
            exit 1
          fi
  build-ubi9-container-distribution:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Install dependencies
        uses: ./.github/actions/setup-runner
      - name: Pin template to UBI9 base
        run: |
          yq -i '
            .image_type    = "container" |
            .image_name    = "ubi9-test" |
            .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
          ' llama_stack/templates/starter/build.yaml
      - name: Build dev container (UBI9)
        env:
          USE_COPY_NOT_MOUNT: "true"
          LLAMA_STACK_DIR: "."
        run: |
          uv run llama stack build --config llama_stack/templates/starter/build.yaml
      - name: Inspect UBI9 image
        run: |
          IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
          entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
          echo "Entrypoint: $entrypoint"
          if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
            echo "Entrypoint is not correct"
            exit 1
          fi
          echo "Checking /etc/os-release in $IMAGE_ID"
          docker run --rm --entrypoint sh "$IMAGE_ID" -c \
              'source /etc/os-release && echo "$ID"' \
              | grep -qE '^(rhel|ubi)$' \
              || { echo "Base image is not UBI 9!"; exit 1; }
--- a/.github/workflows/semantic-pr.yml
+++ b/.github/workflows/semantic-pr.yml
@ -1,25 +0,0 @@
 name: Check semantic PR titles
 on:
  pull_request_target:
    types:
      - opened
      - edited
      - reopened
      - synchronize
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
 permissions:
  contents: read
 jobs:
  title-check:
    runs-on: ubuntu-latest
    steps:
      - name: Check PR Title's semantic conformance
        uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/stale_bot.yml
+++ b/.github/workflows/stale_bot.yml
@ -1,45 +0,0 @@
 name: Close stale issues and PRs
 on:
  schedule:
    - cron: '0 0 * * *' # every day at midnight
 env:
  LC_ALL: en_US.UTF-8
 defaults:
  run:
    shell: bash
 permissions:
  contents: read
 jobs:
  stale:
    permissions:
      issues: write
      pull-requests: write
    runs-on: ubuntu-latest
    steps:
      - name: Stale Action
        uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
        with:
          stale-issue-label: 'stale'
          stale-issue-message: >
            This issue has been automatically marked as stale because it has not had activity within 60 days.
            It will be automatically closed if no further activity occurs within 30 days.
          close-issue-message: >
            This issue has been automatically closed due to inactivity.
            Please feel free to reopen if you feel it is still relevant!
          days-before-issue-stale: 60
          days-before-issue-close: 30
          stale-pr-label: 'stale'
          stale-pr-message: >
            This pull request has been automatically marked as stale because it has not had activity within 60 days.
            It will be automatically closed if no further activity occurs within 30 days.
          close-pr-message: >
            This pull request has been automatically closed due to inactivity.
            Please feel free to reopen if you intend to continue working on it!
          days-before-pr-stale: 60
          days-before-pr-close: 30
          operations-per-run: 300
--- a/.github/workflows/test-external-providers.yml
+++ b/.github/workflows/test-external-providers.yml
@ -1,71 +0,0 @@
 name: Test External Providers
 on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    paths:
      - 'llama_stack/**'
      - 'tests/integration/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'requirements.txt'
      - '.github/workflows/test-external-providers.yml' # This workflow
 jobs:
  test-external-providers:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        image-type: [venv]
        # We don't do container yet, it's tricky to install a package from the host into the
        # container and point 'uv pip install' to the correct path...
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Install dependencies
        uses: ./.github/actions/setup-runner
      - name: Apply image type to config file
        run: |
          yq -i '.image_type = "${{ matrix.image-type }}"' tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
          cat tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
      - name: Setup directory for Ollama custom provider
        run: |
          mkdir -p tests/external-provider/llama-stack-provider-ollama/src/
          cp -a llama_stack/providers/remote/inference/ollama/ tests/external-provider/llama-stack-provider-ollama/src/llama_stack_provider_ollama
      - name: Create provider configuration
        run: |
          mkdir -p /home/runner/.llama/providers.d/remote/inference
          cp tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml
      - name: Build distro from config file
        run: |
          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
      - name: Start Llama Stack server in background
        # The whole comparison has to live in a single expression. If only the matrix
        # value is wrapped in expression braces and the `== 'venv'` part is left outside,
        # the condition becomes a non-empty string, which is always truthy, so the
        # step would run for every image type.
        if: matrix.image-type == 'venv'
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
        # Prints the installed packages, then launches the server detached with its
        # stdout and stderr redirected to server.log.
        run: |
          uv run pip list
          nohup uv run --active llama stack run tests/external-provider/llama-stack-provider-ollama/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
      - name: Wait for Llama Stack server to be ready
        # Polls server.log up to 30 times, sleeping one second between attempts, for the
        # log line that names the custom_ollama provider file. Exits 0 as soon as the line
        # appears; if it never does, dumps server.log and fails the step.
        run: |
          for i in {1..30}; do
            if ! grep -q "remote::custom_ollama from /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml" server.log; then
              echo "Waiting for Llama Stack server to load the provider..."
              sleep 1
            else
              echo "Provider loaded"
              exit 0
            fi
          done
          echo "Provider failed to load"
          cat server.log
          exit 1
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@ -1,69 +0,0 @@
 name: auto-tests
 on:
  # pull_request:
  workflow_dispatch:
    inputs:
      commit_sha:
        description: 'Specific Commit SHA to trigger on'
        required: false
        # Input defaults are literal strings: `$GITHUB_SHA` is never expanded here, so
        # it would be passed to actions/checkout as a ref literally named "$GITHUB_SHA".
        # An empty default lets checkout fall back to the commit that triggered the run.
        default: ''
 jobs:
  test-llama-stack-as-library:
    runs-on: ubuntu-latest
    env:
      TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
      FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
      TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
    strategy:
      matrix:
        provider: [fireworks, together]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: ${{ github.event.inputs.commit_sha }}
      - name: Echo commit SHA
        run: |
          echo "Triggered on commit SHA: ${{ github.event.inputs.commit_sha }}"
          git rev-parse HEAD
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt pytest
          pip install -e .
      - name: Build providers
        run: |
          llama stack build --template ${{ matrix.provider }} --image-type venv
      - name: Install the latest llama-stack-client & llama-models packages
        run: |
          pip install -e git+https://github.com/meta-llama/llama-stack-client-python.git#egg=llama-stack-client
          pip install -e git+https://github.com/meta-llama/llama-models.git#egg=llama-models
      - name: Run client-sdk test
        working-directory: "${{ github.workspace }}"
        env:
          REPORT_OUTPUT: md_report.md
        shell: bash
        run: |
          pip install --upgrade pytest-md-report
          echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV"
          export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
          LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT"
      - name: Output reports to the job summary
        if: always()
        shell: bash
        run: |
          if [ -f "$REPORT_FILE" ]; then
            echo "<details><summary> Test Report for ${{ matrix.provider }} </summary>" >> $GITHUB_STEP_SUMMARY
            echo "" >> $GITHUB_STEP_SUMMARY
            cat "$REPORT_FILE" >> $GITHUB_STEP_SUMMARY
            echo "" >> $GITHUB_STEP_SUMMARY
            echo "</details>" >> $GITHUB_STEP_SUMMARY
          fi
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@ -1,52 +0,0 @@
 name: Unit Tests
 on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    paths:
      - 'llama_stack/**'
      - 'tests/unit/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'requirements.txt'
      - '.github/workflows/unit-tests.yml' # This workflow
  workflow_dispatch:
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  unit-tests:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python:
          - "3.10"
          - "3.11"
          - "3.12"
          - "3.13"
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Install dependencies
        uses: ./.github/actions/setup-runner
      - name: Run unit tests
        run: |
          PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --cov=llama_stack --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }}
      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: test-results-${{ matrix.python }}
          path: |
            .pytest_cache/
            pytest-report-${{ matrix.python }}.xml
            htmlcov-${{ matrix.python }}/
          retention-days: 7
--- a/.github/workflows/update-readthedocs.yml
+++ b/.github/workflows/update-readthedocs.yml
@ -1,68 +0,0 @@
 name: Update ReadTheDocs
 on:
  workflow_dispatch:
    inputs:
      branch:
        description: 'RTD version to update'
        required: false
        default: 'latest'
  push:
    branches:
      - main
    paths:
      - 'docs/**'
      - 'pyproject.toml'
      - '.github/workflows/update-readthedocs.yml'
    tags:
      - '*'
  pull_request:
    branches:
      - main
    paths:
      - 'docs/**'
      - 'pyproject.toml'
      - '.github/workflows/update-readthedocs.yml'
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  update-readthedocs:
    runs-on: ubuntu-latest
    env:
      TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Install dependencies
        uses: ./.github/actions/setup-runner
      - name: Build HTML
        run: |
          cd docs
          uv run make html
      - name: Trigger ReadTheDocs build
        # Skipped for pull requests; every other trigger of this workflow calls the webhook.
        if: github.event_name != 'pull_request'
        # Fails fast when the token is missing, posts the token and the current ref name
        # to the Read the Docs webhook, and fails unless the response reports
        # build_triggered == true.
        run: |
          if [ -z "$TOKEN" ]; then
            echo "READTHEDOCS_TOKEN is not set"
            exit 1
          fi
          response=$(curl -X POST \
            -H "Content-Type: application/json" \
            -d "{
              \"token\": \"$TOKEN\",
              \"version\": \"$GITHUB_REF_NAME\"
            }" \
            https://readthedocs.org/api/v2/webhook/llama-stack/289768/)
          echo "Response: $response"
          # Both expansions are quoted on purpose. Unquoted, an empty or non-JSON
          # response collapses the test to `[ != 'true' ]`, which is a syntax error
          # (status 2); the `if` body is then skipped and the step passes silently.
          if [ "$(echo "$response" | jq -r '.build_triggered')" != 'true' ]; then
            echo "Failed to trigger ReadTheDocs build"
            exit 1
          fi
--- a/.gitignore
+++ b/.gitignore
@ -1,26 +0,0 @@
 .env
 __pycache__
 dist
 *.egg-info
 dev_requirements.txt
 build
 .DS_Store
 llama_stack/configs/*
 .cursor/
 xcuserdata/
 *.hmap
 .DS_Store
 .build/
 Package.resolved
 *.pte
 *.ipynb_checkpoints*
 .idea
 .venv/
 .vscode
 _build
 docs/src
 pyrightconfig.json
 venv/
 pytest-report.xml
 .coverage
 .python-version
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,118 +0,0 @@
 exclude: 'build/'
 default_language_version:
    python: python3
 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0  # Latest stable version
    hooks:
    -   id: check-merge-conflict
        args: ['--assume-in-merge']
    -   id: trailing-whitespace
        exclude: '\.py$'  # Exclude Python files as Ruff already handles them
    -   id: check-added-large-files
        args: ['--maxkb=1000']
    -   id: end-of-file-fixer
        exclude: '^(.*\.svg)$'
    -   id: no-commit-to-branch
    -   id: check-yaml
        args: ["--unsafe"]
    -   id: detect-private-key
    -   id: requirements-txt-fixer
    -   id: mixed-line-ending
        args: [--fix=lf] # Forces to replace line ending by LF (line feed)
    -   id: check-executables-have-shebangs
    -   id: check-json
    -   id: check-shebang-scripts-are-executable
    -   id: check-symlinks
    -   id: check-toml
 -   repo: https://github.com/Lucas-C/pre-commit-hooks
    rev: v1.5.4
    hooks:
    -   id: insert-license
        files: \.py$|\.sh$
        args:
          - --license-filepath
          - docs/license_header.txt
 -   repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.9.4
    hooks:
    -   id: ruff
        args: [ --fix ]
        exclude: ^llama_stack/strong_typing/.*$
    -   id: ruff-format
 -   repo: https://github.com/adamchainz/blacken-docs
    rev: 1.19.0
    hooks:
    -   id: blacken-docs
        additional_dependencies:
        - black==24.3.0
 -   repo: https://github.com/astral-sh/uv-pre-commit
    rev: 0.7.8
    hooks:
    -   id: uv-lock
    -   id: uv-export
        args: [
            "--frozen",
            "--no-hashes",
            "--no-emit-project",
            "--no-default-groups",
            "--output-file=requirements.txt"
        ]
 -   repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.15.0
    hooks:
    -   id: mypy
        additional_dependencies:
          - uv==0.6.2
          - mypy
          - pytest
          - rich
          - types-requests
          - pydantic
        pass_filenames: false
 # - repo: https://github.com/tcort/markdown-link-check
 #   rev: v3.11.2
 #   hooks:
 #     - id: markdown-link-check
 #       args: ['--quiet']
 -   repo: local
    hooks:
      - id: distro-codegen
        name: Distribution Template Codegen
        additional_dependencies:
          - uv==0.7.8
        entry: uv run --group codegen ./scripts/distro_codegen.py
        language: python
        pass_filenames: false
        require_serial: true
        files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
      - id: openapi-codegen
        name: API Spec Codegen
        additional_dependencies:
          - uv==0.7.8
        entry: sh -c 'uv run ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
        language: python
        pass_filenames: false
        require_serial: true
        files: ^llama_stack/apis/|^docs/openapi_generator/
      - id: check-workflows-use-hashes
        name: Check GitHub Actions use SHA-pinned actions
        entry: ./scripts/check-workflows-use-hashes.sh
        language: system
        pass_filenames: false
        require_serial: true
        always_run: true
        files: ^\.github/workflows/.*\.ya?ml$
 ci:
    autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
    autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@ -1,25 +0,0 @@
 # .readthedocs.yaml
 # Read the Docs configuration file
 # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 # Required
 version: 2
 # Build documentation in the "docs/" directory with Sphinx
 sphinx:
  configuration: docs/source/conf.py
 # Set the OS, Python version and other tools you might need
 build:
  os: ubuntu-22.04
  tools:
    python: "3.12"
  jobs:
    pre_create_environment:
      - asdf plugin add uv
      - asdf install uv latest
      - asdf global uv latest
    create_environment:
      - uv venv "${READTHEDOCS_VIRTUALENV_PATH}"
    install:
      - UV_PROJECT_ENVIRONMENT="${READTHEDOCS_VIRTUALENV_PATH}" uv sync --frozen --group docs
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,531 +0,0 @@
 # Changelog
 # v0.2.10.1
 Published on: 2025-06-06T20:11:02Z
 ## Highlights
 * ChromaDB provider fix
 ---
 # v0.2.10
 Published on: 2025-06-05T23:21:45Z
 ## Highlights
 * OpenAI-compatible embeddings API
 * OpenAI-compatible Files API
 * Postgres support in starter distro
 * Enable ingestion of precomputed embeddings
 * Full multi-turn support in Responses API
 * Fine-grained access control policy
 ---
 # v0.2.9
 Published on: 2025-05-30T20:01:56Z
 ## Highlights
 * Added initial streaming support in Responses API
 * UI view for Responses
 * Postgres inference store support
 ---
 # v0.2.8
 Published on: 2025-05-27T21:03:47Z
 # Release v0.2.8
 ## Highlights
 * Server-side MCP with auth firewalls now works in the Stack - both for Agents and Responses
 * Get chat completions APIs and UI to show chat completions
 * Enable keyword search for sqlite-vec
 ---
 # v0.2.7
 Published on: 2025-05-16T20:38:10Z
 ## Highlights
 This is a small update. But a couple highlights:
 * feat: function tools in OpenAI Responses by @bbrowning in https://github.com/meta-llama/llama-stack/pull/2094, getting closer to ready. Streaming is the next missing piece.
 * feat: Adding support for customizing chunk context in RAG insertion and querying by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/2134
 * feat: scaffolding for Llama Stack UI by @ehhuang in https://github.com/meta-llama/llama-stack/pull/2149, more to come in the coming releases.
 ---
 # v0.2.6
 Published on: 2025-05-12T18:06:52Z
 ---
 # v0.2.5
 Published on: 2025-05-04T20:16:49Z
 ---
 # v0.2.4
 Published on: 2025-04-29T17:26:01Z
 ## Highlights
 * One-liner to install and run Llama Stack yay! by @reluctantfuturist in https://github.com/meta-llama/llama-stack/pull/1383
 * support for NVIDIA NeMo datastore by @raspawar in https://github.com/meta-llama/llama-stack/pull/1852
 * (yuge!) Kubernetes authentication by @leseb in https://github.com/meta-llama/llama-stack/pull/1778
 * (yuge!) OpenAI Responses API by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1989
 * add api.llama provider, llama-guard-4 model by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2058
 ---
 # v0.2.3
 Published on: 2025-04-25T22:46:21Z
 ## Highlights
 * OpenAI compatible inference endpoints and client-SDK support. `client.chat.completions.create()` now works.
 * significant improvements and functionality added to the NVIDIA distribution
 * many improvements to the test verification suite.
 * new inference providers: Ramalama, IBM WatsonX
 * many improvements to the Playground UI
 ---
 # v0.2.2
 Published on: 2025-04-13T01:19:49Z
 ## Main changes
 - Bring Your Own Provider (@leseb) - use out-of-tree provider code to execute the distribution server
 - OpenAI compatible inference API in progress (@bbrowning)
 - Provider verifications (@ehhuang)
 - Many updates and fixes to playground
 - Several llama4 related fixes
 ---
 # v0.2.1
 Published on: 2025-04-05T23:13:00Z
 ---
 # v0.2.0
 Published on: 2025-04-05T19:04:29Z
 ## Llama 4 Support
 Check out more at https://www.llama.com
 ---
 # v0.1.9
 Published on: 2025-03-29T00:52:23Z
 ### Build and Test Agents
 * Agents: Entire document context with attachments
 * RAG: Documentation with sqlite-vec faiss comparison
 * Getting started: Fixes to getting started notebook.
 ### Agent Evals and Model Customization
 * (**New**) Post-training: Add nemo customizer
 ### Better Engineering
 * Moved sqlite-vec to non-blocking calls
 * Don't return a payload on file delete
 ---
 # v0.1.8
 Published on: 2025-03-24T01:28:50Z
 # v0.1.8 Release Notes
 ### Build and Test Agents
 * Safety: Integrated NVIDIA as a safety provider.
 * VectorDB: Added Qdrant as an inline provider.
 * Agents: Added support for multiple tool groups in agents.
 * Agents: Simplified imports for Agents in client package
 ### Agent Evals and Model Customization
 * Introduced DocVQA and IfEval benchmarks.
 ### Deploying and Monitoring Agents
 * Introduced a Containerfile and image workflow for the Playground.
 * Implemented support for Bearer (API Key) authentication.
 * Added attribute-based access control for resources.
 * Fixes on docker deployments: use --pull always and standardized the default port to 8321
 * Deprecated: /v1/inspect/providers use /v1/providers/ instead
 ### Better Engineering
 * Consolidated scripts under the ./scripts directory.
 * Addressed mypy violations in various modules.
 * Added Dependabot scans for Python dependencies.
 * Implemented a scheduled workflow to update the changelog automatically.
 * Enforced concurrency to reduce CI loads.
 ### New Contributors
 * @cmodi-meta made their first contribution in https://github.com/meta-llama/llama-stack/pull/1650
 * @jeffmaury made their first contribution in https://github.com/meta-llama/llama-stack/pull/1671
 * @derekhiggins made their first contribution in https://github.com/meta-llama/llama-stack/pull/1698
 * @Bobbins228 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1745
 **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.7...v0.1.8
 ---
 # v0.1.7
 Published on: 2025-03-14T22:30:51Z
 ## 0.1.7 Release Notes
 ###  Build and Test Agents
 * Inference: ImageType is now refactored to LlamaStackImageType
 * Inference: Added tests to measure TTFT
 * Inference: Bring back usage metrics
 * Agents: Added endpoint for get agent, list agents and list sessions
 * Agents: Automated conversion of type hints in client tool for lite llm format
 * Agents: Deprecated ToolResponseMessage in agent.resume API
 * Added Provider API for listing and inspecting provider info
 ### Agent Evals and Model Customization
 * Eval: Added new eval benchmarks Math 500 and BFCL v3
 ### Deploy and Monitoring of Agents
 * Telemetry: Fix tracing to work across coroutines
 ###  Better Engineering
 * Display code coverage for unit tests
 * Updated call sites (inference, tool calls, agents) to move to async non blocking calls
 * Unit tests also run on Python 3.11, 3.12, and 3.13
 * Added ollama inference to Integration tests CI
 * Improved documentation (examples, testing, CLI, updated providers table)
 ---
 # v0.1.6
 Published on: 2025-03-08T04:35:08Z
 ## 0.1.6 Release Notes
 ### Build and Test Agents
 * Inference: Fixed support for inline vllm provider
 * (**New**) Agent: Build & Monitor Agent Workflows with Llama Stack + Anthropic's Best Practice [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb)
 * (**New**) Agent: Revamped agent [documentation](https://llama-stack.readthedocs.io/en/latest/building_applications/agent.html) with more details and examples
 * Agent: Unify tools and Python SDK Agents API
 * Agent: AsyncAgent Python SDK wrapper supporting async client tool calls
 * Agent: Support python functions without @client_tool decorator as client tools
 * Agent: deprecation for allow_resume_turn flag, and remove need to specify tool_prompt_format
 * VectorIO: MilvusDB support added
 ### Agent Evals and Model Customization
 * (**New**) Agent: Llama Stack RAG Lifecycle [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb)
 * Eval: Documentation for eval, scoring, adding new benchmarks
 * Eval: Distribution template to run benchmarks on llama & non-llama models
 * Eval: Ability to register new custom LLM-as-judge scoring functions
 * (**New**) Looking for contributors for open benchmarks. See [documentation](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) for details.
 ### Deploy and Monitoring of Agents
 * Better support for different log levels across all components for better monitoring
 ### Better Engineering
 * Enhance OpenAPI spec to include Error types across all APIs
 * Moved all tests to /tests and created unit tests to run on each PR
 * Removed all dependencies on llama-models repo
 ---
 # v0.1.5.1
 Published on: 2025-02-28T22:37:44Z
 ## 0.1.5.1 Release Notes
 * Fixes for security risk in https://github.com/meta-llama/llama-stack/pull/1327 and https://github.com/meta-llama/llama-stack/pull/1328
 **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.5...v0.1.5.1
 ---
 # v0.1.5
 Published on: 2025-02-28T18:14:01Z
 ## 0.1.5 Release Notes
 ###  Build Agents
 * Inference: Support more non-llama models (openai, anthropic, gemini)
 * Inference: Can use the provider's model name in addition to the HF alias
 * Inference: Fixed issues with calling tools that weren't specified in the prompt
 * RAG: Improved system prompt for RAG and no more need for hard-coded rag-tool calling
 * Embeddings: Added support for Nemo retriever embedding models
 * Tools: Added support for MCP tools in Ollama Distribution
 * Distributions: Added new Groq distribution
 ### Customize Models
 * Save post-trained checkpoint in SafeTensor format to allow Ollama inference provider to use the post-trained model
 ### Monitor agents
 * More comprehensive logging of agent steps including client tools
 * Telemetry inputs/outputs are now structured and queryable
 * Ability to retrieve agents session, turn, step by ids
 ### Better Engineering
 * Moved executorch Swift code out of this repo into the llama-stack-client-swift repo, similar to kotlin
 * Move most logging to use logger instead of prints
 * Completed text /chat-completion and /completion tests
 ---
 # v0.1.4
 Published on: 2025-02-25T00:02:43Z
 ## v0.1.4 Release Notes
 Here are the key changes coming as part of this release:
 ### Build and Test Agents
 * Inference: Added support for non-llama models
 * Inference: Added option to list all downloaded models and remove models
 * Agent: Introduce new api agents.resume_turn to include client side tool execution in the same turn
 * Agent: AgentConfig introduces new variable “tool_config” that allows for better tool configuration and system prompt overrides
 * Agent: Added logging for agent step start and completion times
 * Agent: Added support for logging for tool execution metadata
 * Embedding: Updated /inference/embeddings to support asymmetric models, truncation and variable sized outputs
 * Embedding: Updated embedding models for Ollama, Together, and Fireworks with available defaults
 * VectorIO: Improved performance of sqlite-vec using chunked writes
 ### Agent Evals and Model Customization
 * Deprecated api /eval-tasks. Use /eval/benchmark  instead
 * Added CPU training support for TorchTune
 ### Deploy and Monitoring of Agents
 * Consistent view of client and server tool calls in telemetry
 ### Better Engineering
 * Made tests more data-driven for consistent evaluation
 * Fixed documentation links and improved API reference generation
 * Various small fixes for build scripts and system reliability
 ---
 # v0.1.3
 Published on: 2025-02-14T20:24:32Z
 ## v0.1.3 Release
 Here are some key changes that are coming as part of this release.
 ### Build and Test Agents
 Streamlined the initial development experience
 - Added support for  llama stack run --image-type venv
 - Enhanced vector store options with new sqlite-vec provider and improved Qdrant integration
 - vLLM improvements for tool calling and logprobs
 - Better handling of sporadic code_interpreter tool calls
 ### Agent Evals
 Better benchmarking and Agent performance assessment
 - Renamed eval API /eval-task to /benchmarks
 - Improved documentation and notebooks for RAG and evals
 ### Deploy and Monitoring of Agents
 Improved production readiness
 - Added usage metrics collection for chat completions
 - CLI improvements for provider information
 - Improved error handling and system reliability
 - Better model endpoint handling and accessibility
 - Improved signal handling on distro server
 ### Better Engineering
 Infrastructure and code quality improvements
 - Faster text-based chat completion tests
 - Improved testing for non-streaming agent apis
 - Standardized import formatting with ruff linter
 - Added conventional commits standard
 - Fixed documentation parsing issues
 ---
 # v0.1.2
 Published on: 2025-02-07T22:06:49Z
 # TL;DR
 - Several stabilizations to development flows after the switch to `uv`
 - Migrated CI workflows to new OSS repo - [llama-stack-ops](https://github.com/meta-llama/llama-stack-ops)
 - Added automated rebuilds for ReadTheDocs
 - Llama Stack server supports HTTPS
 - Added system prompt overrides support
 - Several bug fixes and improvements to documentation (check out Kubernetes deployment guide by @terrytangyuan )
 ---
 # v0.1.1
 Published on: 2025-02-02T02:29:24Z
 A bunch of small / big improvements everywhere including support for Windows, switching to `uv` and many provider improvements.
 ---
 # v0.1.0
 Published on: 2025-01-24T17:47:47Z
 We are excited to announce a stable API release of Llama Stack, which enables developers to build RAG applications and Agents using tools and safety shields, monitor those agents with telemetry, and evaluate them with scoring functions.
 ## Context
 GenAI application developers need more than just an LLM - they need to integrate tools, connect with their data sources, establish guardrails, and ground the LLM responses effectively. Currently, developers must piece together various tools and APIs, complicating the development lifecycle and increasing costs. The result is that developers are spending more time on these integrations rather than focusing on the application logic itself. The bespoke coupling of components also makes it challenging to adopt state-of-the-art solutions in the rapidly evolving GenAI space. This is particularly difficult for open models like Llama, as best practices are not widely established in the open.
 Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs by both AI developers and from partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety.
 With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and perform custom tool calling. You can use telemetry to inspect those traces, and convert telemetry into evals datasets. And with Llama Stack’s plugin architecture and prepackaged distributions, you can choose to run your agent anywhere - in the cloud with our partners, deploy your own environment using virtualenv, conda, or Docker, operate locally with Ollama, or even run on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.
 ## Release
 After iterating on the APIs for the last 3 months, today we’re launching a stable release (V1) of the Llama Stack APIs and the corresponding llama-stack server and client packages (v0.1.0). We now have automated tests for providers. These tests make sure that all provider implementations are verified. Developers can now easily and reliably select distributions or providers based on their specific requirements.
 There are example standalone apps in llama-stack-apps.
 ## Key Features of this release
 - **Unified API Layer**
  - Inference: Run LLM models
  - RAG: Store and retrieve knowledge for RAG
  - Agents: Build multi-step agentic workflows
  - Tools: Register tools that can be called by the agent
  - Safety: Apply content filtering and safety policies
  - Evaluation: Test model and agent quality
  - Telemetry: Collect and analyze usage data and complex agentic traces
  - Post Training ( Coming Soon ): Fine tune models for specific use cases
 - **Rich Provider Ecosystem**
  - Local Development: Meta's Reference, Ollama
  - Cloud: Fireworks, Together, Nvidia, AWS Bedrock, Groq, Cerebras
  - On-premises: Nvidia NIM, vLLM, TGI, Dell-TGI
  - On-device: iOS and Android support
 - **Built for Production**
  - Pre-packaged distributions for common deployment scenarios
  - Backwards compatibility across model versions
  - Comprehensive evaluation capabilities
  - Full observability and monitoring
 - **Multiple developer interfaces**
  - CLI: Command line interface
  - Python SDK
  - Swift iOS SDK
  - Kotlin Android SDK
 - **Sample llama stack applications**
  - Python
  - iOS
  - Android
 ---
 # v0.1.0rc12
 Published on: 2025-01-22T22:24:01Z
 ---
 # v0.0.63
 Published on: 2024-12-18T07:17:43Z
 A small but important bug-fix release to update the URL datatype for the client-SDKs. The issue affected multimodal agentic turns especially.
 **Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.62...v0.0.63
 ---
 # v0.0.62
 Published on: 2024-12-18T02:39:43Z
 ---
 # v0.0.61
 Published on: 2024-12-10T20:50:33Z
 ---
 # v0.0.55
 Published on: 2024-11-23T17:14:07Z
 ---
 # v0.0.54
 Published on: 2024-11-22T00:36:09Z
 ---
 # v0.0.53
 Published on: 2024-11-20T22:18:00Z
 🚀  Initial Release Notes for Llama Stack!
 ### Added
 - Resource-oriented design for models, shields, memory banks, datasets and eval tasks
 - Persistence for registered objects with distribution
 - Ability to persist memory banks created for FAISS
 - PostgreSQL KVStore implementation
 - Environment variable placeholder support in run.yaml files
 - Comprehensive Zero-to-Hero notebooks and quickstart guides
 - Support for quantized models in Ollama
 - Vision models support for Together, Fireworks, Meta-Reference, Ollama, and vLLM
 - Bedrock distribution with safety shields support
 - Evals API with task registration and scoring functions
 - MMLU and SimpleQA benchmark scoring functions
 - Huggingface dataset provider integration for benchmarks
 - Support for custom dataset registration from local paths
 - Benchmark evaluation CLI tools with visualization tables
 - RAG evaluation scoring functions and metrics
 - Local persistence for datasets and eval tasks
 ### Changed
 - Split safety into distinct providers (llama-guard, prompt-guard, code-scanner)
 - Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`)
 - Updated API signatures for dataset and eval task registration
 - Restructured folder organization for providers
 - Enhanced Docker build configuration
 - Added version prefixing for REST API routes
 - Enhanced evaluation task registration workflow
 - Improved benchmark evaluation output formatting
 - Restructured evals folder organization for better modularity
 ### Removed
 - `llama stack configure` command
 ---
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@ -1,80 +0,0 @@
 # Code of Conduct
 ## Our Pledge
 In the interest of fostering an open and welcoming environment, we as
 contributors and maintainers pledge to make participation in our project and
 our community a harassment-free experience for everyone, regardless of age, body
 size, disability, ethnicity, sex characteristics, gender identity and expression,
 level of experience, education, socio-economic status, nationality, personal
 appearance, race, religion, or sexual identity and orientation.
 ## Our Standards
 Examples of behavior that contributes to creating a positive environment
 include:
 * Using welcoming and inclusive language
 * Being respectful of differing viewpoints and experiences
 * Gracefully accepting constructive criticism
 * Focusing on what is best for the community
 * Showing empathy towards other community members
 Examples of unacceptable behavior by participants include:
 * The use of sexualized language or imagery and unwelcome sexual attention or
 advances
 * Trolling, insulting/derogatory comments, and personal or political attacks
 * Public or private harassment
 * Publishing others' private information, such as a physical or electronic
 address, without explicit permission
 * Other conduct which could reasonably be considered inappropriate in a
 professional setting
 ## Our Responsibilities
 Project maintainers are responsible for clarifying the standards of acceptable
 behavior and are expected to take appropriate and fair corrective action in
 response to any instances of unacceptable behavior.
 Project maintainers have the right and responsibility to remove, edit, or
 reject comments, commits, code, wiki edits, issues, and other contributions
 that are not aligned to this Code of Conduct, or to ban temporarily or
 permanently any contributor for other behaviors that they deem inappropriate,
 threatening, offensive, or harmful.
 ## Scope
 This Code of Conduct applies within all project spaces, and it also applies when
 an individual is representing the project or its community in public spaces.
 Examples of representing a project or community include using an official
 project e-mail address, posting via an official social media account, or acting
 as an appointed representative at an online or offline event. Representation of
 a project may be further defined and clarified by project maintainers.
 This Code of Conduct also applies outside the project spaces when there is a
 reasonable belief that an individual's behavior may have a negative impact on
 the project or its community.
 ## Enforcement
 Instances of abusive, harassing, or otherwise unacceptable behavior may be
 reported by contacting the project team at <opensource-conduct@meta.com>. All
 complaints will be reviewed and investigated and will result in a response that
 is deemed necessary and appropriate to the circumstances. The project team is
 obligated to maintain confidentiality with regard to the reporter of an incident.
 Further details of specific enforcement policies may be posted separately.
 Project maintainers who do not follow or enforce the Code of Conduct in good
 faith may face temporary or permanent repercussions as determined by other
 members of the project's leadership.
 ## Attribution
 This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
 available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
 [homepage]: https://www.contributor-covenant.org
 For answers to common questions about this code of conduct, see
 https://www.contributor-covenant.org/faq
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1,189 +0,0 @@
 # Contributing to Llama-Stack
 We want to make contributing to this project as easy and transparent as
 possible.
 ## Discussions -> Issues -> Pull Requests
 We actively welcome your pull requests. However, please read the following. This is heavily inspired by [Ghostty](https://github.com/ghostty-org/ghostty/blob/main/CONTRIBUTING.md).
 If in doubt, please open a [discussion](https://github.com/meta-llama/llama-stack/discussions); we can always convert that to an issue later.
 **I'd like to contribute!**
 All issues are actionable (please report if they are not.) Pick one and start working on it. Thank you.
 If you need help or guidance, comment on the issue. Issues that are extra friendly to new contributors are tagged with "contributor friendly".
 **I have a bug!**
 1. Search the issue tracker and discussions for similar issues.
 2. If you don't have steps to reproduce, open a discussion.
 3. If you have steps to reproduce, open an issue.
 **I have an idea for a feature!**
 1. Open a discussion.
 **I've implemented a feature!**
 1. If there is an issue for the feature, open a pull request.
 2. If there is no issue, open a discussion and link to your branch.
 **I have a question!**
 1. Open a discussion or use [Discord](https://discord.gg/llama-stack).
 **Opening a Pull Request**
 1. Fork the repo and create your branch from `main`.
 2. If you've changed APIs, update the documentation.
 3. Ensure the test suite passes.
 4. Make sure your code lints using `pre-commit`.
 5. If you haven't already, complete the Contributor License Agreement ("CLA").
 6. Ensure your pull request follows the [conventional commits format](https://www.conventionalcommits.org/en/v1.0.0/).
 ## Contributor License Agreement ("CLA")
 In order to accept your pull request, we need you to submit a CLA. You only need
 to do this once to work on any of Meta's open source projects.
 Complete your CLA here: <https://code.facebook.com/cla>
 ## Issues
 We use GitHub issues to track public bugs. Please ensure your description is
 clear and has sufficient instructions to be able to reproduce the issue.
 Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
 disclosure of security bugs. In those cases, please go through the process
 outlined on that page and do not file a public issue.
 ## Set up your development environment
 We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
 You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
 You can install the dependencies by running:
 ```bash
 cd llama-stack
 uv sync --extra dev
 uv pip install -e .
 source .venv/bin/activate
 ```
 > [!NOTE]
 > You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.11`)
 > Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
 > For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
 Note that you can create a dotenv file `.env` that includes necessary environment variables:
 ```
 LLAMA_STACK_BASE_URL=http://localhost:8321
 LLAMA_STACK_CLIENT_LOG=debug
 LLAMA_STACK_PORT=8321
 LLAMA_STACK_CONFIG=<provider-name>
 TAVILY_SEARCH_API_KEY=
 BRAVE_SEARCH_API_KEY=
 ```
 And then use this dotenv file when running client SDK tests via the following:
 ```bash
 uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
 ```
 ## Pre-commit Hooks
 We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
 ```bash
 uv run pre-commit install
 ```
 After that, pre-commit hooks will run automatically before each commit.
 Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
 ```bash
 uv run pre-commit run --all-files
 ```
 > [!CAUTION]
 > Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
 ## Running tests
 You can find the Llama Stack testing documentation [here](tests/README.md).
 ## Adding a new dependency to the project
 To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run:
 ```bash
 uv add foo
 uv sync
 ```
 ## Coding Style
 * Comments should provide meaningful insights into the code. Avoid filler comments that simply
  describe the next step, as they create unnecessary clutter; the same goes for docstrings.
 * Prefer comments to clarify surprising behavior and/or relationships between parts of the code
  rather than explain what the next line of code does.
 * When catching exceptions, prefer using a specific exception type rather than a broad catch-all like
  `Exception`.
 * Error messages should be prefixed with "Failed to ..."
 * 4 spaces for indentation rather than tab
 * When using `# noqa` to suppress a style or linter warning, include a comment explaining the
  justification for bypassing the check.
 * When using `# type: ignore` to suppress a mypy warning, include a comment explaining the
  justification for bypassing the check.
 * Don't use unicode characters in the codebase. ASCII-only is preferred for compatibility or
  readability reasons.
 ## Common Tasks
 Some tips about common tasks you work on while contributing to Llama Stack:
 ### Using `llama stack build`
 Building a stack image (conda / docker) will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
 Example:
 ```bash
 cd work/
 git clone https://github.com/meta-llama/llama-stack.git
 git clone https://github.com/meta-llama/llama-stack-client-python.git
 cd llama-stack
 LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --template <...>
 ```
 ### Updating Provider Configurations
 If you have made changes to a provider's configuration in any form (introducing a new config key, or changing models, etc.), you should run `./scripts/distro_codegen.py` to re-generate various YAML files as well as the documentation. You should not change `docs/source/.../distributions/` files manually as they are auto-generated.
 ### Building the Documentation
 If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.
 ```bash
 # This rebuilds the documentation pages.
 uv run --group docs make -C docs/ html
 # This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
 uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
 ```
 ### Update API Documentation
 If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command:
 ```bash
 uv run ./docs/openapi_generator/run_openapi_generator.sh
 ```
 The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.
 ## License
 By contributing to Llama, you agree that your contributions will be licensed
 under the LICENSE file in the root directory of this source tree.
--- a/LICENSE
+++ b/LICENSE
@ -1,22 +0,0 @@
 MIT License
 Copyright (c) Meta Platforms, Inc. and affiliates
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:
 The above copyright notice and this permission notice shall be
 included in all copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,9 +0,0 @@
 include pyproject.toml
 include llama_stack/models/llama/llama3/tokenizer.model
 include llama_stack/models/llama/llama4/tokenizer.model
 include llama_stack/distribution/*.sh
 include llama_stack/cli/scripts/*.sh
 include llama_stack/templates/*/*.yaml
 include llama_stack/providers/tests/test_cases/inference/*.json
 include llama_stack/models/llama/*/*.md
 include llama_stack/tests/integration/*.jpg
--- a/README.md
+++ b/README.md
@ -1,177 +0,0 @@
 # Llama Stack
 [![PyPI version](https://img.shields.io/pypi/v/llama_stack.svg)](https://pypi.org/project/llama_stack/)
 [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/)
 [![License](https://img.shields.io/pypi/l/llama_stack.svg)](https://github.com/meta-llama/llama-stack/blob/main/LICENSE)
 [![Discord](https://img.shields.io/discord/1257833999603335178?color=6A7EC2&logo=discord&logoColor=ffffff)](https://discord.gg/llama-stack)
 [![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
 [![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)
 [**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
 ### ✨🎉 Llama 4 Support  🎉✨
 We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.
 <details>
 <summary>👋 Click here to see how to run Llama 4 models on Llama Stack </summary>
 \
 *Note you need 8xH100 GPU-host to run these models*
 ```bash
 pip install -U llama_stack
 MODEL="Llama-4-Scout-17B-16E-Instruct"
 # get meta url from llama.com
 llama model download --source meta --model-id $MODEL --meta-url <META_URL>
 # start a llama stack server
 INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu
 # install client to interact with the server
 pip install llama-stack-client
 ```
 ### CLI
 ```bash
 # Run a chat completion
 llama-stack-client --endpoint http://localhost:8321 \
 inference chat-completion \
 --model-id meta-llama/$MODEL \
 --message "write a haiku for meta's llama 4 models"
 ChatCompletionResponse(
    completion_message=CompletionMessage(content="Whispers in code born\nLlama's gentle, wise heartbeat\nFuture's soft unfold", role='assistant', stop_reason='end_of_turn', tool_calls=[]),
    logprobs=None,
    metrics=[Metric(metric='prompt_tokens', value=21.0, unit=None), Metric(metric='completion_tokens', value=28.0, unit=None), Metric(metric='total_tokens', value=49.0, unit=None)]
 )
 ```
 ### Python SDK
 ```python
 from llama_stack_client import LlamaStackClient
 client = LlamaStackClient(base_url=f"http://localhost:8321")
 model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
 prompt = "Write a haiku about coding"
 print(f"User> {prompt}")
 response = client.inference.chat_completion(
    model_id=model_id,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ],
 )
 print(f"Assistant> {response.completion_message.content}")
 ```
 As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
 </details>
 ### 🚀 One-Line Installer 🚀
 To try Llama Stack locally, run:
 ```bash
 curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/install.sh | sh
 ```
 ### Overview
 Llama Stack standardizes the core building blocks that simplify AI application development. It codifies best practices across the Llama ecosystem. More specifically, it provides:
 - **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
 - **Plugin architecture** to support the rich ecosystem of different API implementations in various environments, including local development, on-premises, cloud, and mobile.
 - **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment.
 - **Multiple developer interfaces** like CLI and SDKs for Python, Typescript, iOS, and Android.
 - **Standalone applications** as examples for how to build production-grade AI applications with Llama Stack.
 <div style="text-align: center;">
  <img
    src="https://github.com/user-attachments/assets/33d9576d-95ea-468d-95e2-8fa233205a50"
    width="480"
    title="Llama Stack"
    alt="Llama Stack"
  />
 </div>
 ### Llama Stack Benefits
 - **Flexible Options**: Developers can choose their preferred infrastructure without changing APIs and enjoy flexible deployment choices.
 - **Consistent Experience**: With its unified APIs, Llama Stack makes it easier to build, test, and deploy AI applications with consistent application behavior.
 - **Robust Ecosystem**: Llama Stack is already integrated with distribution partners (cloud providers, hardware vendors, and AI-focused companies) that offer tailored infrastructure, software, and services for deploying Llama models.
 By reducing friction and complexity, Llama Stack empowers developers to focus on what they do best: building transformative generative AI applications.
 ### API Providers
 Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.
 | **API Provider Builder** |    **Environments**    | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | **Post Training** |
 |:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|:-----------------:|
 |      Meta Reference      |      Single Node       |     ✅      |       ✅       |     ✅      |     ✅      |       ✅       |               |
 |        SambaNova         |         Hosted         |            |       ✅       |            |     ✅      |               |                  |
 |         Cerebras         |         Hosted         |            |       ✅       |            |            |               |                  |
 |        Fireworks         |         Hosted         |     ✅      |       ✅       |     ✅      |            |               |                |
 |       AWS Bedrock        |         Hosted         |            |       ✅       |            |     ✅      |               |                |
 |         Together         |         Hosted         |     ✅      |       ✅       |            |     ✅      |               |                |
 |           Groq           |         Hosted         |            |       ✅       |            |            |               |                 |
 |          Ollama          |      Single Node       |            |       ✅       |            |            |               |                 |
 |           TGI            | Hosted and Single Node |            |       ✅       |            |            |               |                 |
 |        NVIDIA NIM        | Hosted and Single Node |            |       ✅       |            |            |               |                 |
 |          Chroma          |      Single Node       |            |               |     ✅      |            |               |                 |
 |        PG Vector         |      Single Node       |            |               |     ✅      |            |               |                 |
 |    PyTorch ExecuTorch    |     On-device iOS      |     ✅      |       ✅       |            |            |               |                |
 |           vLLM           | Hosted and Single Node |            |       ✅       |            |            |               |                 |
 |          OpenAI          |         Hosted         |            |       ✅       |            |            |               |                 |
 |        Anthropic         |         Hosted         |            |       ✅       |            |            |               |                 |
 |          Gemini          |         Hosted         |            |       ✅       |            |            |               |                 |
 |          watsonx         |         Hosted         |            |       ✅       |            |            |               |                 |
 |        HuggingFace       |       Single Node      |            |                |            |            |               |       ✅        |
 |         TorchTune        |       Single Node      |            |                |            |            |               |       ✅        |
 |       NVIDIA NEMO        |         Hosted         |            |                |            |            |               |       ✅        |
 ### Distributions
 A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider implementations for each API component. Distributions make it easy to get started with a specific deployment scenario - you can begin with a local development setup (e.g., Ollama) and seamlessly transition to production (e.g., Fireworks) without changing your application code. Here are some of the distributions we support:
 |               **Distribution**                |                                                                    **Llama Stack Docker**                                                                     |                                                 Start This Distribution                                                  |
 |:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|
 |                Meta Reference                 |           [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general)           |      [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html)      |
 |                   SambaNova                   |                     [llamastack/distribution-sambanova](https://hub.docker.com/repository/docker/llamastack/distribution-sambanova/general)                     |   [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/sambanova.html)   |
 |                   Cerebras                    |                     [llamastack/distribution-cerebras](https://hub.docker.com/repository/docker/llamastack/distribution-cerebras/general)                     |   [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/cerebras.html)   |
 |                    Ollama                     |                       [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general)                       |            [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html)            |
 |                      TGI                      |                          [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general)                          |             [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/tgi.html)              |
 |                   Together                    |                     [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general)                     |           [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/together.html)           |
 |                   Fireworks                   |                    [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general)                    |          [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/fireworks.html)           |
 | vLLM |                  [llamastack/distribution-remote-vllm](https://hub.docker.com/repository/docker/llamastack/distribution-remote-vllm/general)                  |         [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html)          |
 ### Documentation
 Please check out our [Documentation](https://llama-stack.readthedocs.io/en/latest/index.html) page for more details.
 * CLI references
    * [llama (server-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/index.html): Guide for using the `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
    * [llama (client-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_stack_client_cli_reference.html): Guide for using the `llama-stack-client` CLI, which allows you to query information about the distribution.
 * Getting Started
    * [Quick guide to start a Llama Stack server](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).
    * [Jupyter notebook](./docs/getting_started.ipynb) that walks through how to use the simple text and vision inference llama_stack_client APIs.
    * The complete Llama Stack lesson [Colab notebook](https://colab.research.google.com/drive/1dtVmxotBsI4cGZQNsJRYPrLiDeT0Wnwt) of the new [Llama 3.2 course on Deeplearning.ai](https://learn.deeplearning.ai/courses/introducing-multimodal-llama-3-2/lesson/8/llama-stack).
    * A [Zero-to-Hero Guide](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) that guides you through all the key components of Llama Stack with code samples.
 * [Contributing](CONTRIBUTING.md)
    * [Adding a new API Provider](https://llama-stack.readthedocs.io/en/latest/contributing/new_api_provider.html): a walk-through of how to add a new API provider.
 ### Llama Stack Client SDKs
 |  **Language** |  **Client SDK** | **Package** |
 | :----: | :----: | :----: |
 | Python |  [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/)
 | Swift  | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
 | Typescript   | [llama-stack-client-typescript](https://github.com/meta-llama/llama-stack-client-typescript) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
 | Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
 Check out our client SDKs for connecting to a Llama Stack server in your preferred language. You can choose from [Python](https://github.com/meta-llama/llama-stack-client-python), [TypeScript](https://github.com/meta-llama/llama-stack-client-typescript), [Swift](https://github.com/meta-llama/llama-stack-client-swift), and [Kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) to quickly build your applications.
 You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.
--- a/SECURITY.md
+++ b/SECURITY.md
@ -1,5 +0,0 @@
 # Security Policy
 ## Reporting a Vulnerability
 Please report vulnerabilities to our bug bounty program at https://bugbounty.meta.com/
--- a/docs/Makefile
+++ b/docs/Makefile
@ -1,20 +0,0 @@
 # Minimal makefile for Sphinx documentation
 #
 # You can set these variables from the command line, and also
 # from the environment for the first two.
 SPHINXOPTS    ?=
 SPHINXBUILD   ?= sphinx-build
 # SOURCEDIR and BUILDDIR are handed to sphinx-build as the source and
 # output directories in every recipe below.
 SOURCEDIR     = source
 BUILDDIR      = _build
 # Put it first so that "make" without argument is like "make help".
 help:
 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 # Declared phony so the catch-all pattern rule below never applies to
 # "help" or to the Makefile itself.
 .PHONY: help Makefile
 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
 %: Makefile
 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/docs/_static/css/my_theme.css
+++ b/docs/_static/css/my_theme.css
@ -1,35 +0,0 @@
/* Load theme.css first; the rules below are layered on top of it. */
@import url("theme.css");
 /* Let the main content column use up to 90% of the available width. */
 .wy-nav-content {
    max-width: 90%;
 }
 /* Dark horizontal gradient behind .wy-nav-side (presumably the theme's sidebar). */
 .wy-nav-side {
    /* background: linear-gradient(45deg, #2980B9, #16A085); */
    background: linear-gradient(90deg, #332735, #1b263c);
 }
 /* Drop the search box background colour so the gradient shows through. */
 .wy-side-nav-search {
    background-color: transparent !important;
 }
 /* Pages can opt out of showing their h1 by using the hide-title class. */
 .hide-title h1 {
    display: none;
 }
 /* Render second- to fourth-level headings at normal weight. */
 h2, h3, h4 {
    font-weight: normal;
 }
 /* Near-black background for highlighted code blocks when data-theme is "dark". */
 html[data-theme="dark"] .rst-content div[class^="highlight"] {
  background-color: #0b0b0b;
 }
 /* Wrap preformatted text, breaking anywhere if needed, instead of overflowing. */
 pre {
    white-space: pre-wrap !important;
    word-break: break-all;
 }
 /* Give mermaid diagrams a light, rounded, padded backdrop when data-theme is "dark". */
 [data-theme="dark"] .mermaid {
    background-color: #f4f4f6 !important;
    border-radius: 6px;
    padding: 0.5em;
  }
--- a/docs/_static/js/detect_theme.js
+++ b/docs/_static/js/detect_theme.js
@ -1,32 +0,0 @@
 document.addEventListener("DOMContentLoaded", () => {
  // localStorage key under which the chosen theme is remembered
  const STORAGE_KEY = "sphinx-rtd-theme";
  const systemPrefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
  const root = document.documentElement;
  // Mirror a theme name onto <html data-theme="..."> and the body's "dark" class.
  const applyTheme = (theme) => {
    root.setAttribute("data-theme", theme);
    document.body.classList.toggle("dark", theme === "dark");
  };
  const storedTheme = localStorage.getItem(STORAGE_KEY);
  if (storedTheme) {
    // A previously saved choice wins over the system preference.
    applyTheme(storedTheme);
  } else {
    // First visit: follow the system colour scheme and remember it.
    const initialTheme = systemPrefersDark ? "dark" : "light";
    applyTheme(initialTheme);
    localStorage.setItem(STORAGE_KEY, initialTheme);
  }
  // Persist any later change of data-theme made by the existing toggle.
  const themeObserver = new MutationObserver((mutations) => {
    for (const mutation of mutations) {
      if (mutation.attributeName !== "data-theme") {
        continue;
      }
      localStorage.setItem(STORAGE_KEY, root.getAttribute("data-theme"));
    }
  });
  themeObserver.observe(root, { attributes: true });
 });
--- a/docs/_static/llama-stack-logo.png
+++ b/docs/_static/llama-stack-logo.png
--- a/docs/_static/llama-stack-spec.html
+++ b/docs/_static/llama-stack-spec.html
--- a/docs/_static/llama-stack-spec.yaml
+++ b/docs/_static/llama-stack-spec.yaml
--- a/docs/_static/llama-stack.png
+++ b/docs/_static/llama-stack.png
--- a/docs/_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
+++ b/docs/_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
--- a/docs/_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
+++ b/docs/_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
--- a/docs/_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
+++ b/docs/_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
--- a/docs/_static/remote_or_local.gif
+++ b/docs/_static/remote_or_local.gif
--- a/docs/_static/safety_system.webp
+++ b/docs/_static/safety_system.webp
--- a/docs/conftest.py
+++ b/docs/conftest.py
@ -1,24 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import os
 import time
 def pytest_collection_modifyitems(items):
    """Pytest hook: rewrite every collected item's name so spaces become underscores."""
    for collected in items:
        # splitting on a single space and re-joining with "_" swaps each space one-for-one
        collected.name = "_".join(collected.name.split(" "))
 def pytest_runtest_teardown(item):
    """
    Pytest hook: pause after each test when LLAMA_STACK_TEST_INTERVAL_SECONDS is set.
    The variable is read on every call and parsed as a float number of seconds;
    an unset or empty value means no pause.
    """
    pause = os.environ.get("LLAMA_STACK_TEST_INTERVAL_SECONDS")
    if not pause:
        return
    time.sleep(float(pause))
 def pytest_configure(config):
    """Pytest hook: force short tracebacks and turn on the disable_warnings option."""
    options = config.option
    options.tbstyle = "short"
    options.disable_warnings = True
--- a/docs/contbuild.sh
+++ b/docs/contbuild.sh
@ -1,7 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 # Build the docs from source/ into build/html with sphinx-autobuild, writing
 # all output files on each build and additionally watching source/ for changes.
 sphinx-autobuild --write-all source build/html --watch source/
--- a/docs/dog.jpg
+++ b/docs/dog.jpg
--- a/docs/getting_started.ipynb
+++ b/docs/getting_started.ipynb
--- a/docs/getting_started_llama4.ipynb
+++ b/docs/getting_started_llama4.ipynb
--- a/docs/getting_started_llama_api.ipynb
+++ b/docs/getting_started_llama_api.ipynb
--- a/docs/license_header.txt
+++ b/docs/license_header.txt
@ -1,5 +0,0 @@
 Copyright (c) Meta Platforms, Inc. and affiliates.
 All rights reserved.
 This source code is licensed under the terms described in the LICENSE file in
 the root directory of this source tree.
--- a/docs/make.bat
+++ b/docs/make.bat
@ -1,35 +0,0 @@
@ECHO OFF
 REM Run from the directory that contains this script; restored by popd on every exit path.
 pushd %~dp0
 REM Command file for Sphinx documentation
 REM Usage: make.bat <target>   (no target prints the Sphinx help)
 if "%SPHINXBUILD%" == "" (
 	set SPHINXBUILD=sphinx-build
 )
 REM Same source directory as the Makefile and contbuild.sh use.
 set SOURCEDIR=source
 set BUILDDIR=_build
 REM Probe for the executable; errorlevel 9009 means the command was not found.
 %SPHINXBUILD% >NUL 2>NUL
 if errorlevel 9009 (
 	echo.
 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
 	echo.installed, then set the SPHINXBUILD environment variable to point
 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
 	echo.may add the Sphinx directory to PATH.
 	echo.
 	echo.If you don't have Sphinx installed, grab it from
 	echo.https://www.sphinx-doc.org/
 	popd
 	exit /b 1
 )
 if "%1" == "" goto help
 REM Forward the requested target to Sphinx "make mode".
 %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
 goto end
 :help
 %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
 :end
 popd
--- a/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
+++ b/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb
--- a/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb
+++ b/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb
--- a/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
+++ b/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
--- a/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
+++ b/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb
--- a/docs/openapi_generator/README.md
+++ b/docs/openapi_generator/README.md
@ -1 +0,0 @@
 The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/distribution/server/endpoints.py` using the `generate.py` utility.
--- a/docs/openapi_generator/generate.py
+++ b/docs/openapi_generator/generate.py
@ -1,91 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described found in the
 # LICENSE file in the root directory of this source tree.
 from datetime import datetime
 from pathlib import Path
 import sys
 import fire
 import ruamel.yaml as yaml
 from llama_stack.apis.version import LLAMA_STACK_API_VERSION  # noqa: E402
 from llama_stack.distribution.stack import LlamaStack  # noqa: E402
 from .pyopenapi.options import Options  # noqa: E402
 from .pyopenapi.specification import Info, Server  # noqa: E402
 from .pyopenapi.utility import Specification, validate_api  # noqa: E402
 def str_presenter(dumper, data):
    """
    YAML string representer that folds long or multi-line strings.
    Strings starting with the versioned API path prefix or with
    "#/components/schemas/" always use the default style. Any other string that
    contains a newline or is longer than 40 characters is emitted in folded (">") style.
    """
    plain_prefixes = (f"/{LLAMA_STACK_API_VERSION}", "#/components/schemas/")
    keep_plain = data.startswith(plain_prefixes)
    wants_folding = "\n" in data or len(data) > 40
    style = ">" if wants_folding and not keep_plain else None
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)
 def main(output_dir: str):
    """
    Generates the Llama Stack OpenAPI specification as YAML and HTML.
    The API method return types are validated first; if any errors are reported
    they are printed to stderr and the process exits with status 1. Otherwise
    `llama-stack-spec.yaml` and `llama-stack-spec.html` are written into `output_dir`.
    :param output_dir: Path of an existing directory that receives the generated files.
    :raises ValueError: If `output_dir` does not exist.
    """
    output_dir = Path(output_dir)
    if not output_dir.exists():
        raise ValueError(f"Directory {output_dir} does not exist")
    # Validate API protocols before generating spec
    return_type_errors = validate_api()
    if return_type_errors:
        # heading and errors share stderr so they stay together when output is redirected
        print("\nAPI Method Return Type Validation Errors:\n", file=sys.stderr)
        for error in return_type_errors:
            print(error, file=sys.stderr)
        sys.exit(1)
    now = str(datetime.now())
    # name the files that are actually written below
    print(
        "Converting the spec to YAML (llama-stack-spec.yaml) and HTML (llama-stack-spec.html) at "
        + now
    )
    print("")
    spec = Specification(
        LlamaStack,
        Options(
            server=Server(url="http://any-hosted-llama-stack.com"),
            info=Info(
                title="Llama Stack Specification",
                version=LLAMA_STACK_API_VERSION,
                description="""This is the specification of the Llama Stack that provides
                a set of endpoints and their corresponding interfaces that are tailored to
                best leverage Llama Models.""",
            ),
            include_standard_error_responses=True,
        ),
    )
    with open(output_dir / "llama-stack-spec.yaml", "w", encoding="utf-8") as fp:
        y = yaml.YAML()
        y.default_flow_style = False
        y.block_seq_indent = 2
        y.map_indent = 2
        y.sequence_indent = 4
        y.sequence_dash_offset = 2
        y.width = 80
        y.allow_unicode = True
        # route every str through str_presenter so long strings are folded
        y.representer.add_representer(str, str_presenter)
        y.dump(
            spec.get_json(),
            fp,
        )
    # explicit UTF-8, matching the YAML output, so the result does not depend on the platform's locale encoding
    with open(output_dir / "llama-stack-spec.html", "w", encoding="utf-8") as fp:
        spec.write_html(fp, pretty_print=True)
 # When run as a script, hand main() to python-fire so its parameter is taken from the command line.
 if __name__ == "__main__":
    fire.Fire(main)
--- a/docs/openapi_generator/pyopenapi/README.md
+++ b/docs/openapi_generator/pyopenapi/README.md
@ -1 +0,0 @@
 This is forked from https://github.com/hunyadi/pyopenapi
--- a/docs/openapi_generator/pyopenapi/init.py
+++ b/docs/openapi_generator/pyopenapi/init.py
@ -1,5 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@ -1,938 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import hashlib
 import ipaddress
 import types
 import typing
 from dataclasses import make_dataclass
 from typing import Any, Dict, Set, Union
 from llama_stack.apis.datatypes import Error
 from llama_stack.strong_typing.core import JsonType
 from llama_stack.strong_typing.docstring import Docstring, parse_type
 from llama_stack.strong_typing.inspection import (
    is_generic_list,
    is_type_optional,
    is_type_union,
    unwrap_generic_list,
    unwrap_optional_type,
    unwrap_union_types,
 )
 from llama_stack.strong_typing.name import python_type_to_name
 from llama_stack.strong_typing.schema import (
    get_schema_identifier,
    JsonSchemaGenerator,
    register_schema,
    Schema,
    SchemaOptions,
 )
 from typing import get_origin, get_args
 from typing import Annotated
 from fastapi import UploadFile
 from llama_stack.strong_typing.serialization import json_dump_string, object_to_json
 from .operations import (
    EndpointOperation,
    get_endpoint_events,
    get_endpoint_operations,
    HTTPMethod,
 )
 from .options import *
 from .specification import (
    Components,
    Document,
    Example,
    ExampleRef,
    MediaType,
    Operation,
    Parameter,
    ParameterLocation,
    PathItem,
    RequestBody,
    Response,
    ResponseRef,
    SchemaOrRef,
    SchemaRef,
    Tag,
    TagGroup,
 )
 # Register a string schema (format "ipv4") with sample values for ipaddress.IPv4Address.
 register_schema(
    ipaddress.IPv4Address,
    schema={
        "type": "string",
        "format": "ipv4",
        "title": "IPv4 address",
        "description": "IPv4 address, according to dotted-quad ABNF syntax as defined in RFC 2673, section 3.2.",
    },
    examples=["192.0.2.0", "198.51.100.1", "203.0.113.255"],
 )
 # Register a string schema (format "ipv6") with sample values for ipaddress.IPv6Address.
 register_schema(
    ipaddress.IPv6Address,
    schema={
        "type": "string",
        "format": "ipv6",
        "title": "IPv6 address",
        "description": "IPv6 address, as defined in RFC 2373, section 2.2.",
    },
    examples=[
        "FEDC:BA98:7654:3210:FEDC:BA98:7654:3210",
        "1080:0:0:0:8:800:200C:417A",
        "1080::8:800:200C:417A",
        "FF01::101",
        "::1",
    ],
 )
 def http_status_to_string(status_code: HTTPStatusCode) -> str:
    """
    Converts an HTTP status code to a string.
    :param status_code: An `HTTPStatus` member, an integer code, or an already formatted string.
    :returns: The numeric value as text for `HTTPStatus` and `int`; a string is returned as is.
    :raises TypeError: If the value is none of the accepted types.
    """
    # NOTE(review): HTTPStatus arrives via the wildcard import of .options, presumably
    # http.HTTPStatus. It is checked before int so the member's numeric value is used.
    if isinstance(status_code, HTTPStatus):
        return str(status_code.value)
    if isinstance(status_code, str):
        return status_code
    if isinstance(status_code, int):
        return str(status_code)
    raise TypeError("expected: HTTP status code")
 class SchemaBuilder:
    """
    Converts Python types to JSON schemas and collects named schemas in `schemas`.
    The first schema registered under a name is kept; later registrations of the same name are ignored.
    """
    schema_generator: JsonSchemaGenerator
    schemas: Dict[str, Schema]
    def __init__(self, schema_generator: JsonSchemaGenerator) -> None:
        self.schemas = {}
        self.schema_generator = schema_generator
    def classdef_to_schema(self, typ: type) -> Schema:
        """
        Produces the JSON schema for a type.
        Every definition the generator reports alongside the schema is added to the registry of named schemas.
        """
        schema, definitions = self.schema_generator.classdef_to_schema(typ)
        # remember each reported definition for the OpenAPI Components Object section
        for definition_name, definition_schema in definitions.items():
            self._add_ref(definition_name, definition_schema)
        return schema
    def classdef_to_named_schema(self, name: str, typ: type) -> Schema:
        "Produces the JSON schema for a type and registers it under the given name."
        named_schema = self.classdef_to_schema(typ)
        self._add_ref(name, named_schema)
        return named_schema
    def classdef_to_ref(self, typ: type) -> SchemaOrRef:
        """
        Produces the JSON schema for a type, preferring a schema reference where a name is available.
        `str`, `int` and `float` are returned as plain schemas. Otherwise the schema is registered under
        its schema identifier or, failing that, its Python type name, and a reference is returned. If no
        name can be derived (`TypeError`), the plain schema is returned.
        """
        schema = self.classdef_to_schema(typ)
        # simple types are represented as themselves
        if any(typ is primitive for primitive in (str, int, float)):
            return schema
        identifier = get_schema_identifier(typ)
        if identifier is None:
            try:
                return self._build_ref(python_type_to_name(typ), schema)
            except TypeError:
                return schema
        return self._build_ref(identifier, schema)
    def _build_ref(self, type_name: str, type_schema: Schema) -> SchemaRef:
        "Registers the schema under the name and returns a reference to it."
        self._add_ref(type_name, type_schema)
        return SchemaRef(type_name)
    def _add_ref(self, type_name: str, type_schema: Schema) -> None:
        "Registers the schema under the name unless that name is already taken."
        self.schemas.setdefault(type_name, type_schema)
 class ContentBuilder:
    """
    Builds the media-type keyed `content` mapping for request and response bodies.
    Schemas come from the wrapped `SchemaBuilder`. The optional transformers are applied to every
    schema and to every example sample before they are emitted.
    """
    schema_builder: SchemaBuilder
    schema_transformer: Optional[Callable[[SchemaOrRef], SchemaOrRef]]
    sample_transformer: Optional[Callable[[JsonType], JsonType]]
    def __init__(
        self,
        schema_builder: SchemaBuilder,
        schema_transformer: Optional[Callable[[SchemaOrRef], SchemaOrRef]] = None,
        sample_transformer: Optional[Callable[[JsonType], JsonType]] = None,
    ) -> None:
        self.schema_builder = schema_builder
        self.schema_transformer = schema_transformer
        self.sample_transformer = sample_transformer
    def build_content(
        self, payload_type: type, examples: Optional[List[Any]] = None
    ) -> Dict[str, MediaType]:
        """
        Creates the content subtree for a request or response.
        For a union payload each member is mapped to a media type. When all members share one media
        type, a single entry is built for the whole union; otherwise one entry is built per member.
        A non-union payload is emitted as "application/jsonl" (generic list, using the item type) or
        "application/json".
        """
        stream_markers = ("StreamChunk", "OpenAIResponseObjectStream")
        def media_type_for(candidate):
            # generic lists become JSON lines; types whose text form names a stream become event streams
            if is_generic_list(candidate):
                return "application/jsonl"
            if any(marker in str(candidate) for marker in stream_markers):
                return "text/event-stream"
            return "application/json"
        if typing.get_origin(payload_type) in (typing.Union, types.UnionType):
            members = list(typing.get_args(payload_type))
            member_media_types = [media_type_for(member) for member in members]
            if len(set(member_media_types)) == 1:
                # all members have the same media type
                return {
                    member_media_types[0]: self.build_media_type(payload_type, examples)
                }
            # members have different media types
            content: Dict[str, MediaType] = {}
            for member_media_type, member in zip(member_media_types, members):
                content[member_media_type] = self.build_media_type(member, examples)
            return content
        if is_generic_list(payload_type):
            return {
                "application/jsonl": self.build_media_type(
                    unwrap_generic_list(payload_type), examples
                )
            }
        return {"application/json": self.build_media_type(payload_type, examples)}
    def build_media_type(
        self, item_type: type, examples: Optional[List[Any]] = None
    ) -> MediaType:
        """
        Creates the `MediaType` entry for one type.
        A single example is stored in `example`, several in `examples`; with no examples only the
        schema is set.
        """
        schema = self.schema_builder.classdef_to_ref(item_type)
        if self.schema_transformer:
            schema = self.schema_transformer(schema)
        if not examples:
            return MediaType(schema=schema)
        if len(examples) == 1:
            only_example = self._build_example(examples[0])
            return MediaType(schema=schema, example=only_example)
        return MediaType(schema=schema, examples=self._build_examples(examples))
    def _resolve_sample_transformer(self) -> Callable[[JsonType], JsonType]:
        "Returns the configured sample transformer, or an identity function when none is set."
        if self.sample_transformer:
            return self.sample_transformer
        return lambda sample: sample
    def _build_examples(
        self, examples: List[Any]
    ) -> Dict[str, Union[Example, ExampleRef]]:
        """
        Creates a set of several examples for a media type.
        Each example is keyed by "ex-" plus the first 16 hex digits of the SHA-256 of its JSON text.
        """
        transform = self._resolve_sample_transformer()
        named_examples: Dict[str, Union[Example, ExampleRef]] = {}
        for sample in examples:
            payload = transform(object_to_json(sample))
            fingerprint = hashlib.sha256(
                json_dump_string(payload).encode("utf-8")
            ).hexdigest()
            named_examples[f"ex-{fingerprint[:16]}"] = Example(value=payload)
        return named_examples
    def _build_example(self, example: Any) -> Any:
        "Creates a single example for a media type."
        transform = self._resolve_sample_transformer()
        return transform(object_to_json(example))
@dataclass
 class ResponseOptions:
    """
    Configuration options for building a response for an operation.
    :param type_descriptions: Maps each response type to a textual description (if available).
        Its keys are the response types that get grouped by HTTP status code.
    :param examples: A list of response examples, or `None` when there are none. Each example is
        attached to every response type it is an instance of.
    :param status_catalog: Maps each response type to an HTTP status code.
    :param default_status_code: HTTP status code assigned to responses that have no mapping
        in `status_catalog`.
    """
    type_descriptions: Dict[type, str]
    examples: Optional[List[Any]]
    status_catalog: Dict[type, HTTPStatusCode]
    default_status_code: HTTPStatusCode
@dataclass
 class StatusResponse:
    """
    Collects the response types and examples that share one HTTP status code.

    :param status_code: The status code, in string form, that the collected types are reported under.
    :param types: Response types assigned to the status code.
    :param examples: Examples that are instances of one of the collected response types.
    """
    status_code: str
    types: List[type] = dataclasses.field(default_factory=list)
    examples: List[Any] = dataclasses.field(default_factory=list)
 def create_docstring_for_request(
    request_name: str, fields: List[Tuple[Any, ...]], doc_params: Dict[str, str]
 ) -> str:
    """
    Creates a ReST-style docstring for a dynamically generated request dataclass.

    Field specifications follow the `dataclasses.make_dataclass` convention: each entry is a tuple whose
    first item is the field name, optionally followed by the field type and a default/field specifier.
    Only the name is used here, so both `(name, type)` and `(name, type, default)` entries are accepted.

    :param request_name: The name of the generated request class (currently not part of the output).
    :param fields: The field specifications of the generated request class.
    :param doc_params: Maps parameter names to their descriptions; missing names get an empty description.
    :return: A docstring with an empty short description followed by one `:param` line per field.
    """
    # The leading "\n" entry stands in for the (empty) short description.
    lines = ["\n"]
    # Add parameter documentation in ReST format
    for field in fields:
        # Index instead of unpacking so that entries with extra items do not raise.
        name = field[0]
        desc = doc_params.get(name, "")
        lines.append(f":param {name}: {desc}")
    return "\n".join(lines)
 class ResponseBuilder:
    """
    Builds response objects for an operation, merging response types that share a status code.
    """

    content_builder: ContentBuilder

    def __init__(self, content_builder: ContentBuilder) -> None:
        self.content_builder = content_builder

    def _get_status_responses(
        self, options: ResponseOptions
    ) -> Dict[str, StatusResponse]:
        """
        Buckets response types (and their matching examples) by status code.

        :param options: The response types, examples and status code mapping to bucket.
        :return: Status responses keyed by status code string, ordered by status code.
        """
        buckets: Dict[str, StatusResponse] = {}
        for candidate_type in options.type_descriptions:
            raw_status = options.status_catalog.get(
                candidate_type, options.default_status_code
            )
            code = http_status_to_string(raw_status)

            # fetch the bucket for this status code, creating it on first use
            bucket = buckets.get(code)
            if bucket is None:
                bucket = StatusResponse(code)
                buckets[code] = bucket

            # record the response type under its status code
            bucket.types.append(candidate_type)

            # record the examples that are instances of this response type
            if options.examples:
                for sample in options.examples:
                    if isinstance(sample, candidate_type):
                        bucket.examples.append(sample)

        return {code: buckets[code] for code in sorted(buckets)}

    def build_response(
        self, options: ResponseOptions
    ) -> Dict[str, Union[Response, ResponseRef]]:
        """
        Groups responses that have the same status code.

        Types sharing a status code are combined into a union type, and their non-empty descriptions are
        joined with " **OR** ".

        :param options: The response types, examples and status code mapping to build from.
        :return: Response objects keyed by status code string.
        """
        built: Dict[str, Union[Response, ResponseRef]] = {}
        for code, bucket in self._get_status_responses(options).items():
            member_types = tuple(bucket.types)
            if len(member_types) > 1:
                merged_type: type = Union[member_types]  # type: ignore
            else:
                (merged_type,) = member_types

            texts = [options.type_descriptions[member] for member in member_types]
            summary = " **OR** ".join(text for text in texts if text)

            built[code] = self._build_response(
                response_type=merged_type,
                description=summary,
                examples=bucket.examples or None,
            )
        return built

    def _build_response(
        self,
        response_type: type,
        description: str,
        examples: Optional[List[Any]] = None,
    ) -> Response:
        """
        Creates a response subtree.

        :param response_type: The type of the response payload; when `None`, the response has no content.
        :param description: The textual description of the response.
        :param examples: Optional examples to attach to the response content.
        :return: The response object.
        """
        if response_type is None:
            return Response(description=description)

        return Response(
            description=description,
            content=self.content_builder.build_content(response_type, examples),
        )
 def schema_error_wrapper(schema: SchemaOrRef) -> Schema:
    """
    Wraps an error output schema into a top-level error schema.

    :param schema: The schema (or schema reference) describing the error payload.
    :return: An object schema whose only, required property `error` has the given schema.
    """
    wrapped: Schema = {
        "type": "object",
        "properties": {"error": schema},  # type: ignore
        "additionalProperties": False,
        "required": ["error"],
    }
    return wrapped
 def sample_error_wrapper(error: JsonType) -> JsonType:
    """
    Wraps an error output sample into a top-level error sample.

    :param error: The error sample to wrap.
    :return: A JSON object that holds the sample under the key `error`.
    """
    envelope = {"error": error}
    return envelope
 class Generator:
    """
    Generates an OpenAPI document from an endpoint definition class.
    """
    # The class whose operations and events are inspected when generating the document.
    endpoint: type
    # Generation options (examples, extra types, security scheme, version, server, ...).
    options: Options
    # Converts Python types to schemas and accumulates them for the `components.schemas` section.
    schema_builder: SchemaBuilder
    # Reusable responses emitted in the `components.responses` section, keyed by component name.
    responses: Dict[str, Response]
    def __init__(self, endpoint: type, options: Options) -> None:
        """
        Sets up the schema builder and the reusable standard error responses.

        :param endpoint: The class whose operations are turned into an OpenAPI document.
        :param options: Options that control document generation.
        """
        self.endpoint = endpoint
        self.options = options

        schema_options = SchemaOptions(
            definitions_path="#/components/schemas/",
            use_examples=options.use_examples,
            property_description_fun=options.property_description_fun,
        )
        self.schema_builder = SchemaBuilder(JsonSchemaGenerator(schema_options))

        # Standard error responses are registered once and referenced from operations.
        self.responses = {}
        self._create_standard_error_responses()
    def _create_standard_error_responses(self) -> None:
        """
        Creates standard error responses that can be reused across operations.
        These will be added to the components.responses section of the OpenAPI document.

        All of them share the schema reference of the `Error` type and differ only in their description
        and example payload. The last entry is a default error response for any unhandled error cases.
        """
        error_schema = self.schema_builder.classdef_to_ref(Error)

        # (component name, response description, example status, example title, example detail)
        standard_errors = (
            (
                "BadRequest400",
                "The request was invalid or malformed",
                400,
                "Bad Request",
                "The request was invalid or malformed",
            ),
            (
                "TooManyRequests429",
                "The client has sent too many requests in a given amount of time",
                429,
                "Too Many Requests",
                "You have exceeded the rate limit. Please try again later.",
            ),
            (
                "InternalServerError500",
                "The server encountered an unexpected error",
                500,
                "Internal Server Error",
                "An unexpected error occurred. Our team has been notified.",
            ),
            (
                "DefaultError",
                "An unexpected error occurred",
                0,
                "Error",
                "An unexpected error occurred",
            ),
        )

        for component_name, summary, status, title, detail in standard_errors:
            self.responses[component_name] = Response(
                description=summary,
                content={
                    "application/json": MediaType(
                        schema=error_schema,
                        example={
                            "status": status,
                            "title": title,
                            "detail": detail,
                        },
                    )
                },
            )
    def _build_type_tag(self, ref: str, schema: Schema) -> Tag:
        """
        Creates a tag for a type from the title and description of its schema.

        :param ref: The name of the type, used as the tag name.
        :param schema: The schema whose `title` and `description` entries (when present) form the tag description.
        :return: The tag object.
        """
        # Don't include schema definition in the tag description because for one,
        # it is not very valuable and for another, it causes string formatting
        # discrepancies via the Stainless Studio.
        #
        # definition = f'<SchemaDefinition schemaRef="#/components/schemas/{ref}" />'
        heading = typing.cast(str, schema.get("title"))
        body = typing.cast(str, schema.get("description"))
        present_parts = [text for text in (heading, body) if text is not None]
        return Tag(name=ref, description="\n\n".join(present_parts))
    def _build_extra_tag_groups(
        self, extra_types: Dict[str, Union[Dict[str, type], List[type]]]
    ) -> Dict[str, List[Tag]]:
        """
        Creates a dictionary of tag group captions as keys, and tag lists as values.

        A category may be given either as a mapping of type names to types, or as a plain sequence of
        types. The latter form is what `generate` passes when the extra types option is a list; such
        types are named after their `__name__` (falling back to their string form).

        :param extra_types: A dictionary of type categories and the types in each category.
        :return: Tag lists keyed by category name; categories without any types are omitted.
        """
        extra_tags: Dict[str, List[Tag]] = {}
        for category_name, category_items in extra_types.items():
            if hasattr(category_items, "items"):
                named_types = list(category_items.items())
            else:
                # A bare sequence of types has no explicit names, so derive one per type
                # instead of failing on the missing `.items()` method.
                named_types = [
                    (getattr(extra_type, "__name__", None) or str(extra_type), extra_type)
                    for extra_type in category_items
                ]

            tag_list: List[Tag] = []
            for name, extra_type in named_types:
                schema = self.schema_builder.classdef_to_schema(extra_type)
                tag_list.append(self._build_type_tag(name, schema))

            if tag_list:
                extra_tags[category_name] = tag_list
        return extra_tags
    def _build_operation(self, op: EndpointOperation) -> Operation:
        """
        Builds the OpenAPI operation object for a single endpoint operation.

        Assembles the path and query parameters, the request body (raw bytes, multipart form data or JSON),
        the success and failure responses, references to the standard error responses (when enabled), and
        a callback definition when the operation declares an event type.

        :param op: The endpoint operation to describe.
        :return: The operation object for the endpoint operation.
        :raises ValueError: If the operation declares both a raw bytes request body and request parameters.
        :raises AssertionError: If no responses could be derived for the operation.
        """
        # NOTE: this renames the defining class in place (a side effect that outlives this call); the
        # changed `__name__` is what ends up as the operation tag below unless `API_NAMESPACE` is set.
        if op.defining_class.__name__ in [
            "SyntheticDataGeneration",
            "PostTraining",
            "BatchInference",
        ]:
            op.defining_class.__name__ = f"{op.defining_class.__name__} (Coming Soon)"
            print(op.defining_class.__name__)
        # TODO (xiyan): temporary fix for datasetio inner impl + datasets api
        # if op.defining_class.__name__ in ["DatasetIO"]:
        #     op.defining_class.__name__ = "Datasets"
        doc_string = parse_type(op.func_ref)
        # parameter name -> description, taken from the documentation of the operation function
        doc_params = dict(
            (param.name, param.description) for param in doc_string.params.values()
        )
        # parameters passed in URL component path
        path_parameters = [
            Parameter(
                name=param_name,
                in_=ParameterLocation.Path,
                description=doc_params.get(param_name),
                required=True,
                schema=self.schema_builder.classdef_to_ref(param_type),
            )
            for param_name, param_type in op.path_params
        ]
        # parameters passed in URL component query string
        query_parameters = []
        for param_name, param_type in op.query_params:
            # an optional type makes the query parameter non-required; the schema uses the inner type
            if is_type_optional(param_type):
                inner_type: type = unwrap_optional_type(param_type)
                required = False
            else:
                inner_type = param_type
                required = True
            query_parameter = Parameter(
                name=param_name,
                in_=ParameterLocation.Query,
                description=doc_params.get(param_name),
                required=required,
                schema=self.schema_builder.classdef_to_ref(inner_type),
            )
            query_parameters.append(query_parameter)
        # parameters passed anywhere
        parameters = path_parameters + query_parameters
        webmethod = getattr(op.func_ref, "__webmethod__", None)
        raw_bytes_request_body = False
        if webmethod:
            raw_bytes_request_body = getattr(webmethod, "raw_bytes_request_body", False)
        # data passed in request body as raw bytes cannot have request parameters
        if raw_bytes_request_body and op.request_params:
            raise ValueError(
                "Cannot have both raw bytes request body and request parameters"
            )
        # data passed in request body as raw bytes
        if raw_bytes_request_body:
            requestBody = RequestBody(
                content={
                    "application/octet-stream": {
                        "schema": {
                            "type": "string",
                            "format": "binary",
                        }
                    }
                },
                required=True,
            )
        # data passed in request body as multipart/form-data
        elif op.multipart_params:
            # NOTE(review): `builder` is not used in this branch
            builder = ContentBuilder(self.schema_builder)
            # Create schema properties for multipart form fields
            properties = {}
            required_fields = []
            for name, param_type in op.multipart_params:
                # strip the `Annotated[...]` wrapper (if any) to get at the underlying type
                if get_origin(param_type) is Annotated:
                    base_type = get_args(param_type)[0]
                else:
                    base_type = param_type
                if base_type is UploadFile:
                    # File upload
                    properties[name] = {
                        "type": "string",
                        "format": "binary"
                    }
                else:
                    # Form field
                    properties[name] = self.schema_builder.classdef_to_ref(base_type)
                # every multipart field is listed as required
                required_fields.append(name)
            multipart_schema = {
                "type": "object",
                "properties": properties,
                "required": required_fields
            }
            requestBody = RequestBody(
                content={
                    "multipart/form-data": {
                        "schema": multipart_schema
                    }
                },
                required=True,
            )
        # data passed in payload as JSON and mapped to request parameters
        elif op.request_params:
            builder = ContentBuilder(self.schema_builder)
            # NOTE(review): both values unpacked from `first` are overwritten below before being read
            first = next(iter(op.request_params))
            request_name, request_type = first
            # derive the request class name from the operation name, e.g. `foo_bar` -> `FooBarRequest`
            op_name = "".join(word.capitalize() for word in op.name.split("_"))
            request_name = f"{op_name}Request"
            fields = [
                (
                    name,
                    type_,
                )
                for name, type_ in op.request_params
            ]
            # synthesize a dataclass whose fields are the request parameters
            request_type = make_dataclass(
                request_name,
                fields,
                namespace={
                    "__doc__": create_docstring_for_request(
                        request_name, fields, doc_params
                    )
                },
            )
            requestBody = RequestBody(
                content={
                    "application/json": builder.build_media_type(
                        request_type, op.request_examples
                    )
                },
                # NOTE(review): `doc_params` is keyed by parameter names, so looking up the synthesized
                # request class name presumably yields `None` - confirm whether this is intended
                description=doc_params.get(request_name),
                required=True,
            )
        else:
            requestBody = None
        # success response types
        if doc_string.returns is None and is_type_union(op.response_type):
            # split union of return types into a list of response types
            success_type_docstring: Dict[type, Docstring] = {
                typing.cast(type, item): parse_type(item)
                for item in unwrap_union_types(op.response_type)
            }
            # describe each member type with the short description from its own documentation
            success_type_descriptions = {
                item: doc_string.short_description
                for item, doc_string in success_type_docstring.items()
            }
        else:
            # use return type as a single response type
            success_type_descriptions = {
                op.response_type: (
                    doc_string.returns.description if doc_string.returns else "OK"
                )
            }
        response_examples = op.response_examples or []
        # examples that are exception instances belong to the failure responses below
        success_examples = [
            example
            for example in response_examples
            if not isinstance(example, Exception)
        ]
        content_builder = ContentBuilder(self.schema_builder)
        response_builder = ResponseBuilder(content_builder)
        # success types without an entry in the status catalog are reported under "200"
        response_options = ResponseOptions(
            success_type_descriptions,
            success_examples if self.options.use_examples else None,
            self.options.success_responses,
            "200",
        )
        responses = response_builder.build_response(response_options)
        # failure response types
        if doc_string.raises:
            exception_types: Dict[type, str] = {
                item.raise_type: item.description for item in doc_string.raises.values()
            }
            exception_examples = [
                example
                for example in response_examples
                if isinstance(example, Exception)
            ]
            # optionally wrap error schemas and samples into a top-level `error` property
            if self.options.error_wrapper:
                schema_transformer = schema_error_wrapper
                sample_transformer = sample_error_wrapper
            else:
                schema_transformer = None
                sample_transformer = None
            content_builder = ContentBuilder(
                self.schema_builder,
                schema_transformer=schema_transformer,
                sample_transformer=sample_transformer,
            )
            response_builder = ResponseBuilder(content_builder)
            # exception types without an entry in the status catalog are reported under "500"
            response_options = ResponseOptions(
                exception_types,
                exception_examples if self.options.use_examples else None,
                self.options.error_responses,
                "500",
            )
            # failure responses replace success responses that share the same status code key
            responses.update(response_builder.build_response(response_options))
        assert len(responses.keys()) > 0, f"No responses found for {op.name}"
        # Add standard error response references
        # (only for status codes that the operation has not defined itself)
        if self.options.include_standard_error_responses:
            if "400" not in responses:
                responses["400"] = ResponseRef("BadRequest400")
            if "429" not in responses:
                responses["429"] = ResponseRef("TooManyRequests429")
            if "500" not in responses:
                responses["500"] = ResponseRef("InternalServerError500")
            if "default" not in responses:
                responses["default"] = ResponseRef("DefaultError")
        # operations that declare an event type get a callback that posts the event payload
        if op.event_type is not None:
            builder = ContentBuilder(self.schema_builder)
            callbacks = {
                f"{op.func_name}_callback": {
                    "{$request.query.callback}": PathItem(
                        post=Operation(
                            requestBody=RequestBody(
                                content=builder.build_content(op.event_type)
                            ),
                            responses={"200": Response(description="OK")},
                        )
                    )
                }
            }
        else:
            callbacks = None
        # the operation description is the short and long description joined by a newline
        description = "\n".join(
            filter(None, [doc_string.short_description, doc_string.long_description])
        )
        return Operation(
            # the tag is the `API_NAMESPACE` of the defining class when present, else the class name
            tags=[getattr(op.defining_class, "API_NAMESPACE", op.defining_class.__name__)],
            summary=None,
            # summary=doc_string.short_description,
            description=description,
            parameters=parameters,
            requestBody=requestBody,
            responses=responses,
            callbacks=callbacks,
            # a function name that contains "DEPRECATED" marks the operation as deprecated
            deprecated=True if "DEPRECATED" in op.func_name else None,
            # public operations get an empty security requirement list
            security=[] if op.public else None,
        )
    def generate(self) -> Document:
        """
        Produces the complete OpenAPI document for the endpoint.

        Builds a path item for every endpoint operation, collects tags for the operation classes, the event
        types and the explicitly declared extra types, arranges the tags into tag groups, and assembles the
        document together with the accumulated schemas and the standard responses.

        :return: The assembled OpenAPI document.
        :raises NotImplementedError: If an operation uses an HTTP method other than GET, PUT, POST, DELETE or PATCH.
        :raises TypeError: If the extra types option is neither a list nor a dictionary.
        """
        # HTTP method -> name of the path item field that holds the operation
        method_fields = {
            HTTPMethod.GET: "get",
            HTTPMethod.PUT: "put",
            HTTPMethod.POST: "post",
            HTTPMethod.DELETE: "delete",
            HTTPMethod.PATCH: "patch",
        }

        paths: Dict[str, PathItem] = {}
        endpoint_classes: Set[type] = set()
        endpoint_operations = get_endpoint_operations(
            self.endpoint, use_examples=self.options.use_examples
        )
        for op in endpoint_operations:
            endpoint_classes.add(op.defining_class)
            operation = self._build_operation(op)

            field_name = method_fields.get(op.http_method)
            if field_name is None:
                raise NotImplementedError(f"unknown HTTP method: {op.http_method}")
            path_item = PathItem(**{field_name: operation})

            route = op.get_route().replace(":path", "")
            print(f"route: {route}")

            # operations on the same route are merged into a single path item
            known_item = paths.get(route)
            if known_item is None:
                paths[route] = path_item
            else:
                known_item.update(path_item)

        # tags for classes that define operations; classes that are published under a
        # different API namespace do not get a tag of their own
        operation_tags: List[Tag] = []
        for cls in endpoint_classes:
            doc_string = parse_type(cls)
            has_foreign_namespace = (
                hasattr(cls, "API_NAMESPACE") and cls.API_NAMESPACE != cls.__name__
            )
            if not has_foreign_namespace:
                operation_tags.append(
                    Tag(
                        name=cls.__name__,
                        description=doc_string.long_description,
                        displayName=doc_string.short_description,
                    )
                )

        # types that are emitted by events
        event_tags: List[Tag] = [
            self._build_type_tag(
                ref, self.schema_builder.classdef_to_named_schema(ref, event_type)
            )
            for ref, event_type in get_endpoint_events(self.endpoint).items()
        ]

        # types that are explicitly declared
        declared_types = self.options.extra_types
        extra_tag_groups: Dict[str, List[Tag]] = {}
        if declared_types is not None:
            if isinstance(declared_types, list):
                extra_tag_groups = self._build_extra_tag_groups(
                    {"AdditionalTypes": declared_types}
                )
            elif isinstance(declared_types, dict):
                extra_tag_groups = self._build_extra_tag_groups(declared_types)
            else:
                raise TypeError(
                    f"type mismatch for collection of extra types: {type(self.options.extra_types)}"
                )

        # list all operations and types
        all_tags: List[Tag] = [*operation_tags, *event_tags]
        for group_members in extra_tag_groups.values():
            all_tags.extend(group_members)
        all_tags.sort(key=lambda t: t.name)

        def make_group(caption, members):
            "Creates a tag group that lists the names of its member tags in sorted order."
            return TagGroup(name=caption, tags=sorted(tag.name for tag in members))

        tag_groups = []
        if operation_tags:
            tag_groups.append(
                make_group(self.options.map("Operations"), operation_tags)
            )
        if event_tags:
            tag_groups.append(make_group(self.options.map("Events"), event_tags))
        for caption, group_members in extra_tag_groups.items():
            tag_groups.append(make_group(caption, group_members))

        default_scheme = self.options.default_security_scheme
        security_schemes = {"Default": default_scheme} if default_scheme else None

        version = self.options.version
        if version >= (3, 1, 0):
            schema_dialect = "https://json-schema.org/draft/2020-12/schema"
        else:
            schema_dialect = None

        return Document(
            openapi=".".join(str(item) for item in version),
            info=self.options.info,
            jsonSchemaDialect=schema_dialect,
            servers=[self.options.server],
            paths=paths,
            components=Components(
                schemas=self.schema_builder.schemas,
                responses=self.responses,
                securitySchemes=security_schemes,
            ),
            security=[{"Default": []}],
            tags=all_tags,
            tagGroups=tag_groups,
        )
--- a/docs/openapi_generator/pyopenapi/operations.py
+++ b/docs/openapi_generator/pyopenapi/operations.py
@ -1,424 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import collections.abc
 import enum
 import inspect
 import typing
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Union
 from llama_stack.apis.version import LLAMA_STACK_API_VERSION
 from termcolor import colored
 from llama_stack.strong_typing.inspection import get_signature
 from typing import get_origin, get_args
 from fastapi import UploadFile 
 from fastapi.params import File, Form
 from typing import Annotated
 def split_prefix(
    s: str, sep: str, prefix: Union[str, Iterable[str]]
 ) -> Tuple[Optional[str], str]:
    """
    Recognizes a prefix at the beginning of a string.

    When several prefixes are given, they are tried in iteration order and the first match wins.

    :param s: The string to check.
    :param sep: A separator between (one of) the prefix(es) and the rest of the string.
    :param prefix: A string or a set of strings to identify as a prefix.
    :return: A tuple of the recognized prefix (if any) and the rest of the string excluding the separator (or the entire string).
    """
    # A lone string is a single candidate, not an iterable of one-character prefixes.
    candidates = (prefix,) if isinstance(prefix, str) else prefix
    for candidate in candidates:
        marker = candidate + sep
        if s.startswith(marker):
            return candidate, s[len(marker) :]
    return None, s
 def _get_annotation_type(annotation: Union[type, str], callable: Callable) -> type:
    "Maps a stringized reference to a type, as if using `from __future__ import annotations`."
    if isinstance(annotation, str):
        return eval(annotation, callable.__globals__)
    else:
        return annotation
 class HTTPMethod(enum.Enum):
    "HTTP method used to invoke an endpoint operation."
    # The value of each member is the method name as an upper-case string.
    GET = "GET"
    POST = "POST"
    PUT = "PUT"
    DELETE = "DELETE"
    PATCH = "PATCH"
 # A parameter of an endpoint operation, as a pair of the parameter name and its Python type.
 OperationParameter = Tuple[str, type]
 class ValidationError(TypeError):
    "A specialized `TypeError` for validation failures; adds no behavior of its own."
@dataclass
 class EndpointOperation:
    """
    Type information and metadata associated with an endpoint operation.
    :param defining_class: The most specific class that defines the endpoint operation.
    :param name: The short name of the endpoint operation.
    :param func_name: The name of the function to invoke when the operation is triggered.
    :param func_ref: The callable to invoke when the operation is triggered.
    :param route: A custom route string assigned to the operation.
    :param path_params: Parameters of the operation signature that are passed in the path component of the URL string.
    :param query_params: Parameters of the operation signature that are passed in the query string as `key=value` pairs.
    :param request_params: The parameter that corresponds to the data transmitted in the request body.
    :param multipart_params: Parameters that indicate multipart/form-data request body.
    :param event_type: The Python type of the data that is transmitted out-of-band (e.g. via websockets) while the operation is in progress.
    :param response_type: The Python type of the data that is transmitted in the response body.
    :param http_method: The HTTP method used to invoke the endpoint such as POST, GET or PUT.
    :param public: True if the operation can be invoked without prior authentication.
    :param request_examples: Sample requests that the operation might take.
    :param response_examples: Sample responses that the operation might produce.
    """
    defining_class: type
    name: str
    func_name: str
    func_ref: Callable[..., Any]
    route: Optional[str]
    path_params: List[OperationParameter]
    query_params: List[OperationParameter]
    # NOTE(review): get_endpoint_operations passes a list of (name, type) pairs here rather than a single optional pair — confirm the intended annotation
    request_params: Optional[OperationParameter]
    multipart_params: List[OperationParameter]
    event_type: Optional[type]
    response_type: type
    http_method: HTTPMethod
    public: bool
    request_examples: Optional[List[Any]] = None
    response_examples: Optional[List[Any]] = None
    def get_route(self) -> str:
        "Builds the URL path of the operation, always prefixed with `/<LLAMA_STACK_API_VERSION>`."
        if self.route is not None:
            # custom route: leading slashes are stripped so the join does not produce "//"
            return "/".join(["", LLAMA_STACK_API_VERSION, self.route.lstrip("/")])
        # default route: the operation name followed by one "{param}" segment per path parameter
        route_parts = ["", LLAMA_STACK_API_VERSION, self.name]
        for param_name, _ in self.path_params:
            route_parts.append("{" + param_name + "}")
        return "/".join(route_parts)
 class _FormatParameterExtractor:
    "A visitor to exract parameters in a format string."
    keys: List[str]
    def __init__(self) -> None:
        self.keys = []
    def __getitem__(self, key: str) -> None:
        self.keys.append(key)
        return None
 def _get_route_parameters(route: str) -> List[str]:
    "Lists the placeholder names of a route template such as `/items/{item_id}`, in order of appearance."
    # strip every ":path" occurrence (presumably the converter suffix of "{name:path}") so only the field name remains
    plain_route = route.replace(":path", "")
    collector = _FormatParameterExtractor()
    # formatting against the collector records each field name; the formatted result is discarded
    plain_route.format_map(collector)
    return collector.keys
 def _get_endpoint_functions(
    endpoint: type, prefixes: List[str]
 ) -> Iterator[Tuple[str, str, str, Callable]]:
    """
    Yields the web-method functions of an endpoint class together with an HTTP verb prefix.
    Only member functions carrying a truthy `__webmethod__` attribute are considered. The prefix is taken from
    `__webmethod__.method` when that is "GET", "DELETE" or "POST"; otherwise it is derived from the function name,
    with "post" as the fallback.
    :param endpoint: The class whose member functions are scanned.
    :param prefixes: Not used by this function.
    :return: An iterator of `(prefix, operation_name, func_name, func_ref)` tuples; `operation_name` equals `func_name`.
    :raises ValueError: If `endpoint` is not a class.
    """
    if not inspect.isclass(endpoint):
        raise ValueError(f"object is not a class type: {endpoint}")
    explicit_verbs = (("GET", "get"), ("DELETE", "delete"), ("POST", "post"))
    for func_name, func_ref in inspect.getmembers(endpoint, inspect.isfunction):
        webmethod = getattr(func_ref, "__webmethod__", None)
        if not webmethod:
            continue
        print(f"Processing {colored(func_name, 'white')}...")
        operation_name = func_name
        for verb, verb_prefix in explicit_verbs:
            if webmethod.method == verb:
                prefix = verb_prefix
                break
        else:
            # no explicit verb matched: fall back to the naming convention
            if operation_name.startswith("get_") or operation_name.endswith("/get"):
                prefix = "get"
            elif operation_name.startswith(
                ("delete_", "remove_")
            ) or operation_name.endswith(("/delete", "/remove")):
                prefix = "delete"
            else:
                # by default everything else is a POST
                prefix = "post"
        yield prefix, operation_name, func_name, func_ref
 def _get_defining_class(member_fn: str, derived_cls: type) -> type:
    """
    Finds the class in which a member function is first defined in a class inheritance hierarchy.
    The method resolution order of `derived_cls` is walked in reverse, i.e. from the most generic base class
    towards `derived_cls`, and the first class exposing a function named `member_fn` is returned.
    :param member_fn: The name of the member function to look for.
    :param derived_cls: The class whose hierarchy is searched.
    :return: The matching class; `ToolRuntime` is returned in place of `RAGToolRuntime`.
    :raises ValidationError: If no class in the hierarchy has a function with that name.
    """
    # This import must be dynamic here
    from llama_stack.apis.tools import RAGToolRuntime, ToolRuntime
    for candidate in reversed(inspect.getmro(derived_cls)):
        function_names = [
            name for name, _ in inspect.getmembers(candidate, inspect.isfunction)
        ]
        if member_fn not in function_names:
            continue
        # HACK ALERT
        return ToolRuntime if candidate == RAGToolRuntime else candidate
    raise ValidationError(
        f"cannot find defining class for {member_fn} in {derived_cls}"
    )
 def get_endpoint_operations(
    endpoint: type, use_examples: bool = True
 ) -> List[EndpointOperation]:
    """
    Extracts a list of member functions in a class eligible for HTTP interface binding.
    These member functions are expected to have a signature like
    ```
    async def get_object(self, uuid: str, version: int) -> Object:
        ...
    ```
    where the prefix `get_` translates to an HTTP GET, `object` corresponds to the name of the endpoint operation,
    `uuid` and `version` are mapped to route path elements in "/object/{uuid}/{version}", and `Object` becomes
    the response payload type, transmitted as an object serialized to JSON.
    If the member function has a composite class type in the argument list, it becomes the request payload type,
    and the caller is expected to provide the data as serialized JSON in an HTTP POST request.
    Note that the HTTP verb is taken from the function's `__webmethod__.method` when it is GET, DELETE or POST;
    the naming convention above is only the fallback.
    :param endpoint: A class with member functions that can be mapped to an HTTP endpoint.
    :param use_examples: Whether to return examples associated with member functions.
    :return: One `EndpointOperation` per eligible member function.
    :raises ValidationError: If a parameter or the return value lacks a type annotation, an event generator declares
        a send type, the verb prefix is unknown, or the class has no eligible operation at all.
    """
    result = []
    for prefix, operation_name, func_name, func_ref in _get_endpoint_functions(
        endpoint,
        [
            "create",
            "delete",
            "do",
            "get",
            "post",
            "put",
            "remove",
            "set",
            "update",
        ],
    ):
        # extract routing information from function metadata
        webmethod = getattr(func_ref, "__webmethod__", None)
        if webmethod is not None:
            route = webmethod.route
            route_params = _get_route_parameters(route) if route is not None else None
            public = webmethod.public
            request_examples = webmethod.request_examples
            response_examples = webmethod.response_examples
        else:
            route = None
            route_params = None
            public = False
            request_examples = None
            response_examples = None
        # inspect function signature for path and query parameters, and request/response payload type
        signature = get_signature(func_ref)
        path_params = []
        query_params = []
        request_params = []
        multipart_params = []
        for param_name, parameter in signature.parameters.items():
            param_type = _get_annotation_type(parameter.annotation, func_ref)
            # omit "self" for instance methods
            if param_name == "self" and param_type is inspect.Parameter.empty:
                continue
            # check if all parameters have explicit type
            if parameter.annotation is inspect.Parameter.empty:
                raise ValidationError(
                    f"parameter '{param_name}' in function '{func_name}' has no type annotation"
                )
            is_multipart = _is_multipart_param(param_type)
            if prefix in ["get", "delete"]:
                # GET/DELETE carry no body: anything not named in the route goes to the query string
                if route_params is not None and param_name in route_params:
                    path_params.append((param_name, param_type))
                else:
                    query_params.append((param_name, param_type))
            else:
                if route_params is not None and param_name in route_params:
                    path_params.append((param_name, param_type))
                elif is_multipart:
                    multipart_params.append((param_name, param_type))
                else:
                    request_params.append((param_name, param_type))
        # check if function has explicit return type
        if signature.return_annotation is inspect.Signature.empty:
            raise ValidationError(
                f"function '{func_name}' has no return type annotation"
            )
        return_type = _get_annotation_type(signature.return_annotation, func_ref)
        # operations that produce events are labeled as Generator[YieldType, SendType, ReturnType]
        # where YieldType is the event type, SendType is None, and ReturnType is the immediate response type to the request
        if typing.get_origin(return_type) is collections.abc.Generator:
            event_type, send_type, response_type = typing.get_args(return_type)
            if send_type is not type(None):
                raise ValidationError(
                    f"function '{func_name}' has a return type Generator[Y,S,R] and therefore looks like an event but has an explicit send type"
                )
        else:
            event_type = None
            def process_type(t):
                if typing.get_origin(t) is collections.abc.AsyncIterator:
                    # NOTE(ashwin): this is SSE and there is no way to represent it. either we make it a List
                    # or the item type. I am choosing it to be the latter
                    args = typing.get_args(t)
                    return args[0]
                elif typing.get_origin(t) is typing.Union:
                    types = [process_type(a) for a in typing.get_args(t)]
                    # NOTE(review): relies on the private `typing._UnionGenericAlias`; kept because the public
                    # `typing.Union[...]` would de-duplicate members and could change the generated output
                    return typing._UnionGenericAlias(typing.Union, tuple(types))
                else:
                    return t
            response_type = process_type(return_type)
        # The HTTP method is chosen for every operation, including event generators. It used to be assigned
        # only in the non-Generator branch above, which left `http_method` unbound (or stale from the previous
        # loop iteration) for Generator-returning operations.
        if prefix in ["delete", "remove"]:
            http_method = HTTPMethod.DELETE
        elif prefix == "post":
            http_method = HTTPMethod.POST
        elif prefix == "get":
            http_method = HTTPMethod.GET
        elif prefix == "set":
            http_method = HTTPMethod.PUT
        elif prefix == "update":
            http_method = HTTPMethod.PATCH
        else:
            raise ValidationError(f"unknown prefix {prefix}")
        result.append(
            EndpointOperation(
                defining_class=_get_defining_class(func_name, endpoint),
                name=operation_name,
                func_name=func_name,
                func_ref=func_ref,
                route=route,
                path_params=path_params,
                query_params=query_params,
                request_params=request_params,
                multipart_params=multipart_params,
                event_type=event_type,
                response_type=response_type,
                http_method=http_method,
                public=public,
                request_examples=request_examples if use_examples else None,
                response_examples=response_examples if use_examples else None,
            )
        )
    if not result:
        raise ValidationError(f"no eligible endpoint operations in type {endpoint}")
    return result
 def get_endpoint_events(endpoint: type) -> Dict[str, type]:
    """
    Collects the event types declared on an endpoint class.
    An event is recognized from a type hint of the exact form `Callable[[EventType], None]`: a callable with
    a single listed parameter that returns None. Every other hint is ignored.
    :param endpoint: The class whose type hints are inspected.
    :return: A dictionary that maps the `__name__` of each event type to the type itself.
    """
    events = {}
    for hint in typing.get_type_hints(endpoint).values():
        hint_origin = typing.get_origin(hint)
        # keep only Callable[...] declarations
        if hint_origin is not None and issubclass(hint_origin, Callable):  # type: ignore
            hint_args = typing.get_args(hint)
            # Callable[[...], R] unpacks into exactly two arguments
            if len(hint_args) == 2:
                accepted, returned = hint_args
                # the parameters must be an explicit list, the return type None, and there must be one parameter
                if (
                    isinstance(accepted, list)
                    and issubclass(returned, type(None))
                    and len(accepted) == 1
                ):
                    (event_type,) = accepted
                    events[event_type.__name__] = event_type
    return events
 def _is_multipart_param(param_type: type) -> bool:
    """
    Tells whether a parameter type marks the parameter as multipart form data.
    This holds for a bare `UploadFile` and for any `Annotated[...]` type whose metadata contains a `File` or
    `Form` instance, for example:
    - Annotated[UploadFile, File()]
    - Annotated[str, Form()]
    - Annotated[Any, File()]
    - Annotated[Any, Form()]
    :param param_type: The (resolved) annotation of the parameter.
    :return: True if the parameter belongs to a multipart/form-data body.
    """
    if param_type is UploadFile:
        return True
    # everything else can only qualify through Annotated metadata
    if get_origin(param_type) is not Annotated:
        return False
    # the first argument is the underlying type; the remaining ones are the metadata
    metadata = get_args(param_type)[1:]
    return any(isinstance(marker, (File, Form)) for marker in metadata)
--- a/docs/openapi_generator/pyopenapi/options.py
+++ b/docs/openapi_generator/pyopenapi/options.py
@ -1,77 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import dataclasses
 from dataclasses import dataclass
 from http import HTTPStatus
 from typing import Callable, ClassVar, Dict, List, Optional, Tuple, Union
 from .specification import (
    Info,
    SecurityScheme,
    SecuritySchemeAPI,
    SecuritySchemeHTTP,
    SecuritySchemeOpenIDConnect,
    Server,
 )
 # An HTTP status given as an `HTTPStatus` member, an integer, or a string.
 HTTPStatusCode = Union[HTTPStatus, int, str]
@dataclass
 class Options:
    """
    :param server: Base URL for the API endpoint.
    :param info: Meta-information for the endpoint specification.
    :param version: OpenAPI specification version as a tuple of major, minor, revision.
    :param default_security_scheme: Security scheme to apply to endpoints, unless overridden on a per-endpoint basis.
    :param extra_types: Extra types in addition to those found in operation signatures. Use a dictionary to group related types.
    :param use_examples: Whether to emit examples for operations.
    :param success_responses: Associates operation response types with HTTP status codes.
    :param error_responses: Associates error response types with HTTP status codes.
    :param error_wrapper: True if errors are encapsulated in an error object wrapper.
    :param property_description_fun: Custom transformation function to apply to class property documentation strings.
    :param captions: User-defined captions for sections such as "Operations" or "Types", and (if applicable) groups of extra types.
    :param include_standard_error_responses: Whether to include standard error responses (400, 429, 500, 503) in all operations.
    """
    server: Server
    info: Info
    version: Tuple[int, int, int] = (3, 1, 0)
    default_security_scheme: Optional[SecurityScheme] = None
    extra_types: Union[List[type], Dict[str, List[type]], None] = None
    use_examples: bool = True
    success_responses: Dict[type, HTTPStatusCode] = dataclasses.field(
        default_factory=dict
    )
    error_responses: Dict[type, HTTPStatusCode] = dataclasses.field(
        default_factory=dict
    )
    error_wrapper: bool = False
    property_description_fun: Optional[Callable[[type, str, str], str]] = None
    captions: Optional[Dict[str, str]] = None
    include_standard_error_responses: bool = True
    default_captions: ClassVar[Dict[str, str]] = {
        "Operations": "Operations",
        "Types": "Types",
        "Events": "Events",
        "AdditionalTypes": "Additional types",
    }
    def map(self, id: str) -> str:
        "Maps a language-neutral placeholder string to language-dependent text."
        if self.captions is not None:
            caption = self.captions.get(id)
            if caption is not None:
                return caption
        caption = self.__class__.default_captions.get(id)
        if caption is not None:
            return caption
        raise KeyError(f"no caption found for ID: {id}")
--- a/docs/openapi_generator/pyopenapi/specification.py
+++ b/docs/openapi_generator/pyopenapi/specification.py
@ -1,259 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import dataclasses
 import enum
 from dataclasses import dataclass
 from typing import Any, ClassVar, Dict, List, Optional, Union
 from llama_stack.strong_typing.schema import JsonType, Schema, StrictJsonType
 # URLs are represented as plain strings.
 URL = str
@dataclass
 class Ref:
    "Base class for component references; serializes to a `$ref` that points at `#/components/<ref_type>/<id>`."
    # name of the component section; assigned by each subclass
    ref_type: ClassVar[str]
    # identifier of the referenced component within that section
    id: str
    def to_json(self) -> StrictJsonType:
        "Returns the reference as a JSON object with the single key `$ref`."
        return {"$ref": f"#/components/{self.ref_type}/{self.id}"}
@dataclass
 class SchemaRef(Ref):
    "Reference into the `schemas` component section."
    ref_type: ClassVar[str] = "schemas"
 # Either an inline schema or a reference to a schema component.
 SchemaOrRef = Union[Schema, SchemaRef]
@dataclass
 class ResponseRef(Ref):
    "Reference into the `responses` component section."
    ref_type: ClassVar[str] = "responses"
@dataclass
 class ParameterRef(Ref):
    "Reference into the `parameters` component section."
    ref_type: ClassVar[str] = "parameters"
@dataclass
 class ExampleRef(Ref):
    "Reference into the `examples` component section."
    ref_type: ClassVar[str] = "examples"
@dataclass
 class Contact:
    "Contact details (name, URL, e-mail address); every field is optional."
    name: Optional[str] = None
    url: Optional[URL] = None
    email: Optional[str] = None
@dataclass
 class License:
    "License information: a mandatory name and an optional URL."
    name: str
    url: Optional[URL] = None
@dataclass
 class Info:
    "Meta-information about the API: title and version are mandatory, the remaining fields are optional."
    title: str
    version: str
    description: Optional[str] = None
    termsOfService: Optional[str] = None
    contact: Optional[Contact] = None
    license: Optional[License] = None
@dataclass
 class MediaType:
    "Describes a payload: an optional schema, a single example, and/or a name-keyed map of examples."
    schema: Optional[SchemaOrRef] = None
    example: Optional[Any] = None
    # "Example" is a forward reference; the class is defined further down in this module
    examples: Optional[Dict[str, Union["Example", ExampleRef]]] = None
@dataclass
 class RequestBody:
    "Describes the body of a request."
    # keyed by string (presumably the media type, e.g. "application/json" — TODO confirm);
    # a value is either a MediaType or a raw dictionary
    content: Dict[str, MediaType | Dict[str, Any]]
    description: Optional[str] = None
    required: Optional[bool] = None
@dataclass
 class Response:
    "Describes a response: a mandatory description and optional content."
    description: str
    # keyed by string (presumably the media type — TODO confirm)
    content: Optional[Dict[str, MediaType]] = None
 class ParameterLocation(enum.Enum):
    "Where a parameter is located; each member's value is the lower-case location name."
    Query = "query"
    Header = "header"
    Path = "path"
    Cookie = "cookie"
@dataclass
 class Parameter:
    "Describes a single operation parameter."
    name: str
    # trailing underscore because `in` is a reserved word in Python
    in_: ParameterLocation
    description: Optional[str] = None
    required: Optional[bool] = None
    schema: Optional[SchemaOrRef] = None
    example: Optional[Any] = None
@dataclass
 class Operation:
    "Describes a single API operation; only `responses` is mandatory."
    # keyed by string (presumably the HTTP status code — TODO confirm)
    responses: Dict[str, Union[Response, ResponseRef]]
    tags: Optional[List[str]] = None
    summary: Optional[str] = None
    description: Optional[str] = None
    operationId: Optional[str] = None
    parameters: Optional[List[Parameter]] = None
    requestBody: Optional[RequestBody] = None
    # "Callback" and "SecurityRequirement" are forward references to aliases defined later in this module
    callbacks: Optional[Dict[str, "Callback"]] = None
    security: Optional[List["SecurityRequirement"]] = None
    deprecated: Optional[bool] = None
@dataclass
 class PathItem:
    summary: Optional[str] = None
    description: Optional[str] = None
    get: Optional[Operation] = None
    put: Optional[Operation] = None
    post: Optional[Operation] = None
    delete: Optional[Operation] = None
    options: Optional[Operation] = None
    head: Optional[Operation] = None
    patch: Optional[Operation] = None
    trace: Optional[Operation] = None
    def update(self, other: "PathItem") -> None:
        "Merges another instance of this class into this object."
        for field in dataclasses.fields(self.__class__):
            value = getattr(other, field.name)
            if value is not None:
                setattr(self, field.name, value)
 # maps run-time expressions such as "$request.body#/url" to path items
 Callback = Dict[str, PathItem]
@dataclass
 class Example:
    "An example payload, given inline (`value`) or by URL (`externalValue`); every field is optional."
    summary: Optional[str] = None
    description: Optional[str] = None
    value: Optional[Any] = None
    externalValue: Optional[URL] = None
@dataclass
 class Server:
    "A server that hosts the API: a mandatory URL and an optional description."
    url: URL
    description: Optional[str] = None
 class SecuritySchemeType(enum.Enum):
    "The kind of a security scheme; each member's value is the type name as a string."
    ApiKey = "apiKey"
    HTTP = "http"
    OAuth2 = "oauth2"
    OpenIDConnect = "openIdConnect"
@dataclass
 class SecurityScheme:
    "Base class of the security schemes: the scheme type and a description."
    type: SecuritySchemeType
    description: str
@dataclass(init=False)
 class SecuritySchemeAPI(SecurityScheme):
    "Security scheme of type `ApiKey`, identified by a name and a location."
    name: str
    # trailing underscore because `in` is a reserved word in Python
    in_: ParameterLocation
    def __init__(self, description: str, name: str, in_: ParameterLocation) -> None:
        # the scheme type is fixed, so callers only pass the description and the scheme-specific fields
        super().__init__(SecuritySchemeType.ApiKey, description)
        self.name = name
        self.in_ = in_
@dataclass(init=False)
 class SecuritySchemeHTTP(SecurityScheme):
    "Security scheme of type `HTTP`, with a scheme name and an optional bearer format."
    scheme: str
    bearerFormat: Optional[str] = None
    def __init__(
        self, description: str, scheme: str, bearerFormat: Optional[str] = None
    ) -> None:
        # the scheme type is fixed, so callers only pass the description and the scheme-specific fields
        super().__init__(SecuritySchemeType.HTTP, description)
        self.scheme = scheme
        self.bearerFormat = bearerFormat
@dataclass(init=False)
 class SecuritySchemeOpenIDConnect(SecurityScheme):
    "Security scheme of type `OpenIDConnect`, identified by its OpenID Connect URL."
    openIdConnectUrl: str
    def __init__(self, description: str, openIdConnectUrl: str) -> None:
        # the scheme type is fixed, so callers only pass the description and the URL
        super().__init__(SecuritySchemeType.OpenIDConnect, description)
        self.openIdConnectUrl = openIdConnectUrl
@dataclass
 class Components:
    "Name-keyed collections of reusable objects; every section is optional."
    schemas: Optional[Dict[str, Schema]] = None
    responses: Optional[Dict[str, Response]] = None
    parameters: Optional[Dict[str, Parameter]] = None
    examples: Optional[Dict[str, Example]] = None
    requestBodies: Optional[Dict[str, RequestBody]] = None
    securitySchemes: Optional[Dict[str, SecurityScheme]] = None
    callbacks: Optional[Dict[str, Callback]] = None
 # a security scope is a plain string
 SecurityScope = str
 # maps a string key (presumably the security scheme name — TODO confirm) to the list of scopes it requires
 SecurityRequirement = Dict[str, List[SecurityScope]]
@dataclass
 class Tag:
    "A tag with an optional description and an optional display name."
    name: str
    description: Optional[str] = None
    displayName: Optional[str] = None
@dataclass
 class TagGroup:
    """
    A ReDoc extension to provide information about groups of tags.
    Exposed via the vendor-specific property "x-tagGroups" of the top-level object.
    """
    # name of the group
    name: str
    # the tags that belong to the group, as a list of strings
    tags: List[str]
@dataclass
 class Document:
    """
    This class is a Python dataclass adaptation of the OpenAPI Specification.
    For details, see <https://swagger.io/specification/>
    """
    # mandatory fields
    openapi: str
    info: Info
    servers: List[Server]
    paths: Dict[str, PathItem]
    # optional fields
    jsonSchemaDialect: Optional[str] = None
    components: Optional[Components] = None
    security: Optional[List[SecurityRequirement]] = None
    tags: Optional[List[Tag]] = None
    tagGroups: Optional[List[TagGroup]] = None
--- a/docs/openapi_generator/pyopenapi/template.html
+++ b/docs/openapi_generator/pyopenapi/template.html
@ -1,41 +0,0 @@
 <!DOCTYPE html>
 <html>
 <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>OpenAPI specification</title>
    <link href="https://fonts.googleapis.com/css?family=Montserrat:300,400,700|Roboto:300,400,700" rel="stylesheet">
    <!-- Stoplight Elements web component and its stylesheet, loaded from the jsDelivr CDN -->
    <script type="module" src="https://cdn.jsdelivr.net/npm/@stoplight/elements/web-components.min.js"></script>
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@stoplight/elements/styles.min.css">
    <style>
        body {
            margin: 0;
            padding: 0;
            height: 100vh;
        }
        elements-api {
            height: 100%;
        }
    </style>
 </head>
 <body>
    <elements-api id="openapi-container" router="hash" layout="sidebar" hideExport="true"
        hideInternal="true"></elements-api>
    <script>
        document.addEventListener("DOMContentLoaded", function () {
            // The placeholder object literal on the next line is replaced verbatim with the generated
            // OpenAPI JSON by write_html in utility.py; keep its text unchanged.
            const spec = { /* OPENAPI_SPECIFICATION */ };
            const element = document.getElementById("openapi-container");
            element.apiDescriptionDocument = spec;
            // use the API title as the page title when the specification provides one
            if (spec.info && spec.info.title) {
                document.title = spec.info.title;
            }
        });
    </script>
 </body>
 </html>
--- a/docs/openapi_generator/pyopenapi/utility.py
+++ b/docs/openapi_generator/pyopenapi/utility.py
@ -1,268 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import json
 import typing
 import inspect
 from pathlib import Path
 from typing import TextIO
 from typing import Any, List, Optional, Union, get_type_hints, get_origin, get_args
 from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
 from llama_stack.distribution.resolver import api_protocol_map
 from .generator import Generator
 from .options import Options
 from .specification import Document
 # directory that contains this module; "template.html" is looked up here
 THIS_DIR = Path(__file__).parent
 class Specification:
    "Generates the OpenAPI document for an endpoint class and renders it as JSON or as a stand-alone HTML page."
    document: Document
    def __init__(self, endpoint: type, options: Options):
        self.document = Generator(endpoint, options).generate()
    def get_json(self) -> StrictJsonType:
        """
        Returns the OpenAPI specification as a Python data type (e.g. `dict` for an object, `list` for an array).
        The top-level "tagGroups" property and the "displayName" property of each tag are renamed to their
        vendor-specific forms "x-tagGroups" and "x-displayName"; falsy values are dropped.
        The result can be serialized to a JSON string with `json.dump` or `json.dumps`.
        """
        json_doc = typing.cast(StrictJsonType, object_to_json(self.document))
        if not isinstance(json_doc, dict):
            return json_doc
        # rename vendor-specific properties
        tag_groups = json_doc.pop("tagGroups", None)
        if tag_groups:
            json_doc["x-tagGroups"] = tag_groups
        tags = json_doc.get("tags")
        if tags and isinstance(tags, list):
            for tag in tags:
                if isinstance(tag, dict):
                    display_name = tag.pop("displayName", None)
                    if display_name:
                        tag["x-displayName"] = display_name
        return json_doc
    def get_json_string(self, pretty_print: bool = False) -> str:
        """
        Returns the OpenAPI specification as a JSON string.
        :param pretty_print: Whether to use line indents to beautify the output.
        """
        json_doc = self.get_json()
        # indented output when pretty-printing, the most compact separators otherwise
        layout = {"indent": 4} if pretty_print else {"separators": (",", ":")}
        return json.dumps(json_doc, check_circular=False, ensure_ascii=False, **layout)
    def write_json(self, f: TextIO, pretty_print: bool = False) -> None:
        """
        Writes the OpenAPI specification to a file as a JSON string.
        :param f: The text stream to write to.
        :param pretty_print: Whether to use line indents to beautify the output.
        """
        json_doc = self.get_json()
        layout = {"indent": 4} if pretty_print else {"separators": (",", ":")}
        json.dump(json_doc, f, check_circular=False, ensure_ascii=False, **layout)
    def write_html(self, f: TextIO, pretty_print: bool = False) -> None:
        """
        Creates a stand-alone HTML page for the OpenAPI specification.
        The page is built from "template.html" next to this module, whose placeholder object literal is replaced
        with the specification as a JSON string.
        :param f: The text stream to write to.
        :param pretty_print: Whether to use line indents to beautify the JSON string in the HTML file.
        """
        template_path = THIS_DIR / "template.html"
        with template_path.open(encoding="utf-8", errors="strict") as template_file:
            template_text = template_file.read()
        spec_json = self.get_json_string(pretty_print=pretty_print)
        f.write(template_text.replace("{ /* OPENAPI_SPECIFICATION */ }", spec_json))
 def is_optional_type(type_: Any) -> bool:
    """Check if a type is Optional."""
    origin = get_origin(type_)
    args = get_args(type_)
    return origin is Optional or (origin is Union and type(None) in args)
 def _validate_api_method_return_type(method) -> str | None:
    hints = get_type_hints(method)
    if 'return' not in hints:
        return "has no return type annotation"
    return_type = hints['return']
    if is_optional_type(return_type):
        return "returns Optional type where a return value is mandatory"
 def _validate_api_method_doesnt_return_list(method) -> str | None:
    hints = get_type_hints(method)
    if 'return' not in hints:
        return "has no return type annotation"
    return_type = hints['return']
    if get_origin(return_type) is list:
        return "returns a list where a PaginatedResponse or List*Response object is expected"
 def _validate_api_delete_method_returns_none(method) -> str | None:
    hints = get_type_hints(method)
    if 'return' not in hints:
        return "has no return type annotation"
    return_type = hints['return']
    # Allow OpenAI endpoints to return response objects since they follow OpenAI specification
    method_name = getattr(method, '__name__', '')
    if method_name.startswith('openai_'):
        return None
    if return_type is not None and return_type is not type(None):
        return "does not return None where None is mandatory"
 def _validate_list_parameters_contain_data(method) -> str | None:
    hints = get_type_hints(method)
    if 'return' not in hints:
        return "has no return type annotation"
    return_type = hints['return']
    if not inspect.isclass(return_type):
        return
    if not return_type.__name__.startswith('List'):
        return
    if 'data' not in return_type.model_fields:
        return "does not have a mandatory data attribute containing the list of objects"
 def _validate_has_ellipsis(method) -> str | None:
    source = inspect.getsource(method)
    if "..." not in source and not "NotImplementedError" in source:
        return "does not contain ellipsis (...) in its implementation"
 def _validate_has_return_in_docstring(method) -> str | None:
    source = inspect.getsource(method)
    return_type = method.__annotations__.get('return')
    if return_type is not None and return_type != type(None) and ":returns:" not in source:
        return "does not have a ':returns:' in its docstring"
 def _validate_has_params_in_docstring(method) -> str | None:
    source = inspect.getsource(method)
    sig = inspect.signature(method)
    # Only check if the method has more than one parameter
    if len(sig.parameters) > 1 and ":param" not in source:
        return "does not have a ':param' in its docstring"
 def _validate_has_no_return_none_in_docstring(method) -> str | None:
    source = inspect.getsource(method)
    return_type = method.__annotations__.get('return')
    if return_type is None and ":returns: None" in source:
        return "has a ':returns: None' in its docstring which is redundant for None-returning functions"
 def _validate_docstring_lines_end_with_dot(method) -> str | None:
    docstring = inspect.getdoc(method)
    if docstring is None:
        return None
    lines = docstring.split('\n')
    for line in lines:
        line = line.strip()
        if line and not any(line.endswith(char) for char in '.:{}[]()",'):
            return f"docstring line '{line}' does not end with a valid character: . : {{ }} [ ] ( ) , \""
 # Validators to run per HTTP method. Each validator takes a method and returns a complaint string,
 # or None when the method passes.
 # NOTE(review): unlike "GET" and "POST", the "DELETE" list does not include
 # _validate_docstring_lines_end_with_dot — confirm whether that is intentional.
 _VALIDATORS = {
    "GET": [
        _validate_api_method_return_type,
        _validate_list_parameters_contain_data,
        _validate_api_method_doesnt_return_list,
        _validate_has_ellipsis,
        _validate_has_return_in_docstring,
        _validate_has_params_in_docstring,
        _validate_docstring_lines_end_with_dot,
    ],
    "DELETE": [
        _validate_api_delete_method_returns_none,
        _validate_has_ellipsis,
        _validate_has_return_in_docstring,
        _validate_has_params_in_docstring,
        _validate_has_no_return_none_in_docstring
    ],
    "POST": [
        _validate_has_ellipsis,
        _validate_has_return_in_docstring,
        _validate_has_params_in_docstring,
        _validate_has_no_return_none_in_docstring,
        _validate_docstring_lines_end_with_dot,
    ],
 }
 def _get_methods_by_type(protocol, method_type: str):
    members = inspect.getmembers(protocol, predicate=inspect.isfunction)
    return {
        method_name: method
        for method_name, method in members
        if (webmethod := getattr(method, '__webmethod__', None))
        if webmethod and webmethod.method == method_type
    }
 def validate_api() -> List[str]:
    """
    Validate the API protocols.
    Every validator registered for an HTTP method is run against every protocol method bound to that HTTP method.
    :return: One message of the form "Method <protocol>.<method> <complaint>" per failed check; empty if all checks pass.
    """
    problems = []
    protocol_map = api_protocol_map()
    for http_verb, checks in _VALIDATORS.items():
        for api_name, api_protocol in protocol_map.items():
            for check in checks:
                candidates = _get_methods_by_type(api_protocol, http_verb)
                for method_name, method in candidates.items():
                    complaint = check(method)
                    if not complaint:
                        continue
                    problems.append(f"Method {api_name}.{method_name} {complaint}")
    return problems
--- a/docs/openapi_generator/run_openapi_generator.sh
+++ b/docs/openapi_generator/run_openapi_generator.sh
@ -1,32 +0,0 @@
 #!/bin/bash
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 # Default PYTHONPATH to empty so the later expansion is safe under `set -u`.
 PYTHONPATH=${PYTHONPATH:-}
 # Absolute directory containing this script (symlinks resolved).
 THIS_DIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
 set -euo pipefail
 # Names of packages that `pip show` could not find.
 missing_packages=()
 # Append package "$1" to missing_packages when `pip show` fails for it.
 check_package() {
    if ! pip show "$1" &>/dev/null; then
        missing_packages+=("$1")
    fi
 }
 # NOTE(review): check_package is never called in this script, so
 # missing_packages is always empty at this point -- confirm whether the
 # package checks were removed on purpose.
 if [ ${#missing_packages[@]} -ne 0 ]; then
    echo "Error: The following package(s) are not installed:"
    printf " - %s\n" "${missing_packages[@]}"
    echo "Please install them using:"
    echo "pip install ${missing_packages[*]}"
    exit 1
 fi
 # Two directory levels above THIS_DIR; added to PYTHONPATH so that
 # `docs.openapi_generator.generate` is importable as a module.
 # Path expansions are quoted so directories containing spaces are not
 # word-split.
 stack_dir=$(dirname "$(dirname "$THIS_DIR")")
 PYTHONPATH="$PYTHONPATH:$stack_dir" \
  python -m docs.openapi_generator.generate "$(dirname "$THIS_DIR")/_static"
--- a/docs/readme.md
+++ b/docs/readme.md
@ -1,19 +0,0 @@
 # Llama Stack Documentation
 Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [ReadTheDocs page](https://llama-stack.readthedocs.io/en/latest/index.html).
 ## Render locally
 From the llama-stack root directory, run the following command to render the docs locally:
 ```bash
 uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
 ```
 You can open up the docs in your browser at http://localhost:8000
 ## Content
 Try out Llama Stack's capabilities through our detailed Jupyter notebooks:
 * [Building AI Applications Notebook](./getting_started.ipynb) - A comprehensive guide to building production-ready AI applications using Llama Stack
 * [Benchmark Evaluations Notebook](./notebooks/Llama_Stack_Benchmark_Evals.ipynb) - Detailed performance evaluations and benchmarking results
 * [Zero-to-Hero Guide](./zero_to_hero_guide) - Step-by-step guide for getting started with Llama Stack
--- a/docs/resources/agentic-system.png
+++ b/docs/resources/agentic-system.png
--- a/docs/resources/list-templates.png
+++ b/docs/resources/list-templates.png
--- a/docs/resources/llama-stack.png
+++ b/docs/resources/llama-stack.png
--- a/docs/resources/model-lifecycle.png
+++ b/docs/resources/model-lifecycle.png
--- a/docs/resources/prompt-format.png
+++ b/docs/resources/prompt-format.png
--- a/docs/source/building_applications/agent.md
+++ b/docs/source/building_applications/agent.md
@ -1,92 +0,0 @@
 # Agents
 An Agent in Llama Stack is a powerful abstraction that allows you to build complex AI applications.
 The Llama Stack agent framework is built on a modular architecture that allows for flexible and powerful AI
 applications. This document explains the key components and how they work together.
 ## Core Concepts
 ### 1. Agent Configuration
 Agents are configured using the `AgentConfig` class, which includes:
 - **Model**: The underlying LLM to power the agent
 - **Instructions**: System prompt that defines the agent's behavior
 - **Tools**: Capabilities the agent can use to interact with external systems
 - **Safety Shields**: Guardrails to ensure responsible AI behavior
 ```python
 from llama_stack_client import Agent
 # Create the agent
 agent = Agent(
    llama_stack_client,
    model="meta-llama/Llama-3-70b-chat",
    instructions="You are a helpful assistant that can use tools to answer questions.",
    tools=["builtin::code_interpreter", "builtin::rag/knowledge_search"],
 )
 ```
 ### 2. Sessions
 Agents maintain state through sessions, which represent a conversation thread:
 ```python
 # Create a session
 session_id = agent.create_session(session_name="My conversation")
 ```
 ### 3. Turns
 Each interaction with an agent is called a "turn" and consists of:
 - **Input Messages**: What the user sends to the agent
 - **Steps**: The agent's internal processing (inference, tool execution, etc.)
 - **Output Message**: The agent's response
 ```python
 from llama_stack_client import AgentEventLogger
 # Create a turn with streaming response
 turn_response = agent.create_turn(
    session_id=session_id,
    messages=[{"role": "user", "content": "Tell me about Llama models"}],
 )
 for log in AgentEventLogger().log(turn_response):
    log.print()
 ```
 ###  Non-Streaming
 ```python
 from rich.pretty import pprint
 # Non-streaming API
 response = agent.create_turn(
    session_id=session_id,
    messages=[{"role": "user", "content": "Tell me about Llama models"}],
    stream=False,
 )
 print("Inputs:")
 pprint(response.input_messages)
 print("Output:")
 pprint(response.output_message.content)
 print("Steps:")
 pprint(response.steps)
 ```
 ### 4. Steps
 Each turn consists of multiple steps that represent the agent's thought process:
 - **Inference Steps**: The agent generating text responses
 - **Tool Execution Steps**: The agent using tools to gather information
 - **Shield Call Steps**: Safety checks being performed
 ## Agent Execution Loop
 Refer to the [Agent Execution Loop](agent_execution_loop) for more details on what happens within an agent turn.
--- a/docs/source/building_applications/agent_execution_loop.md
+++ b/docs/source/building_applications/agent_execution_loop.md
@ -1,139 +0,0 @@
 ## Agent Execution Loop
 Agents are the heart of Llama Stack applications. They combine inference, memory, safety, and tool usage into coherent
 workflows. At its core, an agent follows a sophisticated execution loop that enables multi-step reasoning, tool usage,
 and safety checks.
 ### Steps in the Agent Workflow
 Each agent turn follows these key steps:
 1. **Initial Safety Check**: The user's input is first screened through configured safety shields
 2. **Context Retrieval**:
   - If RAG is enabled, the agent can choose to query relevant documents from memory banks. You can use the `instructions` field to steer the agent.
   - For new documents, they are first inserted into the memory bank.
   - Retrieved context is provided to the LLM as a tool response in the message history.
 3. **Inference Loop**: The agent enters its main execution loop:
   - The LLM receives a user prompt (with previous tool outputs)
   - The LLM generates a response, potentially with [tool calls](tools)
   - If tool calls are present:
     - Tool inputs are safety-checked
     - Tools are executed (e.g., web search, code execution)
     - Tool responses are fed back to the LLM for synthesis
   - The loop continues until:
     - The LLM provides a final response without tool calls
     - Maximum iterations are reached
     - Token limit is exceeded
 4. **Final Safety Check**: The agent's final response is screened through safety shields
 ```{mermaid}
 sequenceDiagram
    participant U as User
    participant E as Executor
    participant M as Memory Bank
    participant L as LLM
    participant T as Tools
    participant S as Safety Shield
    Note over U,S: Agent Turn Start
    U->>S: 1. Submit Prompt
    activate S
    S->>E: Input Safety Check
    deactivate S
    loop Inference Loop
        E->>L: 2.1 Augment with Context
        L-->>E: 2.2 Response (with/without tool calls)
        alt Has Tool Calls
            E->>S: Check Tool Input
            S->>T: 3.1 Execute Tool
            T-->>E: 3.2 Tool Response
            E->>L: 4.1 Tool Response
            L-->>E: 4.2 Synthesized Response
        end
        opt Stop Conditions
            Note over E: Break if:
            Note over E: - No tool calls
            Note over E: - Max iterations reached
            Note over E: - Token limit exceeded
        end
    end
    E->>S: Output Safety Check
    S->>U: 5. Final Response
 ```
 Each step in this process can be monitored and controlled through configurations.
 ### Agent Execution Loop Example
 Here's an example that demonstrates monitoring the agent's execution:
 ```python
 from llama_stack_client import LlamaStackClient, Agent, AgentEventLogger
 from rich.pretty import pprint
 # Replace host and port
 client = LlamaStackClient(base_url=f"http://{HOST}:{PORT}")
 agent = Agent(
    client,
    # Check with `llama-stack-client models list`
    model="Llama3.2-3B-Instruct",
    instructions="You are a helpful assistant",
    # Enable both RAG and tool usage
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {"vector_db_ids": ["my_docs"]},
        },
        "builtin::code_interpreter",
    ],
    # Configure safety (optional)
    input_shields=["llama_guard"],
    output_shields=["llama_guard"],
    # Control the inference loop
    max_infer_iters=5,
    sampling_params={
        "strategy": {"type": "top_p", "temperature": 0.7, "top_p": 0.95},
        "max_tokens": 2048,
    },
 )
 session_id = agent.create_session("monitored_session")
 # Stream the agent's execution steps
 response = agent.create_turn(
    messages=[{"role": "user", "content": "Analyze this code and run it"}],
    documents=[
        {
            "content": "https://raw.githubusercontent.com/example/code.py",
            "mime_type": "text/plain",
        }
    ],
    session_id=session_id,
 )
 # Monitor each step of execution
 for log in AgentEventLogger().log(response):
    log.print()
 # Using non-streaming API, the response contains input, steps, and output.
 response = agent.create_turn(
    messages=[{"role": "user", "content": "Analyze this code and run it"}],
    documents=[
        {
            "content": "https://raw.githubusercontent.com/example/code.py",
            "mime_type": "text/plain",
        }
    ],
    session_id=session_id,
    stream=False,
 )
 pprint(f"Input: {response.input_messages}")
 pprint(f"Output: {response.output_message.content}")
 pprint(f"Steps: {response.steps}")
 ```
--- a/docs/source/building_applications/evals.md
+++ b/docs/source/building_applications/evals.md
@ -1,125 +0,0 @@
 # Evaluations
 Llama Stack provides a set of APIs for running evaluations of LLM applications.
 - `/datasetio` + `/datasets` API
 - `/scoring` + `/scoring_functions` API
 - `/eval` + `/benchmarks` API
 This guide walks you through the process of evaluating an LLM application built using Llama Stack. Check out the [Evaluation Reference](../references/evals_reference/index.md) guide, which goes over the sets of APIs and the developer experience flow of using Llama Stack to run evaluations for benchmark and application use cases. Check out our Colab notebook with working examples of evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
 ## Application Evaluation
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)
 Llama Stack offers a library of scoring functions and the `/scoring` API, allowing you to run evaluations on your pre-annotated AI application datasets.
 In this example, we will show you how to:
 1. Build an Agent with Llama Stack
 2. Query the agent's sessions, turns, and steps
 3. Evaluate the results.
 ##### Building a Search Agent
 ```python
 from llama_stack_client import LlamaStackClient, Agent, AgentEventLogger
 client = LlamaStackClient(base_url=f"http://{HOST}:{PORT}")
 agent = Agent(
    client,
    model="meta-llama/Llama-3.3-70B-Instruct",
    instructions="You are a helpful assistant. Use search tool to answer the questions. ",
    tools=["builtin::websearch"],
 )
 user_prompts = [
    "Which teams played in the NBA Western Conference Finals of 2024. Search the web for the answer.",
    "In which episode and season of South Park does Bill Cosby (BSM-471) first appear? Give me the number and title. Search the web for the answer.",
    "What is the British-American kickboxer Andrew Tate's kickboxing name? Search the web for the answer.",
 ]
 session_id = agent.create_session("test-session")
 for prompt in user_prompts:
    response = agent.create_turn(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        session_id=session_id,
    )
    for log in AgentEventLogger().log(response):
        log.print()
 ```
 ##### Query Agent Execution Steps
 Now, let's look deeper into the agent's execution steps and see how well our agent performs.
 ```python
 # query the agents session
 from rich.pretty import pprint
 session_response = client.agents.session.retrieve(
    session_id=session_id,
    agent_id=agent.agent_id,
 )
 pprint(session_response)
 ```
 As a sanity check, we will first check whether every user prompt is followed by a tool call to `brave_search`.
 ```python
 num_tool_call = 0
 for turn in session_response.turns:
    for step in turn.steps:
        if (
            step.step_type == "tool_execution"
            and step.tool_calls[0].tool_name == "brave_search"
        ):
            num_tool_call += 1
 print(
    f"{num_tool_call}/{len(session_response.turns)} user prompts are followed by a tool call to `brave_search`"
 )
 ```
 ##### Evaluate Agent Responses
 Now, we want to evaluate the agent's responses to the user prompts.
 1. First, we will process the agent's execution history into a list of rows that can be used for evaluation.
 2. Next, we will label the rows with the expected answer.
 3. Finally, we will use the `/scoring` API to score the agent's responses.
 ```python
 eval_rows = []
 expected_answers = [
    "Dallas Mavericks and the Minnesota Timberwolves",
    "Season 4, Episode 12",
    "King Cobra",
 ]
 for i, turn in enumerate(session_response.turns):
    eval_rows.append(
        {
            "input_query": turn.input_messages[0].content,
            "generated_answer": turn.output_message.content,
            "expected_answer": expected_answers[i],
        }
    )
 pprint(eval_rows)
 scoring_params = {
    "basic::subset_of": None,
 }
 scoring_response = client.scoring.score(
    input_rows=eval_rows, scoring_functions=scoring_params
 )
 pprint(scoring_response)
 ```
--- a/docs/source/building_applications/index.md
+++ b/docs/source/building_applications/index.md
@ -1,30 +0,0 @@
 # Building AI Applications (Examples)
 Llama Stack provides all the building blocks needed to create sophisticated AI applications.
 The best way to get started is to look at this notebook which walks through the various APIs (from basic inference, to RAG agents) and how to use them.
 **Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)
 Here are some key topics that will help you build effective agents:
 - **[RAG (Retrieval-Augmented Generation)](rag)**: Learn how to enhance your agents with external knowledge through retrieval mechanisms.
 - **[Agent](agent)**: Understand the components and design patterns of the Llama Stack agent framework.
 - **[Agent Execution Loop](agent_execution_loop)**: Understand how agents process information, make decisions, and execute actions in a continuous loop.
 - **[Tools](tools)**: Extend your agents' capabilities by integrating with external tools and APIs.
 - **[Evals](evals)**: Evaluate your agents' effectiveness and identify areas for improvement.
 - **[Telemetry](telemetry)**: Monitor and analyze your agents' performance and behavior.
 - **[Safety](safety)**: Implement guardrails and safety measures to ensure responsible AI behavior.
 ```{toctree}
 :hidden:
 :maxdepth: 1
 rag
 agent
 agent_execution_loop
 tools
 evals
 telemetry
 safety
 ```
--- a/docs/source/building_applications/rag.md
+++ b/docs/source/building_applications/rag.md
@ -1,259 +0,0 @@
 ## Retrieval Augmented Generation (RAG)
 RAG enables your applications to reference and recall information from previous interactions or external documents.
 Llama Stack organizes the APIs that enable RAG into three layers:
 1. The lowermost APIs deal with raw storage and retrieval. These include Vector IO, KeyValue IO (coming soon) and Relational IO (also coming soon).
 2. The next is the "Rag Tool", a first-class tool as part of the [Tools API](tools.md) that allows you to ingest documents (from URLs, files, etc) with various chunking strategies and query them smartly.
 3. Finally, it all comes together with the top-level ["Agents" API](agent.md) that allows you to create agents that can use the tools to answer questions, perform tasks, and more.
 <img src="rag.png" alt="RAG System" width="50%">
 The RAG system uses lower-level storage for different types of data:
 * **Vector IO**: For semantic search and retrieval
 * **Key-Value and Relational IO**: For structured data storage
 We may add more storage types like Graph IO in the future.
 ### Setting up Vector DBs
 For this guide, we will use [Ollama](https://ollama.com/) as the inference provider.
 Ollama is an LLM runtime that allows you to run Llama models locally.
 Here's how to set up a vector database for RAG:
 ```python
 # Create http client
 import os
 from llama_stack_client import LlamaStackClient
 client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
 # Register a vector db
 vector_db_id = "my_documents"
 response = client.vector_dbs.register(
    vector_db_id=vector_db_id,
    embedding_model="all-MiniLM-L6-v2",
    embedding_dimension=384,
    provider_id="faiss",
 )
 ```
 ### Ingesting Documents
 You can ingest documents into the vector database using two methods: directly inserting pre-chunked
 documents or using the RAG Tool.
 ```python
 # You can insert a pre-chunked document directly into the vector db
 chunks = [
    {
        "content": "Your document text here",
        "mime_type": "text/plain",
        "metadata": {
            "document_id": "doc1",
            "author": "Jane Doe",
        },
    },
 ]
 client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
 ```
 #### Using Precomputed Embeddings
 If you decide to precompute embeddings for your documents, you can insert them directly into the vector database by
 including the embedding vectors in the chunk data. This is useful if you have a separate embedding service or if you
 want to customize the ingestion process.
 ```python
 chunks_with_embeddings = [
    {
        "content": "First chunk of text",
        "mime_type": "text/plain",
        "embedding": [0.1, 0.2, 0.3, ...],  # Your precomputed embedding vector
        "metadata": {"document_id": "doc1", "section": "introduction"},
    },
    {
        "content": "Second chunk of text",
        "mime_type": "text/plain",
        "embedding": [0.2, 0.3, 0.4, ...],  # Your precomputed embedding vector
        "metadata": {"document_id": "doc1", "section": "methodology"},
    },
 ]
 client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks_with_embeddings)
 ```
 When providing precomputed embeddings, ensure the embedding dimension matches the embedding_dimension specified when
 registering the vector database.
 ### Retrieval
 You can query the vector database to retrieve documents based on their embeddings.
 ```python
 # You can then query for these chunks
 chunks_response = client.vector_io.query(
    vector_db_id=vector_db_id, query="What do you know about..."
 )
 ```
 ### Using the RAG Tool
 A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc.
 and automatically chunks them into smaller pieces. More examples for how to format a RAGDocument can be found in the
 [appendix](#more-ragdocument-examples).
 ```python
 from llama_stack_client import RAGDocument
 urls = ["memory_optimizations.rst", "chat.rst", "llama3.rst"]
 documents = [
    RAGDocument(
        document_id=f"num-{i}",
        content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
        mime_type="text/plain",
        metadata={},
    )
    for i, url in enumerate(urls)
 ]
 client.tool_runtime.rag_tool.insert(
    documents=documents,
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=512,
 )
 # Query documents
 results = client.tool_runtime.rag_tool.query(
    vector_db_ids=[vector_db_id],
    content="What do you know about...",
 )
 ```
 You can configure how the RAG tool adds metadata to the context if you find it useful for your application. Simply add:
 ```python
 # Query documents
 results = client.tool_runtime.rag_tool.query(
    vector_db_ids=[vector_db_id],
    content="What do you know about...",
    query_config={
        "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
    },
 )
 ```
 ### Building RAG-Enhanced Agents
 One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
 ```python
 from llama_stack_client import Agent
 # Create agent with memory
 agent = Agent(
    client,
    model="meta-llama/Llama-3.3-70B-Instruct",
    instructions="You are a helpful assistant",
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {
                "vector_db_ids": [vector_db_id],
                # Defaults
                "query_config": {
                    "chunk_size_in_tokens": 512,
                    "chunk_overlap_in_tokens": 0,
                    "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
                },
            },
        }
    ],
 )
 session_id = agent.create_session("rag_session")
 # Ask questions about documents in the vector db, and the agent will query the db to answer the question.
 response = agent.create_turn(
    messages=[{"role": "user", "content": "How to optimize memory in PyTorch?"}],
    session_id=session_id,
 )
 ```
 > **NOTE:** the `instructions` field in the `AgentConfig` can be used to guide the agent's behavior. It is important to experiment with different instructions to see what works best for your use case.
 You can also pass documents along with the user's message and ask questions about them.
 ```python
 # Initial document ingestion
 response = agent.create_turn(
    messages=[
        {"role": "user", "content": "I am providing some documents for reference."}
    ],
    documents=[
        {
            "content": "https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/memory_optimizations.rst",
            "mime_type": "text/plain",
        }
    ],
    session_id=session_id,
 )
 # Query with RAG
 response = agent.create_turn(
    messages=[{"role": "user", "content": "What are the key topics in the documents?"}],
    session_id=session_id,
 )
 ```
 You can print the response with the code below.
 ```python
 from llama_stack_client import AgentEventLogger
 for log in AgentEventLogger().log(response):
    log.print()
 ```
 ### Unregistering Vector DBs
 If you need to clean up and unregister vector databases, you can do so as follows:
 ```python
 # Unregister a specified vector database
 vector_db_id = "my_vector_db_id"
 print(f"Unregistering vector database: {vector_db_id}")
 client.vector_dbs.unregister(vector_db_id=vector_db_id)
 # Unregister all vector databases
 for vector_db_id in client.vector_dbs.list():
    print(f"Unregistering vector database: {vector_db_id.identifier}")
    client.vector_dbs.unregister(vector_db_id=vector_db_id.identifier)
 ```
 ### Appendix
 #### More RAGDocument Examples
 ```python
 from llama_stack_client import RAGDocument
 import base64
 import requests
 RAGDocument(document_id="num-0", content={"uri": "file://path/to/file"})
 RAGDocument(document_id="num-1", content="plain text")
 RAGDocument(
    document_id="num-2",
    content={
        "type": "text",
        "text": "plain text input",
    },  # for inputs that should be treated as text explicitly
 )
 RAGDocument(
    document_id="num-3",
    content={
        "type": "image",
        "image": {"url": {"uri": "https://mywebsite.com/image.jpg"}},
    },
 )
 B64_ENCODED_IMAGE = base64.b64encode(
    requests.get(
        "https://raw.githubusercontent.com/meta-llama/llama-stack/refs/heads/main/docs/_static/llama-stack.png"
    ).content
 )
 RAGDocument(
    document_id="num-4",
    content={"type": "image", "image": {"data": B64_ENCODED_IMAGE}},
 )
 ```
 For more strongly typed interaction, use the typed dicts found [here](https://github.com/meta-llama/llama-stack-client-python/blob/38cd91c9e396f2be0bec1ee96a19771582ba6f17/src/llama_stack_client/types/shared_params/document.py).
--- a/docs/source/building_applications/rag.png
+++ b/docs/source/building_applications/rag.png
--- a/docs/source/building_applications/safety.md
+++ b/docs/source/building_applications/safety.md
@ -1,17 +0,0 @@
 ## Safety Guardrails
 Safety is a critical component of any AI application. Llama Stack provides a Shield system that can be applied at multiple touchpoints:
 ```python
 # Register a safety shield
 shield_id = "content_safety"
 client.shields.register(shield_id=shield_id, provider_shield_id="llama-guard-basic")
 # Run content through shield
 response = client.safety.run_shield(
    shield_id=shield_id, messages=[{"role": "user", "content": "User message here"}]
 )
 if response.violation:
    print(f"Safety violation detected: {response.violation.user_message}")
 ```
--- a/docs/source/building_applications/telemetry.md
+++ b/docs/source/building_applications/telemetry.md
@ -1,71 +0,0 @@
 ## Telemetry
 The Llama Stack telemetry system provides comprehensive tracing, metrics, and logging capabilities. It supports multiple sink types including OpenTelemetry, SQLite, and Console output.
 ### Events
 The telemetry system supports three main types of events:
 - **Unstructured Log Events**: Free-form log messages with severity levels
 ```python
 unstructured_log_event = UnstructuredLogEvent(
    message="This is a log message", severity=LogSeverity.INFO
 )
 ```
 - **Metric Events**: Numerical measurements with units
 ```python
 metric_event = MetricEvent(metric="my_metric", value=10, unit="count")
 ```
 - **Structured Log Events**: System events like span start/end. Extensible to add more structured log types.
 ```python
 structured_log_event = SpanStartPayload(name="my_span", parent_span_id="parent_span_id")
 ```
 ### Spans and Traces
 - **Spans**: Represent operations with timing and hierarchical relationships
 - **Traces**: Collection of related spans forming a complete request flow
 ### Sinks
 - **OpenTelemetry**: Send events to an OpenTelemetry Collector. This is useful for visualizing traces in a tool like Jaeger.
 - **SQLite**: Store events in a local SQLite database. This is needed if you want to query the events later through the Llama Stack API.
 - **Console**: Print events to the console.
 ### Providers
 #### Meta-Reference Provider
 Currently, only the meta-reference provider is implemented. It can be configured to send events to three sink types:
 1) OpenTelemetry Collector
 2) SQLite
 3) Console
 #### Configuration
 Here's an example that sends telemetry signals to all three sink types. Your configuration might use only one.
 ```yaml
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      sinks: ['console', 'sqlite', 'otel_trace', 'otel_metric']
      otel_trace_endpoint: "http://localhost:4318/v1/traces"
      otel_metric_endpoint: "http://localhost:4318/v1/metrics"
      sqlite_db_path: "/path/to/telemetry.db"
 ```
 ### Jaeger to visualize traces
 The `otel` sink works with any service compatible with the OpenTelemetry collector. Traces and metrics have two separate endpoints.
 Let's use Jaeger to visualize this data.
 Start a Jaeger instance with the OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686 using the following command:
 ```bash
 $ docker run --pull always --rm --name jaeger \
  -p 16686:16686 -p 4318:4318 \
  jaegertracing/jaeger:2.1.0
 ```
 Once the Jaeger instance is running, you can visualize traces by navigating to http://localhost:16686/.
 ### Querying Traces Stored in SQLite
 The `sqlite` sink allows you to query traces without an external system. Here are some example queries. Refer to the notebook at [Llama Stack Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) for more examples on how to query traces and spans.
--- a/docs/source/building_applications/tools.md
+++ b/docs/source/building_applications/tools.md
@ -1,262 +0,0 @@
 # Tools
 Tools are functions that can be invoked by an agent to perform tasks. They are organized into tool groups and registered with specific providers. Each tool group represents a collection of related tools from a single provider. They are organized into groups so that state can be externalized: the collection operates on the same state typically.
 An example of this would be a "db_access" tool group that contains tools for interacting with a database. "list_tables", "query_table", "insert_row" could be examples of tools in this group.
 Tools are treated as any other resource in llama stack like models. You can register them, have providers for them etc.
 When instantiating an agent, you can provide it a list of tool groups that it has access to. Agent gets the corresponding tool definitions for the specified tool groups and passes them along to the model.
 Refer to the [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) notebook for more examples on how to use tools.
 ## Server-side vs. client-side tool execution
 Llama Stack allows you to use both server-side and client-side tools. With server-side tools, `agent.create_turn` can perform execution of the tool calls emitted by the model
 transparently giving the user the final answer desired. If client-side tools are provided, the tool call is sent back to the user for execution
 and optional continuation using the `agent.resume_turn` method.
 ### Server-side tools
 Llama Stack provides built-in providers for some common tools. These include web search, math, and RAG capabilities.
 #### Web Search
 You have three providers to execute the web search tool calls generated by a model: Brave Search, Bing Search, and Tavily Search.
 To indicate that the web search tool calls should be executed by brave-search, you can point the "builtin::websearch" toolgroup to the "brave-search" provider.
 ```python
 client.toolgroups.register(
    toolgroup_id="builtin::websearch",
    provider_id="brave-search",
    args={"max_results": 5},
 )
 ```
 The tool requires an API key which can be provided either in the configuration or through the request header `X-LlamaStack-Provider-Data`. The format of the header is:
 ```
 {"<provider_name>_api_key": <your api key>}
 ```
 #### Math
 The WolframAlpha tool provides access to computational knowledge through the WolframAlpha API.
 ```python
 client.toolgroups.register(
    toolgroup_id="builtin::wolfram_alpha", provider_id="wolfram-alpha"
 )
 ```
 Example usage:
 ```python
 result = client.tool_runtime.invoke_tool(
    tool_name="wolfram_alpha", kwargs={"query": "solve x^2 + 2x + 1 = 0"}
 )
 ```
 #### RAG
 The RAG tool enables retrieval of context from various types of memory banks (vector, key-value, keyword, and graph).
 ```python
 # Register Memory tool group
 client.toolgroups.register(
    toolgroup_id="builtin::rag",
    provider_id="faiss",
    args={"max_chunks": 5, "max_tokens_in_context": 4096},
 )
 ```
 Features:
 - Support for multiple memory bank types
 - Configurable query generation
 - Context retrieval with token limits
 > **Note:** By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.
 ## Model Context Protocol (MCP)
 [MCP](https://github.com/modelcontextprotocol) is an upcoming, popular standard for tool discovery and execution. It is a protocol that allows tools to be dynamically discovered
 from an MCP endpoint and can be used to extend the agent's capabilities.
 ### Using Remote MCP Servers
 You can find some popular remote MCP servers [here](https://github.com/jaw9c/awesome-remote-mcp-servers). You can register them as toolgroups in the same way as local providers.
 ```python
 client.toolgroups.register(
    toolgroup_id="mcp::deepwiki",
    provider_id="model-context-protocol",
    mcp_endpoint=URL(uri="https://mcp.deepwiki.com/sse"),
 )
 ```
 Note that most of the more useful MCP servers need you to authenticate with them. Many of them use OAuth2.0 for authentication. You can provide authorization headers to send to the MCP server
 using the "Provider Data" abstraction provided by Llama Stack. When making an agent call,
 ```python
 agent = Agent(
    ...,
    tools=["mcp::deepwiki"],
    extra_headers={
        "X-LlamaStack-Provider-Data": json.dumps(
            {
                "mcp_headers": {
                    "https://mcp.deepwiki.com/sse": {
                        "Authorization": "Bearer <your_access_token>",
                    },
                },
            }
        ),
    },
 )
 agent.create_turn(...)
 ```
 ### Running your own MCP server
 Here's an example of how to run a simple MCP server that exposes a File System as a set of tools to the Llama Stack agent.
 ```shell
 # start your MCP server
 mkdir /tmp/content
 touch /tmp/content/foo
 touch /tmp/content/bar
 npx -y supergateway --port 8000 --stdio 'npx -y @modelcontextprotocol/server-filesystem /tmp/content'
 ```
 Then register the MCP server as a tool group,
 ```python
 client.toolgroups.register(
    toolgroup_id="mcp::filesystem",
    provider_id="model-context-protocol",
    mcp_endpoint=URL(uri="http://localhost:8000/sse"),
 )
 ```
 ## Adding Custom (Client-side) Tools
 When you want to use tools other than the built-in tools, you just need to implement a python function with a docstring. The content of the docstring will be used to describe the tool and the parameters and passed
 along to the generative model.
 ```python
 # Example tool definition
 def my_tool(input: int) -> int:
    """
    Runs my awesome tool.
    :param input: some int parameter
    """
    return input * 2
 ```
 > **NOTE:** We employ python docstrings to describe the tool and the parameters. It is important to document the tool and the parameters so that the model can use the tool correctly. It is recommended to experiment with different docstrings to see how they affect the model's behavior.
 Once defined, simply pass the tool to the agent config. `Agent` will take care of the rest (calling the model with the tool definition, executing the tool, and returning the result to the model for the next iteration).
 ```python
 # Example agent config with client provided tools
 agent = Agent(client, ..., tools=[my_tool])
 ```
 Refer to [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/blob/main/examples/agents/e2e_loop_with_client_tools.py) for an example of how to use client provided tools.
 ## Tool Invocation
 Tools can be invoked using the `invoke_tool` method:
 ```python
 result = client.tool_runtime.invoke_tool(
    tool_name="web_search", kwargs={"query": "What is the capital of France?"}
 )
 ```
 The result contains:
 - `content`: The tool's output
 - `error_message`: Optional error message if the tool failed
 - `error_code`: Optional error code if the tool failed
 ## Listing Available Tools
 You can list all available tools or filter by tool group:
 ```python
 # List all tools
 all_tools = client.tools.list_tools()
 # List tools in a specific group
 group_tools = client.tools.list_tools(toolgroup_id="search_tools")
 ```
 ## Simple Example 2: Using an Agent with the Web Search Tool
 1. Start by registering a Tavily API key at [Tavily](https://tavily.com/).
 2. [Optional] Provide the API key directly to the Llama Stack server
 ```bash
 export TAVILY_SEARCH_API_KEY="your key"
 ```
 ```bash
 --env TAVILY_SEARCH_API_KEY=${TAVILY_SEARCH_API_KEY}
 ```
 3. Run the following script.
 ```python
 from llama_stack_client.lib.agents.agent import Agent
 from llama_stack_client.types.agent_create_params import AgentConfig
 from llama_stack_client.lib.agents.event_logger import EventLogger
 from llama_stack_client import LlamaStackClient
 client = LlamaStackClient(
    base_url="http://localhost:8321",
    provider_data={
        "tavily_search_api_key": "your_TAVILY_SEARCH_API_KEY"
    },  # Set this from the client side. No need to provide it if it has already been configured on the Llama Stack server.
 )
 agent = Agent(
    client,
    model="meta-llama/Llama-3.2-3B-Instruct",
    instructions=(
        "You are a web search assistant, must use websearch tool to look up the most current and precise information available. "
    ),
    tools=["builtin::websearch"],
 )
 session_id = agent.create_session("websearch-session")
 response = agent.create_turn(
    messages=[
        {"role": "user", "content": "How did the USA perform in the last Olympics?"}
    ],
    session_id=session_id,
 )
 for log in EventLogger().log(response):
    log.print()
 ```
 ## Simple Example 3: Using an Agent with the WolframAlpha Tool
 1. Start by registering for a WolframAlpha API key at [WolframAlpha Developer Portal](https://developer.wolframalpha.com/access).
 2. Provide the API key either when starting the Llama Stack server:
    ```bash
    --env WOLFRAM_ALPHA_API_KEY=${WOLFRAM_ALPHA_API_KEY}
    ```
    or from the client side:
    ```python
    client = LlamaStackClient(
        base_url="http://localhost:8321",
        provider_data={"wolfram_alpha_api_key": wolfram_api_key},
    )
    ```
 3. Configure the tools in the Agent by setting `tools=["builtin::wolfram_alpha"]`.
 4. Example user query:
    ```python
    response = agent.create_turn(
        messages=[{"role": "user", "content": "Solve x^2 + 2x + 1 = 0 using WolframAlpha"}],
        session_id=session_id,
    )
    ```
--- a/docs/source/concepts/api_providers.md
+++ b/docs/source/concepts/api_providers.md
@ -1,12 +0,0 @@
 ## API Providers
 The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
 - LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.),
 - Vector databases (e.g., ChromaDB, Weaviate, Qdrant, Milvus, FAISS, PGVector, etc.),
 - Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.)
 Providers come in two flavors:
 - **Remote**: the provider runs as a separate service external to the Llama Stack codebase. Llama Stack contains a small amount of adapter code.
 - **Inline**: the provider is fully specified and implemented within the Llama Stack codebase. It may be a simple wrapper around an existing library, or a full fledged implementation within Llama Stack.
 Most importantly, Llama Stack always strives to provide at least one fully inline provider for each API so you can iterate on a fully featured environment locally.
--- a/docs/source/concepts/apis.md
+++ b/docs/source/concepts/apis.md
@ -1,18 +0,0 @@
 ## APIs
 A Llama Stack API is described as a collection of REST endpoints. We currently support the following APIs:
 - **Inference**: run inference with a LLM
 - **Safety**: apply safety policies to the output at a Systems (not only model) level
 - **Agents**: run multi-step agentic workflows with LLMs with tool usage, memory (RAG), etc.
 - **DatasetIO**: interface with datasets and data loaders
 - **Scoring**: evaluate outputs of the system
 - **Eval**: generate outputs (via Inference or Agents) and perform scoring
 - **VectorIO**: perform operations on vector stores, such as adding documents, searching, and deleting documents
 - **Telemetry**: collect telemetry data from the system
 We are working on adding a few more APIs to complete the application lifecycle. These will include:
 - **Batch Inference**: run inference on a dataset of inputs
 - **Batch Agents**: run agents on a dataset of inputs
 - **Post Training**: fine-tune a Llama model
 - **Synthetic Data Generation**: generate synthetic data for model development
--- a/docs/source/concepts/distributions.md
+++ b/docs/source/concepts/distributions.md
@ -1,9 +0,0 @@
 ## Distributions
 While there is a lot of flexibility to mix-and-match providers, often users will work with a specific set of providers (hardware support, contractual obligations, etc.). We therefore need to provide a _convenient shorthand_ for such collections. We call this shorthand a **Llama Stack Distribution** or a **Distro**. One can think of it as specific pre-packaged versions of the Llama Stack. Here are some examples:
 **Remotely Hosted Distro**: These are the simplest to consume from a user perspective. You can simply obtain the API key for these providers, point to a URL and have _all_ Llama Stack APIs working out of the box. Currently, [Fireworks](https://fireworks.ai/) and [Together](https://together.xyz/) provide such easy-to-consume Llama Stack distributions.
 **Locally Hosted Distro**: You may want to run Llama Stack on your own hardware. Typically though, you still need to use Inference via an external service. You can use providers like HuggingFace TGI, Fireworks, Together, etc. for this purpose. Or you may have access to GPUs and can run a [vLLM](https://github.com/vllm-project/vllm) or [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama) instance. If you "just" have a regular desktop machine, you can use [Ollama](https://ollama.com/) for inference. To provide convenient quick access to these options, we provide a number of such pre-configured locally-hosted Distros.
 **On-device Distro**: To run Llama Stack directly on an edge device (mobile phone or a tablet), we provide Distros for [iOS](https://llama-stack.readthedocs.io/en/latest/distributions/ondevice_distro/ios_sdk.html) and [Android](https://llama-stack.readthedocs.io/en/latest/distributions/ondevice_distro/android_sdk.html)
--- a/docs/source/concepts/evaluation_concepts.md
+++ b/docs/source/concepts/evaluation_concepts.md
@ -1,77 +0,0 @@
 ## Evaluation Concepts
 The Llama Stack Evaluation flow allows you to run evaluations on your GenAI application datasets or pre-registered benchmarks.
 We introduce a set of APIs in Llama Stack for supporting running evaluations of LLM applications.
 - `/datasetio` + `/datasets` API
 - `/scoring` + `/scoring_functions` API
 - `/eval` + `/benchmarks` API
 This guide goes over the sets of APIs and developer experience flow of using Llama Stack to run evaluations for different use cases. Check out our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
 The Evaluation APIs are associated with a set of Resources. Please visit the Resources section in our [Core Concepts](../concepts/index.md) guide for better high-level understanding.
 - **DatasetIO**: defines interface with datasets and data loaders.
  - Associated with `Dataset` resource.
 - **Scoring**: evaluate outputs of the system.
  - Associated with `ScoringFunction` resource. We provide a suite of out-of-the box scoring functions and also the ability for you to add custom evaluators. These scoring functions are the core part of defining an evaluation task to output evaluation metrics.
 - **Eval**: generate outputs (via Inference or Agents) and perform scoring.
  - Associated with `Benchmark` resource.
 ### Open-benchmark Eval
 #### List of open-benchmarks Llama Stack supports
 Llama stack pre-registers several popular open-benchmarks to easily evaluate model performance via CLI.
 The list of open-benchmarks we currently support:
 - [MMLU-COT](https://arxiv.org/abs/2009.03300) (Measuring Massive Multitask Language Understanding): Benchmark designed to comprehensively evaluate the breadth and depth of a model's academic and professional understanding
 - [GPQA-COT](https://arxiv.org/abs/2311.12022) (A Graduate-Level Google-Proof Q&A Benchmark): A challenging benchmark of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry.
 - [SimpleQA](https://openai.com/index/introducing-simpleqa/): Benchmark designed to assess the ability of models to answer short, fact-seeking questions.
 - [MMMU](https://arxiv.org/abs/2311.16502) (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI): Benchmark designed to evaluate multimodal models.
 You can follow this [contributing guide](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) to add more open-benchmarks to Llama Stack
 #### Run evaluation on open-benchmarks via CLI
 We have built-in functionality to run the supported open-benchmarks using the llama-stack-client CLI.
 #### Spin up Llama Stack server
 Spin up llama stack server with 'open-benchmark' template
 ```
 llama stack run llama_stack/templates/open-benchmark/run.yaml
 ```
 #### Run eval CLI
 There are 3 necessary inputs to run a benchmark eval
 - `list of benchmark_ids`: The list of benchmark ids to run evaluation on
 - `model-id`: The model id to evaluate on
 - `output_dir`: Path to store the evaluation results
 ```
 llama-stack-client eval run-benchmark <benchmark_id_1> <benchmark_id_2> ... \
 --model_id <model id to evaluate on> \
 --output_dir <directory to store the evaluation results>
 ```
 You can run
 ```
 llama-stack-client eval run-benchmark help
 ```
 to see the description of all the flags that eval run-benchmark has
 In the output log, you can find the file path that has your evaluation results. Open that file to see your aggregated
 evaluation results.
 #### What's Next?
 - Check out our Colab notebook on working examples with running benchmark evaluations [here](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb#scrollTo=mxLCsP4MvFqP).
 - Check out our [Building Applications - Evaluation](../building_applications/evals.md) guide for more details on how to use the Evaluation APIs to evaluate your applications.
 - Check out our [Evaluation Reference](../references/evals_reference/index.md) for more details on the APIs.
--- a/docs/source/concepts/index.md
+++ b/docs/source/concepts/index.md
@ -1,23 +0,0 @@
 # Core Concepts
 Given Llama Stack's service-oriented philosophy, a few concepts and workflows arise which may not feel completely natural in the LLM landscape, especially if you are coming with a background in other frameworks.
 ```{include} apis.md
 :start-after: ## APIs
 ```
 ```{include} api_providers.md
 :start-after: ## API Providers
 ```
 ```{include} resources.md
 :start-after: ## Resources
 ```
 ```{include} distributions.md
 :start-after: ## Distributions
 ```
 ```{include} evaluation_concepts.md
 :start-after: ## Evaluation Concepts
 ```
--- a/docs/source/concepts/resources.md
+++ b/docs/source/concepts/resources.md
@ -1,19 +0,0 @@
 ## Resources
 Some of these APIs are associated with a set of **Resources**. Here is the mapping of APIs to resources:
 - **Inference**, **Eval** and **Post Training** are associated with `Model` resources.
 - **Safety** is associated with `Shield` resources.
 - **Tool Runtime** is associated with `ToolGroup` resources.
 - **DatasetIO** is associated with `Dataset` resources.
 - **VectorIO** is associated with `VectorDB` resources.
 - **Scoring** is associated with `ScoringFunction` resources.
 - **Eval** is associated with `Model` and `Benchmark` resources.
 Furthermore, we allow these resources to be **federated** across multiple providers. For example, you may have some Llama models served by Fireworks while others are served by AWS Bedrock. Regardless, they will all work seamlessly with the same uniform Inference API provided by Llama Stack.
 ```{admonition} Registering Resources
 :class: tip
 Given this architecture, it is necessary for the Stack to know which provider to use for a given resource. This means you need to explicitly _register_ resources (including models) before you can use them with the associated APIs.
 ```
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -1,143 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 # Configuration file for the Sphinx documentation builder.
 #
 # For the full list of built-in configuration values, see the documentation:
 # https://www.sphinx-doc.org/en/master/usage/configuration.html
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 import json
 from datetime import datetime
 from pathlib import Path
 import requests
 from docutils import nodes
 # Look up the latest released llama-stack version on PyPI. The version is NOT
 # read from pyproject.toml: the previous code opened that file but never used
 # its contents, so the pointless file handle has been dropped.
 pypi_url = "https://pypi.org/pypi/llama-stack/json"
 headers = {
    "User-Agent": "pip/23.0.1 (python 3.11)",  # Mimic pip's user agent
    "Accept": "application/json",
 }
 # Bound the request so a stalled connection cannot hang the docs build, and
 # surface a clear HTTP error instead of an opaque JSON/KeyError on a bad reply.
 pypi_response = requests.get(pypi_url, headers=headers, timeout=30)
 pypi_response.raise_for_status()
 version_tag = pypi_response.json()["info"]["version"]
 print(f"{version_tag=}")
 # generate the full link including text and url here
 llama_stack_version_url = (
    f"https://github.com/meta-llama/llama-stack/releases/tag/v{version_tag}"
 )
 llama_stack_version_link = f"<a href='{llama_stack_version_url}'>release notes</a>"
 project = "llama-stack"
 copyright = f"{datetime.now().year}, Meta"
 author = "Meta"
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
 extensions = [
    "myst_parser",
    "sphinx_copybutton",
    "sphinx_design",
    "sphinx_rtd_theme",
    "sphinx_rtd_dark_mode",
    "sphinx_tabs.tabs",
    "sphinxcontrib.redoc",
    "sphinxcontrib.mermaid",
    "sphinxcontrib.video",
 ]
 html_theme = "sphinx_rtd_theme"
 html_use_relative_paths = True
 templates_path = ["_templates"]
 exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 # MyST syntax extensions enabled for the Markdown sources. This is the single
 # definition of the list; an earlier `["colon_fence"]`-only assignment was dead
 # code because this assignment overwrote it (and already includes colon_fence).
 myst_enable_extensions = [
    "amsmath",
    "attrs_inline",
    "attrs_block",
    "colon_fence",
    "deflist",
    "dollarmath",
    "fieldlist",
    "html_admonition",
    "html_image",
    # "linkify",
    "replacements",
    "smartquotes",
    "strikethrough",
    "substitution",
    "tasklist",
 ]
 # Values available to the Markdown sources via the MyST "substitution" extension.
 myst_substitutions = {
    "docker_hub": "https://hub.docker.com/repository/docker/llamastack",
    "llama_stack_version": version_tag,
    "llama_stack_version_link": llama_stack_version_link,
 }
 suppress_warnings = ["myst.header"]
 # Copy button settings
 # The prompt text is interpreted as a regular expression (see
 # copybutton_prompt_is_regexp below), so the dollar sign must be escaped:
 # an unescaped "$" is the end-of-line anchor and would never match a prompt.
 copybutton_prompt_text = r"\$ "  # for bash prompts
 copybutton_prompt_is_regexp = True
 copybutton_remove_prompts = True
 copybutton_line_continuation_character = "\\"
 # Source suffix
 source_suffix = {
    ".rst": "restructuredtext",
    ".md": "markdown",
 }
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
 # html_theme = "alabaster"
 html_theme_options = {
    "canonical_url": "https://github.com/meta-llama/llama-stack",
    "collapse_navigation": False,
    # "style_nav_header_background": "#c3c9d4",
    "display_version": True,
    "version_selector": True,
 }
 default_dark_mode = False
 html_static_path = ["../_static"]
 # html_logo = "../_static/llama-stack-logo.png"
 # html_style = "../_static/css/my_theme.css"
 def setup(app):
    app.add_css_file("css/my_theme.css")
    app.add_js_file("js/detect_theme.js")
    def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
        url = f"https://hub.docker.com/r/llamastack/{text}"
        node = nodes.reference(rawtext, text, refuri=url, **options)
        return [node], []
    def repopath_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
        parts = text.split("::")
        if len(parts) == 2:
            link_text = parts[0]
            url_path = parts[1]
        else:
            link_text = text
            url_path = text
        url = f"https://github.com/meta-llama/llama-stack/tree/main/{url_path}"
        node = nodes.reference(rawtext, link_text, refuri=url, **options)
        return [node], []
    app.add_role("dockerhub", dockerhub_role)
    app.add_role("repopath", repopath_role)
--- a/docs/source/contributing/index.md
+++ b/docs/source/contributing/index.md
@ -1,14 +0,0 @@
 ```{include} ../../../CONTRIBUTING.md
 ```
 See the [Adding a New API Provider](new_api_provider.md) which describes how to add new API providers to the Stack.
 ```{toctree}
 :maxdepth: 1
 :hidden:
 new_api_provider
 ```
--- a/docs/source/contributing/new_api_provider.md
+++ b/docs/source/contributing/new_api_provider.md
@ -1,48 +0,0 @@
 # Adding a New API Provider
 This guide will walk you through the process of adding a new API provider to Llama Stack.
 - Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
 - Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
 - Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
 - Update any distribution {repopath}`Templates::llama_stack/templates/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
 Here are some example PRs to help you get started:
   - [Groq Inference Implementation](https://github.com/meta-llama/llama-stack/pull/609)
   - [Nvidia Inference Implementation](https://github.com/meta-llama/llama-stack/pull/355)
   - [Model context protocol Tool Runtime](https://github.com/meta-llama/llama-stack/pull/665)
 ## Testing the Provider
 Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --template together`.
 ### 1. Integration Testing
 Integration tests are located in {repopath}`tests/integration`. These tests use the python client-SDK APIs (from the `llama_stack_client` package) to test functionality. Since these tests use client APIs, they can be run either by pointing to an instance of the Llama Stack server or "inline" by using `LlamaStackAsLibraryClient`.
 Consult {repopath}`tests/integration/README.md` for more details on how to run the tests.
 Note that each provider's `sample_run_config()` method (in the configuration class for that provider)
 typically references some environment variables for specifying API keys and the like. You can set these in the environment or pass these via the `--env` flag to the test command.
 ### 2. Unit Testing
 Unit tests are located in {repopath}`tests/unit`. Provider-specific unit tests are located in {repopath}`tests/unit/providers`. These tests are all run automatically as part of the CI process.
 Consult {repopath}`tests/unit/README.md` for more details on how to run the tests manually.
 ### 3. Additional end-to-end testing
 1. Start a Llama Stack server with your new provider
 2. Verify compatibility with existing client scripts in the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main) repository
 3. Document which scripts are compatible with your provider
 ## Submitting Your PR
 1. Ensure all tests pass
 2. Include a comprehensive test plan in your PR summary
 3. Document any known limitations or considerations
--- a/docs/source/contributing/testing.md
+++ b/docs/source/contributing/testing.md
@ -1,6 +0,0 @@
 # Testing Llama Stack
 Tests are of three different kinds:
 - Unit tests
 - Provider focused integration tests
 - Client SDK tests
--- a/docs/source/distributions/building_distro.md
+++ b/docs/source/distributions/building_distro.md
@ -1,420 +0,0 @@
 # Build your own Distribution
 This guide will walk you through the steps to get started with building a Llama Stack distribution from scratch with your choice of API providers.
 ### Setting your log level
 In order to specify the proper logging level users can apply the following environment variable `LLAMA_STACK_LOGGING` with the following format:
 `LLAMA_STACK_LOGGING=server=debug;core=info`
 Where each category in the following list:
 - all
 - core
 - server
 - router
 - inference
 - agents
 - safety
 - eval
 - tools
 - client
 Can be set to any of the following log levels:
 - debug
 - info
 - warning
 - error
 - critical
 The default global log level is `info`. `all` sets the log level for all components.
 A user can also set `LLAMA_STACK_LOG_FILE` which will pipe the logs to the specified path as well as to the terminal. An example would be: `export LLAMA_STACK_LOG_FILE=server.log`
 ### Llama Stack Build
 In order to build your own distribution, we recommend you clone the `llama-stack` repository.
 ```
 git clone git@github.com:meta-llama/llama-stack.git
 cd llama-stack
 pip install -e .
 ```
 Use the CLI to build your distribution.
 The main points to consider are:
 1. **Image Type** - Do you want a Conda / venv environment or a Container (e.g. Docker)?
 2. **Template** - Do you want to use a template to build your distribution, or start from scratch?
 3. **Config** - Do you want to use a pre-existing config file to build your distribution?
 ```
 llama stack build -h
 usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates] [--image-type {conda,container,venv}] [--image-name IMAGE_NAME] [--print-deps-only] [--run]
 Build a Llama stack container
 options:
  -h, --help            show this help message and exit
  --config CONFIG       Path to a config file to use for the build. You can find example configs in llama_stack/distributions/**/build.yaml. If this argument is not provided, you will
                        be prompted to enter information interactively (default: None)
  --template TEMPLATE   Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates (default: None)
  --list-templates      Show the available templates for building a Llama Stack distribution (default: False)
  --image-type {conda,container,venv}
                        Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config. (default:
                        conda)
  --image-name IMAGE_NAME
                        [for image-type=conda|container|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active Conda environment will be used if
                        found. (default: None)
  --print-deps-only     Print the dependencies for the stack only, without building the stack (default: False)
  --run                 Run the stack after building using the same image type, name, and other applicable arguments (default: False)
 ```
 After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.
 ::::{tab-set}
 :::{tab-item} Building from a template
 To build from alternative API providers, we provide distribution templates for users to get started building a distribution backed by different providers.
 The following command will allow you to see the available templates and their corresponding providers.
 ```
 llama stack build --list-templates
 ```
 ```
 +------------------------------+-----------------------------------------------------------------------------+
 | Template Name                | Description                                                                 |
 +------------------------------+-----------------------------------------------------------------------------+
 | hf-serverless                | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
 +------------------------------+-----------------------------------------------------------------------------+
 | together                     | Use Together.AI for running LLM inference                                   |
 +------------------------------+-----------------------------------------------------------------------------+
 | vllm-gpu                     | Use a built-in vLLM engine for running LLM inference                        |
 +------------------------------+-----------------------------------------------------------------------------+
 | experimental-post-training   | Experimental template for post training                                     |
 +------------------------------+-----------------------------------------------------------------------------+
 | remote-vllm                  | Use (an external) vLLM server for running LLM inference                     |
 +------------------------------+-----------------------------------------------------------------------------+
 | fireworks                    | Use Fireworks.AI for running LLM inference                                  |
 +------------------------------+-----------------------------------------------------------------------------+
 | tgi                          | Use (an external) TGI server for running LLM inference                      |
 +------------------------------+-----------------------------------------------------------------------------+
 | bedrock                      | Use AWS Bedrock for running LLM inference and safety                        |
 +------------------------------+-----------------------------------------------------------------------------+
 | meta-reference-gpu           | Use Meta Reference for running LLM inference                                |
 +------------------------------+-----------------------------------------------------------------------------+
 | nvidia                       | Use NVIDIA NIM for running LLM inference                                    |
 +------------------------------+-----------------------------------------------------------------------------+
 | cerebras                     | Use Cerebras for running LLM inference                                      |
 +------------------------------+-----------------------------------------------------------------------------+
 | ollama                       | Use (an external) Ollama server for running LLM inference                   |
 +------------------------------+-----------------------------------------------------------------------------+
 | hf-endpoint                  | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
 +------------------------------+-----------------------------------------------------------------------------+
 ```
 You may then pick a template to build your distribution with providers fitted to your liking.
 For example, to build a distribution with TGI as the inference provider, you can run:
 ```
 $ llama stack build --template tgi
 ...
 You can now edit ~/.llama/distributions/llamastack-tgi/tgi-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-tgi/tgi-run.yaml`
 ```
 :::
 :::{tab-item} Building from Scratch
 If the provided templates do not fit your use case, you can start by running `llama stack build`, which launches an interactive wizard that prompts you to enter the build configuration.
 It is best to start with a template and understand the structure of the config file and the various concepts (APIs, providers, resources, etc.) before starting from scratch.
 ```
 llama stack build
 > Enter a name for your Llama Stack (e.g. my-local-stack): my-local-stack
 > Enter the image type you want your Llama Stack to be built as (container or conda or venv): conda
 Llama Stack is composed of several APIs working together. Let's select
 the provider types (implementations) you want to use for these APIs.
 Tip: use <TAB> to see options for the providers.
 > Enter provider for API inference: inline::meta-reference
 > Enter provider for API safety: inline::llama-guard
 > Enter provider for API agents: inline::meta-reference
 > Enter provider for API memory: inline::faiss
 > Enter provider for API datasetio: inline::meta-reference
 > Enter provider for API scoring: inline::meta-reference
 > Enter provider for API eval: inline::meta-reference
 > Enter provider for API telemetry: inline::meta-reference
 > (Optional) Enter a short description for your Llama Stack:
 You can now edit ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml`
 ```
 :::
 :::{tab-item} Building from a pre-existing build config file
 - In addition to templates, you can customize the build to your liking by editing a config file and building from it with the following command.
 - The contents of the config file will look like those in `llama_stack/templates/*/build.yaml`.
 ```
 $ cat llama_stack/templates/ollama/build.yaml
 name: ollama
 distribution_spec:
  description: Like local, but use ollama for running LLM inference
  providers:
    inference: remote::ollama
    memory: inline::faiss
    safety: inline::llama-guard
    agents: inline::meta-reference
    telemetry: inline::meta-reference
 image_name: ollama
 image_type: conda
 # If some providers are external, you can specify the path to the implementation
 external_providers_dir: ~/.llama/providers.d
 ```
 ```
 llama stack build --config llama_stack/templates/ollama/build.yaml
 ```
 :::
 :::{tab-item} Building with External Providers
 Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently or use community-provided providers.
 To build a distribution with external providers, you need to:
 1. Configure the `external_providers_dir` in your build configuration file:
 ```yaml
 # Example my-external-stack.yaml with external providers
 version: '2'
 distribution_spec:
  description: Custom distro for CI tests
  providers:
    inference:
    - remote::custom_ollama
 # Add more providers as needed
 image_type: container
 image_name: ci-test
 # Path to external provider implementations
 external_providers_dir: ~/.llama/providers.d
 ```
 Here's an example for a custom Ollama provider:
 ```yaml
 adapter:
  adapter_type: custom_ollama
  pip_packages:
  - ollama
  - aiohttp
  - llama-stack-provider-ollama # This is the provider package
  config_class: llama_stack_ollama_provider.config.OllamaImplConfig
  module: llama_stack_ollama_provider
 api_dependencies: []
 optional_api_dependencies: []
 ```
 The `pip_packages` section lists the Python packages required by the provider, as well as the
 provider package itself. The package must be available on PyPI or can be provided from a local
 directory or a git repository (git must be installed on the build environment).
 2. Build your distribution using the config file:
 ```
 llama stack build --config my-external-stack.yaml
 ```
 For more information on external providers, including directory structure, provider types, and implementation requirements, see the [External Providers documentation](../providers/external.md).
 :::
 :::{tab-item} Building Container
 ```{admonition} Podman Alternative
 :class: tip
 Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podman` in your environment to use Podman.
 ```
 To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type.
 ```
 llama stack build --template ollama --image-type container
 ```
 ```
 $ llama stack build --template ollama --image-type container
 ...
 Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/Containerfile
 FROM python:3.10-slim
 ...
 You can now edit ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml and run `llama stack run ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml`
 ```
 Now set some environment variables for the inference model ID and Llama Stack Port and create a local directory to mount into the container's file system.
 ```
 export INFERENCE_MODEL="llama3.2:3b"
 export LLAMA_STACK_PORT=8321
 mkdir -p ~/.llama
 ```
 After this step is successful, you should be able to find the built container image and test it with the below Docker command:
 ```
 docker run -d \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  localhost/distribution-ollama:dev \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env OLLAMA_URL=http://host.docker.internal:11434
 ```
 Here are the docker flags and their uses:
 * `-d`: Runs the container in the detached mode as a background process
 * `-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT`: Maps the container port to the host port for accessing the server
 * `-v ~/.llama:/root/.llama`: Mounts the local .llama directory to persist configurations and data
 * `localhost/distribution-ollama:dev`: The name and tag of the container image to run
 * `--port $LLAMA_STACK_PORT`: Port number for the server to listen on
 * `--env INFERENCE_MODEL=$INFERENCE_MODEL`: Sets the model to use for inference
 * `--env OLLAMA_URL=http://host.docker.internal:11434`: Configures the URL for the Ollama service
 :::
 ::::
 ### Running your Stack server
 Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end by the `llama stack build` step.
 ```
 llama stack run -h
 usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] [--tls-certfile TLS_CERTFILE]
                       [--image-type {conda,container,venv}]
                       config
 Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
 positional arguments:
  config                Path to config file to use for the run
 options:
  -h, --help            show this help message and exit
  --port PORT           Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321)
  --image-name IMAGE_NAME
                        Name of the image to run. Defaults to the current environment (default: None)
  --env KEY=VALUE       Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: [])
  --tls-keyfile TLS_KEYFILE
                        Path to TLS key file for HTTPS (default: None)
  --tls-certfile TLS_CERTFILE
                        Path to TLS certificate file for HTTPS (default: None)
  --image-type {conda,container,venv}
                        Image Type used during the build. This can be either conda or container or venv. (default: conda)
 ```
 ```
 # Start using template name
 llama stack run tgi
 # Start using config file
 llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
 # Start using a venv
 llama stack run --image-type venv ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
 # Start using a conda environment
 llama stack run --image-type conda ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
 ```
 ```
 $ llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
 Serving API inspect
 GET /health
 GET /providers/list
 GET /routes/list
 Serving API inference
 POST /inference/chat_completion
 POST /inference/completion
 POST /inference/embeddings
 ...
 Serving API agents
 POST /agents/create
 POST /agents/session/create
 POST /agents/turn/create
 POST /agents/delete
 POST /agents/session/delete
 POST /agents/session/get
 POST /agents/step/get
 POST /agents/turn/get
 Listening on ['::', '0.0.0.0']:8321
 INFO:     Started server process [2935911]
 INFO:     Waiting for application startup.
 INFO:     Application startup complete.
 INFO:     Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
 INFO:     2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200 OK
 ```
 ### Listing Distributions
 Using the list command, you can view all existing Llama Stack distributions, including stacks built from templates, from scratch, or using custom configuration files.
 ```
 llama stack list -h
 usage: llama stack list [-h]
 list the build stacks
 options:
  -h, --help  show this help message and exit
 ```
 Example Usage
 ```
 llama stack list
 ```
 ### Removing a Distribution
 Use the remove command to delete a distribution you've previously built.
 ```
 llama stack rm -h
 usage: llama stack rm [-h] [--all] [name]
 Remove the build stack
 positional arguments:
  name        Name of the stack to delete (default: None)
 options:
  -h, --help  show this help message and exit
  --all, -a   Delete all stacks (use with caution) (default: False)
 ```
 Example
 ```
 llama stack rm llamastack-test
 ```
 To keep your environment organized and avoid clutter, consider using `llama stack list` to review old or unused distributions and `llama stack rm <name>` to delete them when they’re no longer needed.
 ### Troubleshooting
 If you encounter any issues, ask questions in our Discord, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file a new issue.
--- a/docs/source/distributions/configuration.md
+++ b/docs/source/distributions/configuration.md
@ -1,401 +0,0 @@
 # Configuring a "Stack"
 The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution:
 ```{dropdown} 👋 Click here for a Sample Configuration File
 ```yaml
 version: 2
 conda_env: ollama
 apis:
 - agents
 - inference
 - vector_io
 - safety
 - telemetry
 providers:
  inference:
  - provider_id: ollama
    provider_type: remote::ollama
    config:
      url: ${env.OLLAMA_URL:http://localhost:11434}
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
 metadata_store:
  namespace: null
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db
 models:
 - metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: ollama
  provider_model_id: null
 shields: []
 server:
  port: 8321
  auth:
    provider_type: "kubernetes"
    config:
      api_server_url: "https://kubernetes.default.svc"
      ca_cert_path: "/path/to/ca.crt"
 ```
 Let's break this down into the different sections. The first section specifies the set of APIs that the stack server will serve:
 ```yaml
 apis:
 - agents
 - inference
 - vector_io
 - safety
 - telemetry
 ```
 ## Providers
 Next up is the most critical part: the set of providers that the stack will use to serve the above APIs. Consider the `inference` API:
 ```yaml
 providers:
  inference:
  # provider_id is a string you can choose freely
  - provider_id: ollama
    # provider_type is a string that specifies the type of provider.
    # in this case, the provider for inference is ollama and it is run remotely (outside of the distribution)
    provider_type: remote::ollama
    # config is a dictionary that contains the configuration for the provider.
    # in this case, the configuration is the url of the ollama server
    config:
      url: ${env.OLLAMA_URL:http://localhost:11434}
 ```
 A few things to note:
 - A _provider instance_ is identified with an (id, type, configuration) triplet.
 - The id is a string you can choose freely.
 - You can instantiate any number of provider instances of the same type.
 - The configuration dictionary is provider-specific.
 - Notice that configuration can reference environment variables (with default values), which are expanded at runtime. When you run a stack server (via docker or via `llama stack run`), you can specify `--env OLLAMA_URL=http://my-server:11434` to override the default value.
 ## Resources
 Finally, let's look at the `models` section:
 ```yaml
 models:
 - metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: ollama
  provider_model_id: null
 ```
 A Model is an instance of a "Resource" (see [Concepts](../concepts/index)) and is associated with a specific inference provider (in this case, the provider with identifier `ollama`). This is an instance of a "pre-registered" model. While we encourage clients to always register models before using them, some Stack servers may come up with a list of "already known and available" models.
 What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. Contrast it with `model_id` which is the identifier for the same model for Llama Stack's purposes. For example, you may want to name "llama3.2:vision-11b" as "image_captioning_model" when you use it in your Stack interactions. When omitted, the server will set `provider_model_id` to be the same as `model_id`.
 ## Server Configuration
 The `server` section configures the HTTP server that serves the Llama Stack APIs:
 ```yaml
 server:
  port: 8321  # Port to listen on (default: 8321)
  tls_certfile: "/path/to/cert.pem"  # Optional: Path to TLS certificate for HTTPS
  tls_keyfile: "/path/to/key.pem"    # Optional: Path to TLS key for HTTPS
 ```
 ### Authentication Configuration
 The `auth` section configures authentication for the server. When configured, all API requests must include a valid Bearer token in the Authorization header:
 ```
 Authorization: Bearer <token>
 ```
 The server supports multiple authentication providers:
 #### OAuth 2.0/OpenID Connect Provider with Kubernetes
 The Kubernetes cluster must be configured to use a service account for authentication.
 ```bash
 kubectl create namespace llama-stack
 kubectl create serviceaccount llama-stack-auth -n llama-stack
 kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --serviceaccount=llama-stack:llama-stack-auth -n llama-stack
 kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
 ```
 Make sure the `kube-apiserver` runs with `--anonymous-auth=true` to allow unauthenticated requests
 and that the correct RoleBinding is created to allow the service account to access the necessary
 resources. If that is not the case, you can create a RoleBinding for the service account to access
 the necessary resources:
 ```yaml
 # allow-anonymous-openid.yaml
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
  name: allow-anonymous-openid
 rules:
 - nonResourceURLs: ["/openid/v1/jwks"]
  verbs: ["get"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
  name: allow-anonymous-openid
 roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: allow-anonymous-openid
 subjects:
 - kind: User
  name: system:anonymous
  apiGroup: rbac.authorization.k8s.io
 ```
 And then apply the configuration:
 ```bash
 kubectl apply -f allow-anonymous-openid.yaml
 ```
 Validates tokens against the Kubernetes API server through the OIDC provider:
 ```yaml
 server:
  auth:
    provider_type: "oauth2_token"
    config:
      jwks:
        uri: "https://kubernetes.default.svc"
        key_recheck_period: 3600
      tls_cafile: "/path/to/ca.crt"
      issuer: "https://kubernetes.default.svc"
      audience: "https://kubernetes.default.svc"
 ```
 To find your cluster's audience, run:
 ```bash
 kubectl create token default --duration=1h | cut -d. -f2 | base64 -d | jq .aud
 ```
 For the issuer, you can use the OIDC provider's URL:
 ```bash
 kubectl get --raw /.well-known/openid-configuration| jq .issuer
 ```
 For the tls_cafile, you can use the CA certificate of the OIDC provider:
 ```bash
 kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}'
 ```
 The provider extracts user information from the JWT token:
 - Username from the `sub` claim becomes a role
 - Kubernetes groups become teams
 You can easily validate a request by running:
 ```bash
 curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers
 ```
 #### Custom Provider
 Validates tokens against a custom authentication endpoint:
 ```yaml
 server:
  auth:
    provider_type: "custom"
    config:
      endpoint: "https://auth.example.com/validate"  # URL of the auth endpoint
 ```
 The custom endpoint receives a POST request with:
 ```json
 {
  "api_key": "<token>",
  "request": {
    "path": "/api/v1/endpoint",
    "headers": {
      "content-type": "application/json",
      "user-agent": "curl/7.64.1"
    },
    "params": {
      "key": ["value"]
    }
  }
 }
 ```
 And must respond with:
 ```json
 {
  "access_attributes": {
    "roles": ["admin", "user"],
    "teams": ["ml-team", "nlp-team"],
    "projects": ["llama-3", "project-x"],
    "namespaces": ["research"]
  },
  "message": "Authentication successful"
 }
 ```
 If no access attributes are returned, the token is used as a namespace.
 ### Quota Configuration
 The `quota` section allows you to enable server-side request throttling for both
 authenticated and anonymous clients. This is useful for preventing abuse, enforcing
 fairness across tenants, and controlling infrastructure costs without requiring
 client-side rate limiting or external proxies.
 Quotas are disabled by default. When enabled, each client is tracked using either:
 * Their authenticated `client_id` (derived from the Bearer token), or
 * Their IP address (fallback for anonymous requests)
 Quota state is stored in a SQLite-backed key-value store, and rate limits are applied
 within a configurable time window (currently only `day` is supported).
 #### Example
 ```yaml
 server:
  quota:
    kvstore:
      type: sqlite
      db_path: ./quotas.db
    anonymous_max_requests: 100
    authenticated_max_requests: 1000
    period: day
 ```
 #### Configuration Options
 | Field                        | Description                                                                |
 | ---------------------------- | -------------------------------------------------------------------------- |
 | `kvstore`                    | Required. Backend storage config for tracking request counts.              |
 | `kvstore.type`               | Must be `"sqlite"` for now. Other backends may be supported in the future. |
 | `kvstore.db_path`            | File path to the SQLite database.                                          |
 | `anonymous_max_requests`     | Max requests per period for unauthenticated clients.                       |
 | `authenticated_max_requests` | Max requests per period for authenticated clients.                         |
 | `period`                     | Time window for quota enforcement. Only `"day"` is supported.              |
 > Note: if `authenticated_max_requests` is set but no authentication provider is
 configured, the server will fall back to applying `anonymous_max_requests` to all
 clients.
 #### Example with Authentication Enabled
 ```yaml
 server:
  port: 8321
  auth:
    provider_type: custom
    config:
      endpoint: https://auth.example.com/validate
  quota:
    kvstore:
      type: sqlite
      db_path: ./quotas.db
    anonymous_max_requests: 100
    authenticated_max_requests: 1000
    period: day
 ```
 If a client exceeds their limit, the server responds with:
 ```http
 HTTP/1.1 429 Too Many Requests
 Content-Type: application/json
 {
  "error": {
    "message": "Quota exceeded"
  }
 }
 ```
 ## Extending to handle Safety
 Configuring Safety can be a little involved so it is instructive to go through an example.
 The Safety API works with the associated Resource called a `Shield`. Providers can support various kinds of Shields. Good examples include the [Llama Guard](https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/) system-safety models, or [Bedrock Guardrails](https://aws.amazon.com/bedrock/guardrails/).
 To configure a Bedrock Shield, you would need to add:
 - A Safety API provider instance with type `remote::bedrock`
 - A Shield resource served by this provider.
 ```yaml
 ...
 providers:
  safety:
  - provider_id: bedrock
    provider_type: remote::bedrock
    config:
      aws_access_key_id: ${env.AWS_ACCESS_KEY_ID}
      aws_secret_access_key: ${env.AWS_SECRET_ACCESS_KEY}
 ...
 shields:
 - provider_id: bedrock
  params:
    guardrailVersion: ${env.GUARDRAIL_VERSION}
  provider_shield_id: ${env.GUARDRAIL_ID}
 ...
 ```
 The situation is more involved if the Shield needs _Inference_ of an associated model. This is the case with Llama Guard. In that case, you would need to add:
 - A Safety API provider instance with type `inline::llama-guard`
 - An Inference API provider instance for serving the model.
 - A Model resource associated with this provider.
 - A Shield resource served by the Safety provider.
 The yaml configuration for this setup, assuming you were using vLLM as your inference server, would look like:
 ```yaml
 ...
 providers:
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  inference:
  # this vLLM server serves the "normal" inference model (e.g., llama3.2:3b)
  - provider_id: vllm-0
    provider_type: remote::vllm
    config:
      url: ${env.VLLM_URL:http://localhost:8000}
  # this vLLM server serves the llama-guard model (e.g., llama-guard:3b)
  - provider_id: vllm-1
    provider_type: remote::vllm
    config:
      url: ${env.SAFETY_VLLM_URL:http://localhost:8001}
 ...
 models:
 - metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: vllm-0
  provider_model_id: null
 - metadata: {}
  model_id: ${env.SAFETY_MODEL}
  provider_id: vllm-1
  provider_model_id: null
 shields:
 - provider_id: llama-guard
  shield_id: ${env.SAFETY_MODEL}   # Llama Guard shields are identified by the corresponding LlamaGuard model
  provider_shield_id: null
 ...
 ```
--- a/docs/source/distributions/importing_as_library.md
+++ b/docs/source/distributions/importing_as_library.md
@ -1,36 +0,0 @@
 # Using Llama Stack as a Library
 ## Setup Llama Stack without a Server
 If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library.
 This avoids the overhead of setting up a server.
 ```bash
 # setup
 uv pip install llama-stack
 llama stack build --template ollama --image-type venv
 ```
 ```python
 import os
 from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
 client = LlamaStackAsLibraryClient(
    "ollama",
    # provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
    provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
 )
 client.initialize()
 ```
 This will parse your config and set up any inline implementations and remote clients needed for your implementation.
 Then, you can access the APIs like `models` and `inference` on the client and call their methods directly:
 ```python
 response = client.models.list()
 ```
 If you've created a [custom distribution](https://llama-stack.readthedocs.io/en/latest/distributions/building_distro.html), you can also use the run.yaml configuration file directly:
 ```python
 client = LlamaStackAsLibraryClient(config_path)
 client.initialize()
 ```
--- a/docs/source/distributions/index.md
+++ b/docs/source/distributions/index.md
@ -1,18 +0,0 @@
 # Distributions Overview
 A distribution is a pre-packaged set of Llama Stack components that can be deployed together.
 This section provides an overview of the distributions available in Llama Stack.
 ```{toctree}
 :maxdepth: 3
 importing_as_library
 configuration
 list_of_distributions
 kubernetes_deployment
 building_distro
 on_device_distro
 remote_hosted_distro
 self_hosted_distro
 ```
--- a/docs/source/distributions/k8s/apply.sh
+++ b/docs/source/distributions/k8s/apply.sh
@ -1,32 +0,0 @@
 #!/usr/bin/env bash
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 #
 # Renders the *.yaml.template manifests in the current directory with envsubst
 # and applies them with kubectl, together with a ConfigMap generated from
 # stack_run_config.yaml. All paths are relative, so run it from this directory.

 # Export defaults for the variables substituted into the templates; values
 # already set in the caller's environment take precedence.
 export POSTGRES_USER=${POSTGRES_USER:-llamastack}
 export POSTGRES_DB=${POSTGRES_DB:-llamastack}
 export POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-llamastack}
 export INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
 export SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
 # Abort on the first failing command, unset variable, or failed pipeline stage.
 set -euo pipefail
 # Trace every command as it runs.
 set -x
 # Apply the vLLM, vLLM-safety, Postgres and Chroma manifests, in that order.
 envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
 envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
 envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
 envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
 # Build the llama-stack-config ConfigMap manifest client-side (nothing is sent
 # to the cluster by --dry-run=client), write it to stack-configmap.yaml, then
 # apply that file.
 kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
  --dry-run=client -o yaml > stack-configmap.yaml
 kubectl apply -f stack-configmap.yaml
 # Apply the stack, ingress and UI manifests after the ConfigMap has been applied.
 envsubst < ./stack-k8s.yaml.template | kubectl apply -f -
 envsubst < ./ingress-k8s.yaml.template | kubectl apply -f -
 envsubst < ./ui-k8s.yaml.template | kubectl apply -f -
--- a/docs/source/distributions/k8s/chroma-k8s.yaml.template
+++ b/docs/source/distributions/k8s/chroma-k8s.yaml.template
@ -1,66 +0,0 @@
 # PersistentVolumeClaim requesting 20Gi of ReadWriteOnce storage; it is
 # referenced by claimName in the chromadb Deployment in this same file.
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
  name: chromadb-pvc
 spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 20Gi
 ---
 # Single-replica Deployment running the chromadb/chroma image with its data
 # directory mounted from the chromadb-pvc PersistentVolumeClaim.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: chromadb
 spec:
  replicas: 1
  selector:
    matchLabels:
      app: chromadb
  template:
    metadata:
      labels:
        app: chromadb
    spec:
      containers:
      - name: chromadb
        # NOTE(review): `latest` is a floating tag, so the deployed version can
        # change between pulls; consider pinning a specific tag.
        image: chromadb/chroma:latest
        ports:
        - containerPort: 6000
        env:
        # Host and port are passed as quoted strings; the port value matches
        # containerPort above.
        - name: CHROMA_HOST
          value: "0.0.0.0"
        - name: CHROMA_PORT
          value: "6000"
        # Same path as the volumeMount below.
        - name: PERSIST_DIRECTORY
          value: "/chroma/chroma"
        # NOTE(review): confirm the image version in use still honors this
        # storage-backend setting.
        - name: CHROMA_DB_IMPL
          value: "duckdb+parquet"
        resources:
          requests:
            memory: "512Mi"
            cpu: "250m"
          limits:
            memory: "2Gi"
            cpu: "1000m"
        volumeMounts:
        - name: chromadb-storage
          mountPath: /chroma/chroma
      volumes:
      # Backs the /chroma/chroma mount with the chromadb-pvc claim.
      - name: chromadb-storage
        persistentVolumeClaim:
          claimName: chromadb-pvc
 ---
 # ClusterIP Service named chromadb: forwards TCP port 6000 to targetPort 6000
 # on pods labelled app: chromadb.
 apiVersion: v1
 kind: Service
 metadata:
  name: chromadb
 spec:
  selector:
    app: chromadb
  ports:
  - protocol: TCP
    port: 6000
    targetPort: 6000
  type: ClusterIP
--- a/docs/source/distributions/k8s/ingress-k8s.yaml.template
+++ b/docs/source/distributions/k8s/ingress-k8s.yaml.template
@ -1,17 +0,0 @@
 # LoadBalancer Service for pods labelled app.kubernetes.io/name: llama-stack.
 # Exposes two named TCP ports: llama-stack-api (8321) and llama-stack-ui (8322),
 # each forwarded to the same port number on the pod.
 apiVersion: v1
 kind: Service
 metadata:
  name: llama-stack-service
 spec:
  type: LoadBalancer
  selector:
    app.kubernetes.io/name: llama-stack
  ports:
    - name: llama-stack-api
      port: 8321
      targetPort: 8321
      protocol: TCP
    - name: llama-stack-ui
      port: 8322
      targetPort: 8322
      protocol: TCP
--- a/Show more
+++ b/Show more
		`@ -1,2 +0,0 @@`
			`# This file documents Triage members in the Llama Stack community`
			`@bbrowning @booxter @franciscojavierarceo @leseb`
		`@ -1 +0,0 @@`
			The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/distribution/server/endpoints.py` using the `generate.py` utility.
		`@ -1 +0,0 @@`
			`This is forked from https://github.com/hunyadi/pyopenapi`