Merge branch 'main' into content-extension

This commit is contained in:
Francisco Arceo 2025-08-25 14:22:15 -06:00 committed by GitHub
commit 3e11e1472c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
334 changed files with 22841 additions and 8940 deletions

2
.github/TRIAGERS.md vendored
View file

@ -1,2 +1,2 @@
# This file documents Triage members in the Llama Stack community
@bbrowning @franciscojavierarceo @leseb
@franciscojavierarceo

View file

@ -2,9 +2,13 @@ name: 'Run and Record Tests'
description: 'Run integration tests and handle recording/artifact upload'
inputs:
test-types:
description: 'JSON array of test types to run'
test-subdirs:
description: 'Comma-separated list of test subdirectories to run'
required: true
test-pattern:
description: 'Regex pattern to pass to pytest -k'
required: false
default: ''
stack-config:
description: 'Stack configuration to use'
required: true
@ -32,12 +36,14 @@ runs:
- name: Run Integration Tests
shell: bash
run: |
./scripts/integration-tests.sh \
uv run --no-sync ./scripts/integration-tests.sh \
--stack-config '${{ inputs.stack-config }}' \
--provider '${{ inputs.provider }}' \
--test-types '${{ inputs.test-types }}' \
--test-subdirs '${{ inputs.test-subdirs }}' \
--test-pattern '${{ inputs.test-pattern }}' \
--inference-mode '${{ inputs.inference-mode }}' \
${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }}
${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \
| tee pytest-${{ inputs.inference-mode }}.log
- name: Commit and push recordings
@ -57,10 +63,10 @@ runs:
git commit -m "Recordings update from CI"
fi
git fetch origin ${{ github.event.pull_request.head.ref }}
git rebase origin/${{ github.event.pull_request.head.ref }}
git fetch origin ${{ github.ref_name }}
git rebase origin/${{ github.ref_name }}
echo "Rebased successfully"
git push origin HEAD:${{ github.event.pull_request.head.ref }}
git push origin HEAD:${{ github.ref_name }}
echo "Pushed successfully"
else
echo "No recording changes"

View file

@ -16,14 +16,16 @@ runs:
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with:
python-version: ${{ inputs.python-version }}
activate-environment: true
version: 0.7.6
- name: Install dependencies
shell: bash
run: |
echo "Updating project dependencies via uv sync"
uv sync --all-groups
uv pip install ollama faiss-cpu
echo "Installing ad-hoc dependencies"
uv pip install faiss-cpu
# Install llama-stack-client-python based on the client-version input
if [ "${{ inputs.client-version }}" = "latest" ]; then
@ -37,4 +39,5 @@ runs:
exit 1
fi
uv pip install -e .
echo "Installed llama packages"
uv pip list | grep llama

View file

@ -53,7 +53,22 @@ runs:
- name: Build Llama Stack
shell: bash
run: |
uv run llama stack build --template ci-tests --image-type venv
# Install llama-stack-client-python based on the client-version input
if [ "${{ inputs.client-version }}" = "latest" ]; then
echo "Installing latest llama-stack-client-python from main branch"
export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@main
elif [ "${{ inputs.client-version }}" = "published" ]; then
echo "Installing published llama-stack-client-python from PyPI"
unset LLAMA_STACK_CLIENT_DIR
else
echo "Invalid client-version: ${{ inputs.client-version }}"
exit 1
fi
echo "Building Llama Stack"
LLAMA_STACK_DIR=. \
uv run --no-sync llama stack build --template ci-tests --image-type venv
- name: Configure git for commits
shell: bash

View file

@ -9,6 +9,7 @@ updates:
day: "saturday"
commit-message:
prefix: chore(github-deps)
- package-ecosystem: "uv"
directory: "/"
schedule:
@ -19,3 +20,14 @@ updates:
- python
commit-message:
prefix: chore(python-deps)
- package-ecosystem: npm
directory: "/llama_stack/ui"
schedule:
interval: "weekly"
day: "saturday"
labels:
- type/dependencies
- javascript
commit-message:
prefix: chore(ui-deps)

View file

@ -18,5 +18,6 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
| Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action |
| Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module |
| Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms |
| UI Tests | [ui-unit-tests.yml](ui-unit-tests.yml) | Run the UI test suite |
| Unit Tests | [unit-tests.yml](unit-tests.yml) | Run the unit test suite |
| Update ReadTheDocs | [update-readthedocs.yml](update-readthedocs.yml) | Update the Llama Stack ReadTheDocs site |

View file

@ -17,7 +17,7 @@ jobs:
pull-requests: write # for peter-evans/create-pull-request to create a PR
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
ref: main
fetch-depth: 0

View file

@ -16,21 +16,22 @@ jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # 5.0.0
- name: Run ShellCheck on install.sh
run: shellcheck scripts/install.sh
smoke-test-on-dev:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Build a single provider
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template starter --image-type container --image-name test
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync \
llama stack build --template starter --image-type container --image-name test
- name: Run installer end-to-end
run: |

View file

@ -10,6 +10,7 @@ on:
paths:
- 'distributions/**'
- 'llama_stack/**'
- '!llama_stack/ui/**'
- 'tests/integration/**'
- 'uv.lock'
- 'pyproject.toml'
@ -17,7 +18,7 @@ on:
- '.github/workflows/integration-auth-tests.yml' # This workflow
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
cancel-in-progress: true
jobs:
@ -30,7 +31,7 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner

View file

@ -16,7 +16,7 @@ on:
- '.github/workflows/integration-sql-store-tests.yml' # This workflow
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
cancel-in-progress: true
jobs:
@ -44,7 +44,7 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner

View file

@ -5,11 +5,12 @@ run-name: Run the integration test suite from tests/integration in replay mode
on:
push:
branches: [ main ]
pull_request_target:
pull_request:
branches: [ main ]
types: [opened, synchronize, reopened]
paths:
- 'llama_stack/**'
- '!llama_stack/ui/**'
- 'tests/**'
- 'uv.lock'
- 'pyproject.toml'
@ -31,35 +32,23 @@ on:
description: 'Test against a specific provider'
type: string
default: 'ollama'
test-subdirs:
description: 'Comma-separated list of test subdirectories to run'
type: string
default: ''
test-pattern:
description: 'Regex pattern to pass to pytest -k'
type: string
default: ''
concurrency:
# Skip concurrency for pushes to main - each commit should be tested independently
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.event.pull_request.number }}
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
cancel-in-progress: true
jobs:
discover-tests:
runs-on: ubuntu-latest
outputs:
test-types: ${{ steps.generate-test-types.outputs.test-types }}
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Generate test types
id: generate-test-types
run: |
# Get test directories dynamically, excluding non-test directories
# NOTE: we are excluding post_training since the tests take too long
TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d |
sed 's|tests/integration/||' |
grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" |
sort | jq -R -s -c 'split("\n")[:-1]')
echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
run-replay-mode-tests:
needs: discover-tests
runs-on: ubuntu-latest
name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}
@ -76,7 +65,7 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Setup test environment
uses: ./.github/actions/setup-test-environment
@ -90,7 +79,8 @@ jobs:
- name: Run tests
uses: ./.github/actions/run-and-record-tests
with:
test-types: ${{ needs.discover-tests.outputs.test-types }}
test-subdirs: ${{ inputs.test-subdirs }}
test-pattern: ${{ inputs.test-pattern }}
stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
provider: ${{ matrix.provider }}
inference-mode: 'replay'

View file

@ -9,14 +9,17 @@ on:
branches: [ main ]
paths:
- 'llama_stack/**'
- '!llama_stack/ui/**'
- 'tests/integration/vector_io/**'
- 'uv.lock'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/integration-vector-io-tests.yml' # This workflow
schedule:
- cron: '0 0 * * *' # (test on python 3.13) Daily at 12 AM UTC
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
cancel-in-progress: true
jobs:
@ -25,12 +28,12 @@ jobs:
strategy:
matrix:
vector-io-provider: ["inline::faiss", "inline::sqlite-vec", "inline::milvus", "remote::chromadb", "remote::pgvector", "remote::weaviate", "remote::qdrant"]
python-version: ["3.12", "3.13"]
python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
fail-fast: false # we want to run all tests regardless of failure
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
@ -141,7 +144,7 @@ jobs:
- name: Build Llama Stack
run: |
uv run llama stack build --template ci-tests --image-type venv
uv run --no-sync llama stack build --template ci-tests --image-type venv
- name: Check Storage and Memory Available Before Tests
if: ${{ always() }}
@ -164,7 +167,8 @@ jobs:
ENABLE_WEAVIATE: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'true' || '' }}
WEAVIATE_CLUSTER_URL: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'localhost:8080' || '' }}
run: |
uv run pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
uv run --no-sync \
pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
tests/integration/vector_io \
--embedding-model inline::sentence-transformers/all-MiniLM-L6-v2

View file

@ -8,7 +8,7 @@ on:
branches: [main]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
cancel-in-progress: true
jobs:
@ -20,7 +20,7 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
# For dependabot PRs, we need to checkout with a token that can push changes
token: ${{ github.actor == 'dependabot[bot]' && secrets.GITHUB_TOKEN || github.token }}
@ -36,6 +36,17 @@ jobs:
**/requirements*.txt
.pre-commit-config.yaml
- name: Set up Node.js
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
with:
node-version: '20'
cache: 'npm'
cache-dependency-path: 'llama_stack/ui/'
- name: Install npm dependencies
run: npm ci
working-directory: llama_stack/ui
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
continue-on-error: true
env:

View file

@ -26,7 +26,7 @@ on:
- 'pyproject.toml'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
cancel-in-progress: true
jobs:
@ -36,7 +36,7 @@ jobs:
distros: ${{ steps.set-matrix.outputs.distros }}
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Generate Distribution List
id: set-matrix
@ -55,7 +55,7 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
@ -79,7 +79,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
@ -92,7 +92,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
@ -106,6 +106,10 @@ jobs:
- name: Inspect the container image entrypoint
run: |
IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
if [ -z "$IMAGE_ID" ]; then
echo "No image found"
exit 1
fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
@ -117,7 +121,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
@ -140,6 +144,10 @@ jobs:
- name: Inspect UBI9 image
run: |
IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
if [ -z "$IMAGE_ID" ]; then
echo "No image found"
exit 1
fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then

View file

@ -9,6 +9,8 @@ on:
pull_request:
branches:
- main
paths-ignore:
- 'llama_stack/ui/**'
jobs:
build:
@ -19,10 +21,10 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install uv
uses: astral-sh/setup-uv@e92bafb6253dcd438e0484186d7669ea7a8ca1cc # v6.4.3
uses: astral-sh/setup-uv@4959332f0f014c5280e7eac8b70c90cb574c9f9b # v6.6.0
with:
python-version: ${{ matrix.python-version }}
activate-environment: true

View file

@ -1,93 +1,53 @@
# This workflow should be run manually when needing to re-record tests. This happens when you have
# - added a new test
# - or changed an existing test such that a new inference call is made
# You should make a PR and then run this workflow on that PR branch. The workflow will re-record the
# tests and commit the recordings to the PR branch.
name: Integration Tests (Record)
run-name: Run the integration test suite from tests/integration
on:
pull_request:
branches: [ main ]
types: [opened, synchronize, labeled]
paths:
- 'llama_stack/**'
- 'tests/**'
- 'uv.lock'
- 'pyproject.toml'
- '.github/workflows/record-integration-tests.yml' # This workflow
- '.github/actions/setup-ollama/action.yml'
- '.github/actions/setup-test-environment/action.yml'
- '.github/actions/run-and-record-tests/action.yml'
workflow_dispatch:
inputs:
test-subdirs:
description: 'Comma-separated list of test subdirectories to run'
type: string
default: ''
test-provider:
description: 'Test against a specific provider'
type: string
default: 'ollama'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
run-vision-tests:
description: 'Whether to run vision tests'
type: boolean
default: false
test-pattern:
description: 'Regex pattern to pass to pytest -k'
type: string
default: ''
jobs:
discover-tests:
if: contains(github.event.pull_request.labels.*.name, 're-record-tests') ||
contains(github.event.pull_request.labels.*.name, 're-record-vision-tests')
runs-on: ubuntu-latest
outputs:
test-types: ${{ steps.generate-test-types.outputs.test-types }}
matrix-modes: ${{ steps.generate-test-types.outputs.matrix-modes }}
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Generate test types
id: generate-test-types
run: |
# Get test directories dynamically, excluding non-test directories
TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
sort | jq -R -s -c 'split("\n")[:-1]')
echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
labels=$(gh pr view ${{ github.event.pull_request.number }} --json labels --jq '.labels[].name')
echo "labels=$labels"
modes_array=()
if [[ $labels == *"re-record-vision-tests"* ]]; then
modes_array+=("vision")
fi
if [[ $labels == *"re-record-tests"* ]]; then
modes_array+=("non-vision")
fi
# Convert to JSON array
if [ ${#modes_array[@]} -eq 0 ]; then
matrix_modes="[]"
else
matrix_modes=$(printf '%s\n' "${modes_array[@]}" | jq -R -s -c 'split("\n")[:-1]')
fi
echo "matrix_modes=$matrix_modes"
echo "matrix-modes=$matrix_modes" >> $GITHUB_OUTPUT
env:
GH_TOKEN: ${{ github.token }}
record-tests:
needs: discover-tests
runs-on: ubuntu-latest
permissions:
contents: write
strategy:
fail-fast: false
matrix:
mode: ${{ fromJSON(needs.discover-tests.outputs.matrix-modes) }}
steps:
- name: Echo workflow inputs
run: |
echo "::group::Workflow Inputs"
echo "test-subdirs: ${{ inputs.test-subdirs }}"
echo "test-provider: ${{ inputs.test-provider }}"
echo "run-vision-tests: ${{ inputs.run-vision-tests }}"
echo "test-pattern: ${{ inputs.test-pattern }}"
echo "branch: ${{ github.ref_name }}"
echo "::endgroup::"
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
ref: ${{ github.event.pull_request.head.ref }}
fetch-depth: 0
- name: Setup test environment
@ -96,14 +56,15 @@ jobs:
python-version: "3.12" # Use single Python version for recording
client-version: "latest"
provider: ${{ inputs.test-provider || 'ollama' }}
run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
run-vision-tests: ${{ inputs.run-vision-tests }}
inference-mode: 'record'
- name: Run and record tests
uses: ./.github/actions/run-and-record-tests
with:
test-types: ${{ needs.discover-tests.outputs.test-types }}
test-pattern: ${{ inputs.test-pattern }}
test-subdirs: ${{ inputs.test-subdirs }}
stack-config: 'server:ci-tests' # recording must be done with server since more tests are run
provider: ${{ inputs.test-provider || 'ollama' }}
inference-mode: 'record'
run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
run-vision-tests: ${{ inputs.run-vision-tests }}

View file

@ -22,6 +22,6 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Check PR Title's semantic conformance
uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3
uses: amannn/action-semantic-pull-request@48f256284bd46cdaab1048c3721360e808335d50 # v6.1.1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View file

@ -27,7 +27,7 @@ jobs:
# container and point 'uv pip install' to the correct path...
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner

View file

@ -9,6 +9,7 @@ on:
branches: [ main ]
paths:
- 'llama_stack/**'
- '!llama_stack/ui/**'
- 'tests/integration/**'
- 'uv.lock'
- 'pyproject.toml'
@ -26,7 +27,7 @@ jobs:
# container and point 'uv pip install' to the correct path...
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
@ -43,11 +44,11 @@ jobs:
- name: Print distro dependencies
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/build.yaml --print-deps-only
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml --print-deps-only
- name: Build distro from config file
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/build.yaml
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml
- name: Start Llama Stack server in background
if: ${{ matrix.image-type }} == 'venv'

55
.github/workflows/ui-unit-tests.yml vendored Normal file
View file

@ -0,0 +1,55 @@
name: UI Tests
run-name: Run the UI test suite
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'llama_stack/ui/**'
- '.github/workflows/ui-unit-tests.yml' # This workflow
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
cancel-in-progress: true
jobs:
ui-tests:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
node-version: [22]
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Setup Node.js
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
with:
node-version: ${{ matrix.node-version }}
cache: 'npm'
cache-dependency-path: 'llama_stack/ui/package-lock.json'
- name: Install dependencies
working-directory: llama_stack/ui
run: npm ci
- name: Run linting
working-directory: llama_stack/ui
run: npm run lint
- name: Run format check
working-directory: llama_stack/ui
run: npm run format:check
- name: Run unit tests
working-directory: llama_stack/ui
env:
CI: true
run: npm test -- --coverage --watchAll=false --passWithNoTests

View file

@ -9,6 +9,7 @@ on:
branches: [ main ]
paths:
- 'llama_stack/**'
- '!llama_stack/ui/**'
- 'tests/unit/**'
- 'uv.lock'
- 'pyproject.toml'
@ -17,7 +18,7 @@ on:
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
cancel-in-progress: true
jobs:
@ -31,7 +32,7 @@ jobs:
- "3.13"
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner

View file

@ -27,7 +27,7 @@ on:
- '.github/workflows/update-readthedocs.yml'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
cancel-in-progress: true
jobs:
@ -37,7 +37,7 @@ jobs:
TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner

View file

@ -2,6 +2,7 @@ exclude: 'build/'
default_language_version:
python: python3.12
node: "22"
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
@ -145,6 +146,32 @@ repos:
pass_filenames: false
require_serial: true
files: ^.github/workflows/.*$
- id: ui-linter
name: Format & Lint UI
entry: bash ./scripts/run-ui-linter.sh
language: system
files: ^llama_stack/ui/.*\.(ts|tsx)$
pass_filenames: false
require_serial: true
- id: check-log-usage
name: Ensure 'llama_stack.log' usage for logging
entry: bash
language: system
types: [python]
pass_filenames: true
args:
- -c
- |
matches=$(grep -EnH '^[^#]*\b(import\s+logging|from\s+logging\b)' "$@" | grep -v -e '#\s*allow-direct-logging' || true)
if [ -n "$matches" ]; then
# GitHub Actions annotation format
while IFS=: read -r file line_num rest; do
echo "::error file=$file,line=$line_num::Do not use 'import logging' or 'from logging import' in $file. Use the custom log instead: from llama_stack.log import get_logger; logger = get_logger(). If direct logging is truly needed, add: # allow-direct-logging"
done <<< "$matches"
exit 1
fi
exit 0
ci:
autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks

View file

@ -1,8 +1,5 @@
# Llama Stack
<a href="https://trendshift.io/repositories/11824" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11824" alt="meta-llama%2Fllama-stack | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
-----
[![PyPI version](https://img.shields.io/pypi/v/llama_stack.svg)](https://pypi.org/project/llama_stack/)
[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/)
[![License](https://img.shields.io/pypi/l/llama_stack.svg)](https://github.com/meta-llama/llama-stack/blob/main/LICENSE)

View file

@ -4605,6 +4605,49 @@
}
}
},
"/v1/inference/rerank": {
"post": {
"responses": {
"200": {
"description": "RerankResponse with indices sorted by relevance score (descending).",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/RerankResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Inference"
],
"description": "Rerank a list of documents based on their relevance to a query.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/RerankRequest"
}
}
},
"required": true
}
}
},
"/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume": {
"post": {
"responses": {
@ -8821,6 +8864,61 @@
"title": "OpenAIResponseOutputMessageMCPListTools",
"description": "MCP list tools output message containing available tools from an MCP server."
},
"OpenAIResponseContentPart": {
"oneOf": [
{
"$ref": "#/components/schemas/OpenAIResponseContentPartOutputText"
},
{
"$ref": "#/components/schemas/OpenAIResponseContentPartRefusal"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"output_text": "#/components/schemas/OpenAIResponseContentPartOutputText",
"refusal": "#/components/schemas/OpenAIResponseContentPartRefusal"
}
}
},
"OpenAIResponseContentPartOutputText": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "output_text",
"default": "output_text"
},
"text": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"type",
"text"
],
"title": "OpenAIResponseContentPartOutputText"
},
"OpenAIResponseContentPartRefusal": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "refusal",
"default": "refusal"
},
"refusal": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"type",
"refusal"
],
"title": "OpenAIResponseContentPartRefusal"
},
"OpenAIResponseObjectStream": {
"oneOf": [
{
@ -8877,6 +8975,12 @@
{
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted"
},
{
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded"
},
{
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone"
},
{
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
}
@ -8902,6 +9006,8 @@
"response.mcp_call.in_progress": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress",
"response.mcp_call.failed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed",
"response.mcp_call.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted",
"response.content_part.added": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded",
"response.content_part.done": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone",
"response.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
}
}
@ -8928,6 +9034,80 @@
"title": "OpenAIResponseObjectStreamResponseCompleted",
"description": "Streaming event indicating a response has been completed."
},
"OpenAIResponseObjectStreamResponseContentPartAdded": {
"type": "object",
"properties": {
"response_id": {
"type": "string",
"description": "Unique identifier of the response containing this content"
},
"item_id": {
"type": "string",
"description": "Unique identifier of the output item containing this content part"
},
"part": {
"$ref": "#/components/schemas/OpenAIResponseContentPart",
"description": "The content part that was added"
},
"sequence_number": {
"type": "integer",
"description": "Sequential number for ordering streaming events"
},
"type": {
"type": "string",
"const": "response.content_part.added",
"default": "response.content_part.added",
"description": "Event type identifier, always \"response.content_part.added\""
}
},
"additionalProperties": false,
"required": [
"response_id",
"item_id",
"part",
"sequence_number",
"type"
],
"title": "OpenAIResponseObjectStreamResponseContentPartAdded",
"description": "Streaming event for when a new content part is added to a response item."
},
"OpenAIResponseObjectStreamResponseContentPartDone": {
"type": "object",
"properties": {
"response_id": {
"type": "string",
"description": "Unique identifier of the response containing this content"
},
"item_id": {
"type": "string",
"description": "Unique identifier of the output item containing this content part"
},
"part": {
"$ref": "#/components/schemas/OpenAIResponseContentPart",
"description": "The completed content part"
},
"sequence_number": {
"type": "integer",
"description": "Sequential number for ordering streaming events"
},
"type": {
"type": "string",
"const": "response.content_part.done",
"default": "response.content_part.done",
"description": "Event type identifier, always \"response.content_part.done\""
}
},
"additionalProperties": false,
"required": [
"response_id",
"item_id",
"part",
"sequence_number",
"type"
],
"title": "OpenAIResponseObjectStreamResponseContentPartDone",
"description": "Streaming event for when a content part is completed."
},
"OpenAIResponseObjectStreamResponseCreated": {
"type": "object",
"properties": {
@ -14630,7 +14810,8 @@
"OpenAIFilePurpose": {
"type": "string",
"enum": [
"assistants"
"assistants",
"batch"
],
"title": "OpenAIFilePurpose",
"description": "Valid purpose values for OpenAI Files API."
@ -14707,7 +14888,8 @@
"purpose": {
"type": "string",
"enum": [
"assistants"
"assistants",
"batch"
],
"description": "The intended purpose of the file"
}
@ -15926,12 +16108,16 @@
"value": {
"type": "number",
"description": "The numeric value of the metric at this timestamp"
},
"unit": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"timestamp",
"value"
"value",
"unit"
],
"title": "MetricDataPoint",
"description": "A single data point in a metric time series."
@ -16489,6 +16675,95 @@
],
"title": "RegisterVectorDbRequest"
},
"RerankRequest": {
"type": "object",
"properties": {
"model": {
"type": "string",
"description": "The identifier of the reranking model to use."
},
"query": {
"oneOf": [
{
"type": "string"
},
{
"$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam"
},
{
"$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
}
],
"description": "The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length."
},
"items": {
"type": "array",
"items": {
"oneOf": [
{
"type": "string"
},
{
"$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam"
},
{
"$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
}
]
},
"description": "List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length."
},
"max_num_results": {
"type": "integer",
"description": "(Optional) Maximum number of results to return. Default: returns all."
}
},
"additionalProperties": false,
"required": [
"model",
"query",
"items"
],
"title": "RerankRequest"
},
"RerankData": {
"type": "object",
"properties": {
"index": {
"type": "integer",
"description": "The original index of the document in the input list"
},
"relevance_score": {
"type": "number",
"description": "The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance."
}
},
"additionalProperties": false,
"required": [
"index",
"relevance_score"
],
"title": "RerankData",
"description": "A single rerank result from a reranking response."
},
"RerankResponse": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/RerankData"
},
"description": "List of rerank result objects, sorted by relevance score (descending)"
}
},
"additionalProperties": false,
"required": [
"data"
],
"title": "RerankResponse",
"description": "Response from a reranking request."
},
"ResumeAgentTurnRequest": {
"type": "object",
"properties": {

View file

@ -3264,6 +3264,37 @@ paths:
schema:
$ref: '#/components/schemas/QueryTracesRequest'
required: true
/v1/inference/rerank:
post:
responses:
'200':
description: >-
RerankResponse with indices sorted by relevance score (descending).
content:
application/json:
schema:
$ref: '#/components/schemas/RerankResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
description: >-
Rerank a list of documents based on their relevance to a query.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/RerankRequest'
required: true
/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume:
post:
responses:
@ -6441,6 +6472,43 @@ components:
title: OpenAIResponseOutputMessageMCPListTools
description: >-
MCP list tools output message containing available tools from an MCP server.
OpenAIResponseContentPart:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseContentPartOutputText'
- $ref: '#/components/schemas/OpenAIResponseContentPartRefusal'
discriminator:
propertyName: type
mapping:
output_text: '#/components/schemas/OpenAIResponseContentPartOutputText'
refusal: '#/components/schemas/OpenAIResponseContentPartRefusal'
OpenAIResponseContentPartOutputText:
type: object
properties:
type:
type: string
const: output_text
default: output_text
text:
type: string
additionalProperties: false
required:
- type
- text
title: OpenAIResponseContentPartOutputText
OpenAIResponseContentPartRefusal:
type: object
properties:
type:
type: string
const: refusal
default: refusal
refusal:
type: string
additionalProperties: false
required:
- type
- refusal
title: OpenAIResponseContentPartRefusal
OpenAIResponseObjectStream:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
@ -6461,6 +6529,8 @@ components:
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded'
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone'
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
discriminator:
propertyName: type
@ -6483,6 +6553,8 @@ components:
response.mcp_call.in_progress: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
response.mcp_call.failed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
response.mcp_call.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
response.content_part.added: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded'
response.content_part.done: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone'
response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
"OpenAIResponseObjectStreamResponseCompleted":
type: object
@ -6504,6 +6576,76 @@ components:
OpenAIResponseObjectStreamResponseCompleted
description: >-
Streaming event indicating a response has been completed.
"OpenAIResponseObjectStreamResponseContentPartAdded":
type: object
properties:
response_id:
type: string
description: >-
Unique identifier of the response containing this content
item_id:
type: string
description: >-
Unique identifier of the output item containing this content part
part:
$ref: '#/components/schemas/OpenAIResponseContentPart'
description: The content part that was added
sequence_number:
type: integer
description: >-
Sequential number for ordering streaming events
type:
type: string
const: response.content_part.added
default: response.content_part.added
description: >-
Event type identifier, always "response.content_part.added"
additionalProperties: false
required:
- response_id
- item_id
- part
- sequence_number
- type
title: >-
OpenAIResponseObjectStreamResponseContentPartAdded
description: >-
Streaming event for when a new content part is added to a response item.
"OpenAIResponseObjectStreamResponseContentPartDone":
type: object
properties:
response_id:
type: string
description: >-
Unique identifier of the response containing this content
item_id:
type: string
description: >-
Unique identifier of the output item containing this content part
part:
$ref: '#/components/schemas/OpenAIResponseContentPart'
description: The completed content part
sequence_number:
type: integer
description: >-
Sequential number for ordering streaming events
type:
type: string
const: response.content_part.done
default: response.content_part.done
description: >-
Event type identifier, always "response.content_part.done"
additionalProperties: false
required:
- response_id
- item_id
- part
- sequence_number
- type
title: >-
OpenAIResponseObjectStreamResponseContentPartDone
description: >-
Streaming event for when a content part is completed.
"OpenAIResponseObjectStreamResponseCreated":
type: object
properties:
@ -10840,6 +10982,7 @@ components:
type: string
enum:
- assistants
- batch
title: OpenAIFilePurpose
description: >-
Valid purpose values for OpenAI Files API.
@ -10908,6 +11051,7 @@ components:
type: string
enum:
- assistants
- batch
description: The intended purpose of the file
additionalProperties: false
required:
@ -11838,10 +11982,13 @@ components:
type: number
description: >-
The numeric value of the metric at this timestamp
unit:
type: string
additionalProperties: false
required:
- timestamp
- value
- unit
title: MetricDataPoint
description: >-
A single data point in a metric time series.
@ -12252,6 +12399,76 @@ components:
- vector_db_id
- embedding_model
title: RegisterVectorDbRequest
RerankRequest:
type: object
properties:
model:
type: string
description: >-
The identifier of the reranking model to use.
query:
oneOf:
- type: string
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
description: >-
The search query to rank items against. Can be a string, text content
part, or image content part. The input must not exceed the model's max
input token length.
items:
type: array
items:
oneOf:
- type: string
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
description: >-
List of items to rerank. Each item can be a string, text content part,
or image content part. Each input must not exceed the model's max input
token length.
max_num_results:
type: integer
description: >-
(Optional) Maximum number of results to return. Default: returns all.
additionalProperties: false
required:
- model
- query
- items
title: RerankRequest
RerankData:
type: object
properties:
index:
type: integer
description: >-
The original index of the document in the input list
relevance_score:
type: number
description: >-
The relevance score from the model output. Values are inverted when applicable
so that higher scores indicate greater relevance.
additionalProperties: false
required:
- index
- relevance_score
title: RerankData
description: >-
A single rerank result from a reranking response.
RerankResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/RerankData'
description: >-
List of rerank result objects, sorted by relevance score (descending)
additionalProperties: false
required:
- data
title: RerankResponse
description: Response from a reranking request.
ResumeAgentTurnRequest:
type: object
properties:

View file

@ -18,3 +18,4 @@ We are working on adding a few more APIs to complete the application lifecycle.
- **Batch Inference**: run inference on a dataset of inputs
- **Batch Agents**: run agents on a dataset of inputs
- **Synthetic Data Generation**: generate synthetic data for model development
- **Batches**: OpenAI-compatible batch management for inference

View file

@ -4,11 +4,11 @@
## Adding a New Provider
See the [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.
See:
- [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.
- [Vector Database Page](new_vector_database.md) which describes how to add a new vector databases with Llama Stack.
- [External Provider Page](../providers/external/index.md) which describes how to add external providers to the Stack.
See the [Vector Database Page](new_vector_database.md) which describes how to add a new vector databases with Llama Stack.
See the [External Provider Page](../providers/external/index.md) which describes how to add external providers to the Stack.
```{toctree}
:maxdepth: 1
:hidden:
@ -19,11 +19,21 @@ new_vector_database
## Testing
See the [Test Page](testing.md) which describes how to test your changes.
```{include} ../../../tests/README.md
```
## Advanced Topics
For developers who need deeper understanding of the testing system internals:
```{toctree}
:maxdepth: 1
:hidden:
:caption: Testing
testing
```
testing/record-replay
```
### Benchmarking
```{include} ../../../docs/source/distributions/k8s-benchmark/README.md
```

View file

@ -1,8 +0,0 @@
```{include} ../../../tests/README.md
```
```{include} ../../../tests/unit/README.md
```
```{include} ../../../tests/integration/README.md
```

View file

@ -0,0 +1,234 @@
# Record-Replay System
Understanding how Llama Stack captures and replays API interactions for testing.
## Overview
The record-replay system solves a fundamental challenge in AI testing: how do you test against expensive, non-deterministic APIs without breaking the bank or dealing with flaky tests?
The solution: intercept API calls, store real responses, and replay them later. This gives you real API behavior without the cost or variability.
## How It Works
### Request Hashing
Every API request gets converted to a deterministic hash for lookup:
```python
def normalize_request(method: str, url: str, headers: dict, body: dict) -> str:
normalized = {
"method": method.upper(),
"endpoint": urlparse(url).path, # Just the path, not full URL
"body": body, # Request parameters
}
return hashlib.sha256(json.dumps(normalized, sort_keys=True).encode()).hexdigest()
```
**Key insight:** The hashing is intentionally precise. Different whitespace, float precision, or parameter order produces different hashes. This prevents subtle bugs from false cache hits.
```python
# These produce DIFFERENT hashes:
{"content": "Hello world"}
{"content": "Hello world\n"}
{"temperature": 0.7}
{"temperature": 0.7000001}
```
### Client Interception
The system patches OpenAI and Ollama client methods to intercept calls before they leave your application. This happens transparently - your test code doesn't change.
### Storage Architecture
Recordings use a two-tier storage system optimized for both speed and debuggability:
```
recordings/
├── index.sqlite # Fast lookup by request hash
└── responses/
├── abc123def456.json # Individual response files
└── def789ghi012.json
```
**SQLite index** enables O(log n) hash lookups and metadata queries without loading response bodies.
**JSON files** store complete request/response pairs in human-readable format for debugging.
## Recording Modes
### LIVE Mode
Direct API calls with no recording or replay:
```python
with inference_recording(mode=InferenceMode.LIVE):
response = await client.chat.completions.create(...)
```
Use for initial development and debugging against real APIs.
### RECORD Mode
Captures API interactions while passing through real responses:
```python
with inference_recording(mode=InferenceMode.RECORD, storage_dir="./recordings"):
response = await client.chat.completions.create(...)
# Real API call made, response captured AND returned
```
The recording process:
1. Request intercepted and hashed
2. Real API call executed
3. Response captured and serialized
4. Recording stored to disk
5. Original response returned to caller
### REPLAY Mode
Returns stored responses instead of making API calls:
```python
with inference_recording(mode=InferenceMode.REPLAY, storage_dir="./recordings"):
response = await client.chat.completions.create(...)
# No API call made, cached response returned instantly
```
The replay process:
1. Request intercepted and hashed
2. Hash looked up in SQLite index
3. Response loaded from JSON file
4. Response deserialized and returned
5. Error if no recording found
## Streaming Support
Streaming APIs present a unique challenge: how do you capture an async generator?
### The Problem
```python
# How do you record this?
async for chunk in client.chat.completions.create(stream=True):
process(chunk)
```
### The Solution
The system captures all chunks immediately before yielding any:
```python
async def handle_streaming_record(response):
# Capture complete stream first
chunks = []
async for chunk in response:
chunks.append(chunk)
# Store complete recording
storage.store_recording(
request_hash, request_data, {"body": chunks, "is_streaming": True}
)
# Return generator that replays captured chunks
async def replay_stream():
for chunk in chunks:
yield chunk
return replay_stream()
```
This ensures:
- **Complete capture** - The entire stream is saved atomically
- **Interface preservation** - The returned object behaves like the original API
- **Deterministic replay** - Same chunks in the same order every time
## Serialization
API responses contain complex Pydantic objects that need careful serialization:
```python
def _serialize_response(response):
if hasattr(response, "model_dump"):
# Preserve type information for proper deserialization
return {
"__type__": f"{response.__class__.__module__}.{response.__class__.__qualname__}",
"__data__": response.model_dump(mode="json"),
}
return response
```
This preserves type safety - when replayed, you get the same Pydantic objects with all their validation and methods.
## Environment Integration
### Environment Variables
Control recording behavior globally:
```bash
export LLAMA_STACK_TEST_INFERENCE_MODE=replay
export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings
pytest tests/integration/
```
### Pytest Integration
The system integrates automatically based on environment variables, requiring no changes to test code.
## Debugging Recordings
### Inspecting Storage
```bash
# See what's recorded
sqlite3 recordings/index.sqlite "SELECT endpoint, model, timestamp FROM recordings LIMIT 10;"
# View specific response
cat recordings/responses/abc123def456.json | jq '.response.body'
# Find recordings by endpoint
sqlite3 recordings/index.sqlite "SELECT * FROM recordings WHERE endpoint='/v1/chat/completions';"
```
### Common Issues
**Hash mismatches:** Request parameters changed slightly between record and replay
```bash
# Compare request details
cat recordings/responses/abc123.json | jq '.request'
```
**Serialization errors:** Response types changed between versions
```bash
# Re-record with updated types
rm recordings/responses/failing_hash.json
LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_failing.py
```
**Missing recordings:** New test or changed parameters
```bash
# Record the missing interaction
LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_new.py
```
## Design Decisions
### Why Not Mocks?
Traditional mocking breaks down with AI APIs because:
- Response structures are complex and evolve frequently
- Streaming behavior is hard to mock correctly
- Edge cases in real APIs get missed
- Mocks become brittle maintenance burdens
### Why Precise Hashing?
Loose hashing (normalizing whitespace, rounding floats) seems convenient but hides bugs. If a test changes slightly, you want to know about it rather than accidentally getting the wrong cached response.
### Why JSON + SQLite?
- **JSON** - Human readable, diff-friendly, easy to inspect and modify
- **SQLite** - Fast indexed lookups without loading response bodies
- **Hybrid** - Best of both worlds for different use cases
This system provides reliable, fast testing against real AI APIs while maintaining the ability to debug issues when they arise.

View file

@ -225,8 +225,32 @@ server:
port: 8321 # Port to listen on (default: 8321)
tls_certfile: "/path/to/cert.pem" # Optional: Path to TLS certificate for HTTPS
tls_keyfile: "/path/to/key.pem" # Optional: Path to TLS key for HTTPS
cors: true # Optional: Enable CORS (dev mode) or full config object
```
### CORS Configuration
CORS (Cross-Origin Resource Sharing) can be configured in two ways:
**Local development** (allows localhost origins only):
```yaml
server:
cors: true
```
**Explicit configuration** (custom origins and settings):
```yaml
server:
cors:
allow_origins: ["https://myapp.com", "https://app.example.com"]
allow_methods: ["GET", "POST", "PUT", "DELETE"]
allow_headers: ["Content-Type", "Authorization"]
allow_credentials: true
max_age: 3600
```
When `cors: true`, the server enables secure localhost-only access for local development. For production, specify exact origins to maintain security.
### Authentication Configuration
> **Breaking Change (v0.2.14)**: The authentication configuration structure has changed. The previous format with `provider_type` and `config` fields has been replaced with a unified `provider_config` field that includes the `type` field. Update your configuration files accordingly.
@ -618,6 +642,54 @@ Content-Type: application/json
}
```
### CORS Configuration
Configure CORS to allow web browsers to make requests from different domains. Disabled by default.
#### Quick Setup
For development, use the simple boolean flag:
```yaml
server:
cors: true # Auto-enables localhost with any port
```
This automatically allows `http://localhost:*` and `https://localhost:*` with secure defaults.
#### Custom Configuration
For specific origins and full control:
```yaml
server:
cors:
allow_origins: ["https://myapp.com", "https://staging.myapp.com"]
allow_credentials: true
allow_methods: ["GET", "POST", "PUT", "DELETE"]
allow_headers: ["Content-Type", "Authorization"]
allow_origin_regex: "https://.*\\.example\\.com" # Optional regex pattern
expose_headers: ["X-Total-Count"]
max_age: 86400
```
#### Configuration Options
| Field | Description | Default |
| -------------------- | ---------------------------------------------- | ------- |
| `allow_origins` | List of allowed origins. Use `["*"]` for any. | `["*"]` |
| `allow_origin_regex` | Regex pattern for allowed origins (optional). | `None` |
| `allow_methods` | Allowed HTTP methods. | `["*"]` |
| `allow_headers` | Allowed headers. | `["*"]` |
| `allow_credentials` | Allow credentials (cookies, auth headers). | `false` |
| `expose_headers` | Headers exposed to browser. | `[]` |
| `max_age` | Preflight cache time (seconds). | `600` |
**Security Notes**:
- `allow_credentials: true` requires explicit origins (no wildcards)
- `cors: true` enables localhost access only (secure for development)
- For public APIs, always specify exact allowed origins
## Extending to handle Safety
Configuring Safety can be a little involved so it is instructive to go through an example.

View file

@ -17,7 +17,6 @@ client = LlamaStackAsLibraryClient(
# provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
)
client.initialize()
```
This will parse your config and set up any inline implementations and remote clients needed for your implementation.
@ -32,5 +31,4 @@ If you've created a [custom distribution](https://llama-stack.readthedocs.io/en/
```python
client = LlamaStackAsLibraryClient(config_path)
client.initialize()
```

View file

@ -0,0 +1,156 @@
# Llama Stack Benchmark Suite on Kubernetes
## Motivation
Performance benchmarking is critical for understanding the overhead and characteristics of the Llama Stack abstraction layer compared to direct inference engines like vLLM.
### Why This Benchmark Suite Exists
**Performance Validation**: The Llama Stack provides a unified API layer across multiple inference providers, but this abstraction introduces potential overhead. This benchmark suite quantifies the performance impact by comparing:
- Llama Stack inference (with vLLM backend)
- Direct vLLM inference calls
- Both under identical Kubernetes deployment conditions
**Production Readiness Assessment**: Real-world deployments require understanding performance characteristics under load. This suite simulates concurrent user scenarios with configurable parameters (duration, concurrency, request patterns) to validate production readiness.
**Regression Detection (TODO)**: As the Llama Stack evolves, this benchmark provides automated regression detection for performance changes. CI/CD pipelines can leverage these benchmarks to catch performance degradations before production deployments.
**Resource Planning**: By measuring throughput, latency percentiles, and resource utilization patterns, teams can make informed decisions about:
- Kubernetes resource allocation (CPU, memory, GPU)
- Auto-scaling configurations
- Cost optimization strategies
### Key Metrics Captured
The benchmark suite measures critical performance indicators:
- **Throughput**: Requests per second under sustained load
- **Latency Distribution**: P50, P95, P99 response times
- **Time to First Token (TTFT)**: Critical for streaming applications
- **Error Rates**: Request failures and timeout analysis
This data enables data-driven architectural decisions and performance optimization efforts.
## Setup
**1. Deploy base k8s infrastructure:**
```bash
cd ../k8s
./apply.sh
```
**2. Deploy benchmark components:**
```bash
cd ../k8s-benchmark
./apply.sh
```
**3. Verify deployment:**
```bash
kubectl get pods
# Should see: llama-stack-benchmark-server, vllm-server, etc.
```
## Quick Start
### Basic Benchmarks
**Benchmark Llama Stack (default):**
```bash
cd docs/source/distributions/k8s-benchmark/
./run-benchmark.sh
```
**Benchmark vLLM direct:**
```bash
./run-benchmark.sh --target vllm
```
### Custom Configuration
**Extended benchmark with high concurrency:**
```bash
./run-benchmark.sh --target vllm --duration 120 --concurrent 20
```
**Short test run:**
```bash
./run-benchmark.sh --target stack --duration 30 --concurrent 5
```
## Command Reference
### run-benchmark.sh Options
```bash
./run-benchmark.sh [options]
Options:
-t, --target <stack|vllm> Target to benchmark (default: stack)
-d, --duration <seconds> Duration in seconds (default: 60)
-c, --concurrent <users> Number of concurrent users (default: 10)
-h, --help Show help message
Examples:
./run-benchmark.sh --target vllm # Benchmark vLLM direct
./run-benchmark.sh --target stack # Benchmark Llama Stack
./run-benchmark.sh -t vllm -d 120 -c 20 # vLLM with 120s, 20 users
```
## Local Testing
### Running Benchmark Locally
For local development without Kubernetes:
**1. Start OpenAI mock server:**
```bash
uv run python openai-mock-server.py --port 8080
```
**2. Run benchmark against mock server:**
```bash
uv run python benchmark.py \
--base-url http://localhost:8080/v1 \
--model mock-inference \
--duration 30 \
--concurrent 5
```
**3. Test against local vLLM server:**
```bash
# If you have vLLM running locally on port 8000
uv run python benchmark.py \
--base-url http://localhost:8000/v1 \
--model meta-llama/Llama-3.2-3B-Instruct \
--duration 30 \
--concurrent 5
```
**4. Profile the running server:**
```bash
./profile_running_server.sh
```
### OpenAI Mock Server
The `openai-mock-server.py` provides:
- **OpenAI-compatible API** for testing without real models
- **Configurable streaming delay** via `STREAM_DELAY_SECONDS` env var
- **Consistent responses** for reproducible benchmarks
- **Lightweight testing** without GPU requirements
**Mock server usage:**
```bash
uv run python openai-mock-server.py --port 8080
```
The mock server is also deployed in k8s as `openai-mock-service:8080` and can be used by changing the Llama Stack configuration to use the `mock-vllm-inference` provider.
## Files in this Directory
- `benchmark.py` - Core benchmark script with async streaming support
- `run-benchmark.sh` - Main script with target selection and configuration
- `openai-mock-server.py` - Mock OpenAI API server for local testing
- `README.md` - This documentation file

View file

@ -8,7 +8,6 @@
# Deploys the benchmark-specific components on top of the base k8s deployment (../k8s/apply.sh).
export MOCK_INFERENCE_PORT=8080
export STREAM_DELAY_SECONDS=0.005
export POSTGRES_USER=llamastack
@ -20,14 +19,7 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export MOCK_INFERENCE_MODEL=mock-inference
# Use llama-stack-benchmark-service as the benchmark server
export LOCUST_HOST=http://llama-stack-benchmark-service:8323
export LOCUST_BASE_PATH=/v1/openai/v1
# Use vllm-service as the benchmark server
# export LOCUST_HOST=http://vllm-server:8000
# export LOCUST_BASE_PATH=/v1
export MOCK_INFERENCE_URL=openai-mock-service:8080
export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL
@ -35,13 +27,6 @@ set -euo pipefail
set -x
# Deploy benchmark-specific components
# Deploy OpenAI mock server
kubectl create configmap openai-mock --from-file=openai-mock-server.py \
--dry-run=client -o yaml | kubectl apply --validate=false -f -
envsubst < openai-mock-deployment.yaml | kubectl apply --validate=false -f -
# Create configmap with our custom stack config
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
--dry-run=client -o yaml > stack-configmap.yaml
@ -49,9 +34,3 @@ kubectl apply --validate=false -f stack-configmap.yaml
# Deploy our custom llama stack server (overriding the base one)
envsubst < stack-k8s.yaml.template | kubectl apply --validate=false -f -
# Deploy Locust load testing
kubectl create configmap locust-script --from-file=locustfile.py \
--dry-run=client -o yaml | kubectl apply --validate=false -f -
envsubst < locust-k8s.yaml | kubectl apply --validate=false -f -

View file

@ -0,0 +1,267 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Simple benchmark script for Llama Stack with OpenAI API compatibility.
"""
import argparse
import asyncio
import os
import random
import statistics
import time
from typing import Tuple
import aiohttp
class BenchmarkStats:
def __init__(self):
self.response_times = []
self.ttft_times = []
self.chunks_received = []
self.errors = []
self.success_count = 0
self.total_requests = 0
self.concurrent_users = 0
self.start_time = None
self.end_time = None
self._lock = asyncio.Lock()
async def add_result(self, response_time: float, chunks: int, ttft: float = None, error: str = None):
async with self._lock:
self.total_requests += 1
if error:
self.errors.append(error)
else:
self.success_count += 1
self.response_times.append(response_time)
self.chunks_received.append(chunks)
if ttft is not None:
self.ttft_times.append(ttft)
def print_summary(self):
if not self.response_times:
print("No successful requests to report")
if self.errors:
print(f"Total errors: {len(self.errors)}")
print("First 5 errors:")
for error in self.errors[:5]:
print(f" {error}")
return
total_time = self.end_time - self.start_time
success_rate = (self.success_count / self.total_requests) * 100
print(f"\n{'='*60}")
print(f"BENCHMARK RESULTS")
print(f"{'='*60}")
print(f"Total time: {total_time:.2f}s")
print(f"Concurrent users: {self.concurrent_users}")
print(f"Total requests: {self.total_requests}")
print(f"Successful requests: {self.success_count}")
print(f"Failed requests: {len(self.errors)}")
print(f"Success rate: {success_rate:.1f}%")
print(f"Requests per second: {self.success_count / total_time:.2f}")
print(f"\nResponse Time Statistics:")
print(f" Mean: {statistics.mean(self.response_times):.3f}s")
print(f" Median: {statistics.median(self.response_times):.3f}s")
print(f" Min: {min(self.response_times):.3f}s")
print(f" Max: {max(self.response_times):.3f}s")
if len(self.response_times) > 1:
print(f" Std Dev: {statistics.stdev(self.response_times):.3f}s")
percentiles = [50, 90, 95, 99]
sorted_times = sorted(self.response_times)
print(f"\nPercentiles:")
for p in percentiles:
idx = int(len(sorted_times) * p / 100) - 1
idx = max(0, min(idx, len(sorted_times) - 1))
print(f" P{p}: {sorted_times[idx]:.3f}s")
if self.ttft_times:
print(f"\nTime to First Token (TTFT) Statistics:")
print(f" Mean: {statistics.mean(self.ttft_times):.3f}s")
print(f" Median: {statistics.median(self.ttft_times):.3f}s")
print(f" Min: {min(self.ttft_times):.3f}s")
print(f" Max: {max(self.ttft_times):.3f}s")
if len(self.ttft_times) > 1:
print(f" Std Dev: {statistics.stdev(self.ttft_times):.3f}s")
sorted_ttft = sorted(self.ttft_times)
print(f"\nTTFT Percentiles:")
for p in percentiles:
idx = int(len(sorted_ttft) * p / 100) - 1
idx = max(0, min(idx, len(sorted_ttft) - 1))
print(f" P{p}: {sorted_ttft[idx]:.3f}s")
if self.chunks_received:
print(f"\nStreaming Statistics:")
print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
print(f" Total chunks received: {sum(self.chunks_received)}")
if self.errors:
print(f"\nErrors (showing first 5):")
for error in self.errors[:5]:
print(f" {error}")
class LlamaStackBenchmark:
def __init__(self, base_url: str, model_id: str):
self.base_url = base_url.rstrip('/')
self.model_id = model_id
self.headers = {"Content-Type": "application/json"}
self.test_messages = [
[{"role": "user", "content": "Hi"}],
[{"role": "user", "content": "What is the capital of France?"}],
[{"role": "user", "content": "Explain quantum physics in simple terms."}],
[{"role": "user", "content": "Write a short story about a robot learning to paint."}],
[
{"role": "user", "content": "What is machine learning?"},
{"role": "assistant", "content": "Machine learning is a subset of AI..."},
{"role": "user", "content": "Can you give me a practical example?"}
]
]
async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
"""Make a single async streaming chat completion request."""
messages = random.choice(self.test_messages)
payload = {
"model": self.model_id,
"messages": messages,
"stream": True,
"max_tokens": 100
}
start_time = time.time()
chunks_received = 0
ttft = None
error = None
session = aiohttp.ClientSession()
try:
async with session.post(
f"{self.base_url}/chat/completions",
headers=self.headers,
json=payload,
timeout=aiohttp.ClientTimeout(total=30)
) as response:
if response.status == 200:
async for line in response.content:
if line:
line_str = line.decode('utf-8').strip()
if line_str.startswith('data: '):
chunks_received += 1
if ttft is None:
ttft = time.time() - start_time
if line_str == 'data: [DONE]':
break
if chunks_received == 0:
error = "No streaming chunks received"
else:
text = await response.text()
error = f"HTTP {response.status}: {text[:100]}"
except Exception as e:
error = f"Request error: {str(e)}"
finally:
await session.close()
response_time = time.time() - start_time
return response_time, chunks_received, ttft, error
async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
"""Run benchmark using async requests for specified duration."""
stats = BenchmarkStats()
stats.concurrent_users = concurrent_users
stats.start_time = time.time()
print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
print(f"Target URL: {self.base_url}/chat/completions")
print(f"Model: {self.model_id}")
connector = aiohttp.TCPConnector(limit=concurrent_users)
async with aiohttp.ClientSession(connector=connector) as session:
async def worker(worker_id: int):
"""Worker that sends requests sequentially until canceled."""
request_count = 0
while True:
try:
response_time, chunks, ttft, error = await self.make_async_streaming_request()
await stats.add_result(response_time, chunks, ttft, error)
request_count += 1
except asyncio.CancelledError:
break
except Exception as e:
await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")
# Progress reporting task
async def progress_reporter():
last_report_time = time.time()
while True:
try:
await asyncio.sleep(1) # Report every second
if time.time() >= last_report_time + 10: # Report every 10 seconds
elapsed = time.time() - stats.start_time
print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
last_report_time = time.time()
except asyncio.CancelledError:
break
# Spawn concurrent workers
tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
progress_task = asyncio.create_task(progress_reporter())
tasks.append(progress_task)
# Wait for duration then cancel all tasks
await asyncio.sleep(duration)
for task in tasks:
task.cancel()
# Wait for all tasks to complete
await asyncio.gather(*tasks, return_exceptions=True)
stats.end_time = time.time()
return stats
def main():
parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
help="Model ID to use for requests")
parser.add_argument("--duration", type=int, default=60,
help="Duration in seconds to run benchmark (default: 60)")
parser.add_argument("--concurrent", type=int, default=10,
help="Number of concurrent users (default: 10)")
args = parser.parse_args()
benchmark = LlamaStackBenchmark(args.base_url, args.model)
try:
stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent))
stats.print_summary()
except KeyboardInterrupt:
print("\nBenchmark interrupted by user")
except Exception as e:
print(f"Benchmark failed: {e}")
if __name__ == "__main__":
main()

View file

@ -1,131 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: locust-master
labels:
app: locust
role: master
spec:
replicas: 1
selector:
matchLabels:
app: locust
role: master
template:
metadata:
labels:
app: locust
role: master
spec:
containers:
- name: locust-master
image: locustio/locust:2.31.8
ports:
- containerPort: 8089 # Web UI
- containerPort: 5557 # Master communication
env:
- name: LOCUST_HOST
value: "${LOCUST_HOST}"
- name: LOCUST_LOCUSTFILE
value: "/locust/locustfile.py"
- name: LOCUST_WEB_HOST
value: "0.0.0.0"
- name: LOCUST_MASTER
value: "true"
- name: LOCUST_BASE_PATH
value: "${LOCUST_BASE_PATH}"
- name: INFERENCE_MODEL
value: "${BENCHMARK_INFERENCE_MODEL}"
volumeMounts:
- name: locust-script
mountPath: /locust
command: ["locust"]
args:
- "--master"
- "--web-host=0.0.0.0"
- "--web-port=8089"
- "--host=${LOCUST_HOST}"
- "--locustfile=/locust/locustfile.py"
volumes:
- name: locust-script
configMap:
name: locust-script
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: locust-worker
labels:
app: locust
role: worker
spec:
replicas: 2 # Start with 2 workers, can be scaled up
selector:
matchLabels:
app: locust
role: worker
template:
metadata:
labels:
app: locust
role: worker
spec:
containers:
- name: locust-worker
image: locustio/locust:2.31.8
env:
- name: LOCUST_HOST
value: "${LOCUST_HOST}"
- name: LOCUST_LOCUSTFILE
value: "/locust/locustfile.py"
- name: LOCUST_MASTER_HOST
value: "locust-master-service"
- name: LOCUST_MASTER_PORT
value: "5557"
- name: INFERENCE_MODEL
value: "${BENCHMARK_INFERENCE_MODEL}"
- name: LOCUST_BASE_PATH
value: "${LOCUST_BASE_PATH}"
volumeMounts:
- name: locust-script
mountPath: /locust
command: ["locust"]
args:
- "--worker"
- "--master-host=locust-master-service"
- "--master-port=5557"
- "--locustfile=/locust/locustfile.py"
volumes:
- name: locust-script
configMap:
name: locust-script
---
apiVersion: v1
kind: Service
metadata:
name: locust-master-service
spec:
selector:
app: locust
role: master
ports:
- name: web-ui
port: 8089
targetPort: 8089
- name: master-comm
port: 5557
targetPort: 5557
type: ClusterIP
---
apiVersion: v1
kind: Service
metadata:
name: locust-web-ui
spec:
selector:
app: locust
role: master
ports:
- port: 8089
targetPort: 8089
type: ClusterIP # Keep internal, use port-forward to access

View file

@ -1,78 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Locust load testing script for Llama Stack with Prism mock OpenAI provider.
"""
import random
from locust import HttpUser, task, between
import os
base_path = os.getenv("LOCUST_BASE_PATH", "/v1/openai/v1")
MODEL_ID = os.getenv("INFERENCE_MODEL")
class LlamaStackUser(HttpUser):
wait_time = between(0.0, 0.0001)
def on_start(self):
"""Setup authentication and test data."""
# No auth required for benchmark server
self.headers = {
"Content-Type": "application/json"
}
# Test messages of varying lengths
self.test_messages = [
[{"role": "user", "content": "Hi"}],
[{"role": "user", "content": "What is the capital of France?"}],
[{"role": "user", "content": "Explain quantum physics in simple terms."}],
[{"role": "user", "content": "Write a short story about a robot learning to paint."}],
[
{"role": "user", "content": "What is machine learning?"},
{"role": "assistant", "content": "Machine learning is a subset of AI..."},
{"role": "user", "content": "Can you give me a practical example?"}
]
]
@task(weight=100)
def chat_completion_streaming(self):
"""Test streaming chat completion (20% of requests)."""
messages = random.choice(self.test_messages)
payload = {
"model": MODEL_ID,
"messages": messages,
"stream": True,
"max_tokens": 100
}
with self.client.post(
f"{base_path}/chat/completions",
headers=self.headers,
json=payload,
stream=True,
catch_response=True
) as response:
if response.status_code == 200:
chunks_received = 0
try:
for line in response.iter_lines():
if line:
line_str = line.decode('utf-8')
if line_str.startswith('data: '):
chunks_received += 1
if line_str.strip() == 'data: [DONE]':
break
if chunks_received > 0:
response.success()
else:
response.failure("No streaming chunks received")
except Exception as e:
response.failure(f"Streaming error: {e}")
else:
response.failure(f"HTTP {response.status_code}: {response.text}")

View file

@ -1,52 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: openai-mock
labels:
app: openai-mock
spec:
replicas: 1
selector:
matchLabels:
app: openai-mock
template:
metadata:
labels:
app: openai-mock
spec:
containers:
- name: openai-mock
image: python:3.12-slim
ports:
- containerPort: ${MOCK_INFERENCE_PORT}
env:
- name: PORT
value: "${MOCK_INFERENCE_PORT}"
- name: MOCK_MODELS
value: "${MOCK_INFERENCE_MODEL}"
- name: STREAM_DELAY_SECONDS
value: "${STREAM_DELAY_SECONDS}"
command: ["sh", "-c"]
args:
- |
pip install flask &&
python /app/openai-mock-server.py --port ${MOCK_INFERENCE_PORT}
volumeMounts:
- name: openai-mock-script
mountPath: /app
volumes:
- name: openai-mock-script
configMap:
name: openai-mock
---
apiVersion: v1
kind: Service
metadata:
name: openai-mock-service
spec:
selector:
app: openai-mock
ports:
- port: 8080
targetPort: 8080
type: ClusterIP

View file

@ -23,7 +23,7 @@ app = Flask(__name__)
# Models from environment variables
def get_models():
models_str = os.getenv("MOCK_MODELS", "mock-inference")
models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
model_ids = [m.strip() for m in models_str.split(",") if m.strip()]
return {
@ -49,13 +49,13 @@ def generate_random_text(length=50):
]
return " ".join(random.choices(words, k=length))
@app.route('/models', methods=['GET'])
@app.route('/v1/models', methods=['GET'])
def list_models():
models = get_models()
print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
return jsonify(models)
@app.route('/chat/completions', methods=['POST'])
@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
"""Return OpenAI-formatted chat completion responses."""
data = request.get_json()

View file

@ -0,0 +1,52 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Script to profile an already running Llama Stack server
# Usage: ./profile_running_server.sh [duration_seconds] [output_file]
DURATION=${1:-60} # Default 60 seconds
OUTPUT_FILE=${2:-"llama_stack_profile"} # Default output file
echo "Looking for running Llama Stack server..."
# Find the server PID
SERVER_PID=$(ps aux | grep "llama_stack.core.server.server" | grep -v grep | awk '{print $2}' | head -1)
if [ -z "$SERVER_PID" ]; then
echo "Error: No running Llama Stack server found"
echo "Please start your server first with:"
echo "LLAMA_STACK_LOGGING=\"all=ERROR\" MOCK_INFERENCE_URL=http://localhost:8080 SAFETY_MODEL=llama-guard3:1b uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml"
exit 1
fi
echo "Found Llama Stack server with PID: $SERVER_PID"
# Start py-spy profiling
echo "Starting py-spy profiling for ${DURATION} seconds..."
echo "Output will be saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "You can now run your load test..."
echo ""
# Get the full path to py-spy
PYSPY_PATH=$(which py-spy)
# Check if running as root, if not, use sudo
if [ "$EUID" -ne 0 ]; then
echo "py-spy requires root permissions on macOS. Running with sudo..."
sudo "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
else
"$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
fi
echo ""
echo "Profiling completed! Results saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "To view the flame graph:"
echo "open ${OUTPUT_FILE}.svg"

View file

@ -0,0 +1,148 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -euo pipefail
# Default values
TARGET="stack"
DURATION=60
CONCURRENT=10
# Parse command line arguments
usage() {
echo "Usage: $0 [options]"
echo "Options:"
echo " -t, --target <stack|vllm> Target to benchmark (default: stack)"
echo " -d, --duration <seconds> Duration in seconds (default: 60)"
echo " -c, --concurrent <users> Number of concurrent users (default: 10)"
echo " -h, --help Show this help message"
echo ""
echo "Examples:"
echo " $0 --target vllm # Benchmark vLLM direct"
echo " $0 --target stack # Benchmark Llama Stack (default)"
echo " $0 -t vllm -d 120 -c 20 # vLLM with 120s duration, 20 users"
}
while [[ $# -gt 0 ]]; do
case $1 in
-t|--target)
TARGET="$2"
shift 2
;;
-d|--duration)
DURATION="$2"
shift 2
;;
-c|--concurrent)
CONCURRENT="$2"
shift 2
;;
-h|--help)
usage
exit 0
;;
*)
echo "Unknown option: $1"
usage
exit 1
;;
esac
done
# Validate target
if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
echo "Error: Target must be 'stack' or 'vllm'"
usage
exit 1
fi
# Set configuration based on target
if [[ "$TARGET" == "vllm" ]]; then
BASE_URL="http://vllm-server:8000/v1"
JOB_NAME="vllm-benchmark-job"
echo "Benchmarking vLLM direct..."
else
BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1"
JOB_NAME="stack-benchmark-job"
echo "Benchmarking Llama Stack..."
fi
echo "Configuration:"
echo " Target: $TARGET"
echo " Base URL: $BASE_URL"
echo " Duration: ${DURATION}s"
echo " Concurrent users: $CONCURRENT"
echo ""
# Create temporary job yaml
TEMP_YAML="/tmp/benchmark-job-temp-$(date +%s).yaml"
cat > "$TEMP_YAML" << EOF
apiVersion: batch/v1
kind: Job
metadata:
name: $JOB_NAME
namespace: default
spec:
template:
spec:
containers:
- name: benchmark
image: python:3.11-slim
command: ["/bin/bash"]
args:
- "-c"
- |
pip install aiohttp &&
python3 /benchmark/benchmark.py \\
--base-url $BASE_URL \\
--model \${INFERENCE_MODEL} \\
--duration $DURATION \\
--concurrent $CONCURRENT
env:
- name: INFERENCE_MODEL
value: "meta-llama/Llama-3.2-3B-Instruct"
volumeMounts:
- name: benchmark-script
mountPath: /benchmark
resources:
requests:
memory: "256Mi"
cpu: "250m"
limits:
memory: "512Mi"
cpu: "500m"
volumes:
- name: benchmark-script
configMap:
name: benchmark-script
restartPolicy: Never
backoffLimit: 3
EOF
echo "Creating benchmark ConfigMap..."
kubectl create configmap benchmark-script \
--from-file=benchmark.py=benchmark.py \
--dry-run=client -o yaml | kubectl apply -f -
echo "Cleaning up any existing benchmark job..."
kubectl delete job $JOB_NAME 2>/dev/null || true
echo "Deploying benchmark Job..."
kubectl apply -f "$TEMP_YAML"
echo "Waiting for job to start..."
kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=60s
echo "Following benchmark logs..."
kubectl logs -f job/$JOB_NAME
echo "Job completed. Checking final status..."
kubectl get job $JOB_NAME
# Clean up temporary file
rm -f "$TEMP_YAML"

View file

@ -26,13 +26,6 @@ data:
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: mock-vllm-inference
provider_type: remote::vllm
config:
url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
max_tokens: 4096
api_token: fake
tls_verify: false
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
@ -121,9 +114,6 @@ data:
- model_id: ${env.SAFETY_MODEL}
provider_id: vllm-safety
model_type: llm
- model_id: ${env.MOCK_INFERENCE_MODEL}
provider_id: mock-vllm-inference
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []

View file

@ -44,8 +44,6 @@ spec:
value: "${SAFETY_MODEL}"
- name: TAVILY_SEARCH_API_KEY
value: "${TAVILY_SEARCH_API_KEY}"
- name: MOCK_INFERENCE_PORT
value: "${MOCK_INFERENCE_PORT}"
- name: VLLM_URL
value: http://vllm-server.default.svc.cluster.local:8000/v1
- name: VLLM_MAX_TOKENS
@ -54,8 +52,6 @@ spec:
value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
- name: VLLM_TLS_VERIFY
value: "false"
- name: MOCK_INFERENCE_MODEL
value: "${MOCK_INFERENCE_MODEL}"
command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
ports:
- containerPort: 8323

View file

@ -3,7 +3,6 @@ image_name: kubernetes-benchmark-demo
apis:
- agents
- inference
- safety
- telemetry
- tool_runtime
- vector_io
@ -16,20 +15,6 @@ providers:
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: vllm-safety
provider_type: remote::vllm
config:
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: mock-vllm-inference
provider_type: remote::vllm
config:
url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
max_tokens: 4096
api_token: fake
tls_verify: false
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
@ -45,11 +30,6 @@ providers:
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
@ -115,14 +95,6 @@ models:
- model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
- model_id: ${env.SAFETY_MODEL}
provider_id: vllm-safety
model_type: llm
- model_id: ${env.MOCK_INFERENCE_MODEL}
provider_id: mock-vllm-inference
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []

View file

@ -2,6 +2,15 @@
## Overview
Agents API for creating and interacting with agentic systems.
Main functionalities provided by this API:
- Create agents with specific instructions and ability to use tools.
- Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
- Agents can be provided with various shields (see the Safety API for more details).
- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.
This section contains documentation for all available providers for the **agents** API.
## Providers

View file

@ -0,0 +1,24 @@
# Batches
## Overview
The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.
The API is designed to allow use of openai client libraries for seamless integration.
This API provides the following extensions:
- idempotent batch creation
Note: This API is currently under active development and may undergo changes.
This section contains documentation for all available providers for the **batches** API.
## Providers
```{toctree}
:maxdepth: 1
inline_reference
```

View file

@ -0,0 +1,23 @@
# inline::reference
## Description
Reference implementation of batches API with KVStore persistence.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Configuration for the key-value store backend. |
| `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
| `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. |
## Sample Configuration
```yaml
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/batches.db
```

View file

@ -2,6 +2,8 @@
## Overview
Llama Stack Evaluation API for running evaluations on model and agent candidates.
This section contains documentation for all available providers for the **eval** API.
## Providers

View file

@ -10,4 +10,5 @@ This section contains documentation for all available providers for the **files*
:maxdepth: 1
inline_localfs
remote_s3
```

View file

@ -0,0 +1,33 @@
# remote::s3
## Description
AWS S3-based file storage provider for scalable cloud file management with metadata persistence.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `bucket_name` | `<class 'str'>` | No | | S3 bucket name to store files |
| `region` | `<class 'str'>` | No | us-east-1 | AWS region where the bucket is located |
| `aws_access_key_id` | `str \| None` | No | | AWS access key ID (optional if using IAM roles) |
| `aws_secret_access_key` | `str \| None` | No | | AWS secret access key (optional if using IAM roles) |
| `endpoint_url` | `str \| None` | No | | Custom S3 endpoint URL (for MinIO, LocalStack, etc.) |
| `auto_create_bucket` | `<class 'bool'>` | No | False | Automatically create the S3 bucket if it doesn't exist |
| `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata |
## Sample Configuration
```yaml
bucket_name: ${env.S3_BUCKET_NAME}
region: ${env.AWS_REGION:=us-east-1}
aws_access_key_id: ${env.AWS_ACCESS_KEY_ID:=}
aws_secret_access_key: ${env.AWS_SECRET_ACCESS_KEY:=}
endpoint_url: ${env.S3_ENDPOINT_URL:=}
auto_create_bucket: ${env.S3_AUTO_CREATE_BUCKET:=false}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/s3_files_metadata.db
```

View file

@ -2,6 +2,12 @@
## Overview
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search.
This section contains documentation for all available providers for the **inference** API.
## Providers

View file

@ -9,7 +9,9 @@ This section contains documentation for all available providers for the **post_t
```{toctree}
:maxdepth: 1
inline_huggingface
inline_torchtune
inline_huggingface-cpu
inline_huggingface-gpu
inline_torchtune-cpu
inline_torchtune-gpu
remote_nvidia
```

View file

@ -0,0 +1,41 @@
# inline::huggingface-cpu
## Description
HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `device` | `<class 'str'>` | No | cuda | |
| `distributed_backend` | `Literal['fsdp', 'deepspeed'` | No | | |
| `checkpoint_format` | `Literal['full_state', 'huggingface'` | No | huggingface | |
| `chat_template` | `<class 'str'>` | No | <|user|>
{input}
<|assistant|>
{output} | |
| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} | |
| `max_seq_length` | `<class 'int'>` | No | 2048 | |
| `gradient_checkpointing` | `<class 'bool'>` | No | False | |
| `save_total_limit` | `<class 'int'>` | No | 3 | |
| `logging_steps` | `<class 'int'>` | No | 10 | |
| `warmup_ratio` | `<class 'float'>` | No | 0.1 | |
| `weight_decay` | `<class 'float'>` | No | 0.01 | |
| `dataloader_num_workers` | `<class 'int'>` | No | 4 | |
| `dataloader_pin_memory` | `<class 'bool'>` | No | True | |
| `dpo_beta` | `<class 'float'>` | No | 0.1 | |
| `use_reference_model` | `<class 'bool'>` | No | True | |
| `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair'` | No | sigmoid | |
| `dpo_output_dir` | `<class 'str'>` | No | | |
## Sample Configuration
```yaml
checkpoint_format: huggingface
distributed_backend: null
device: cpu
dpo_output_dir: ~/.llama/dummy/dpo_output
```

View file

@ -0,0 +1,41 @@
# inline::huggingface-gpu
## Description
HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `device` | `<class 'str'>` | No | cuda | |
| `distributed_backend` | `Literal['fsdp', 'deepspeed'` | No | | |
| `checkpoint_format` | `Literal['full_state', 'huggingface'` | No | huggingface | |
| `chat_template` | `<class 'str'>` | No | <|user|>
{input}
<|assistant|>
{output} | |
| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} | |
| `max_seq_length` | `<class 'int'>` | No | 2048 | |
| `gradient_checkpointing` | `<class 'bool'>` | No | False | |
| `save_total_limit` | `<class 'int'>` | No | 3 | |
| `logging_steps` | `<class 'int'>` | No | 10 | |
| `warmup_ratio` | `<class 'float'>` | No | 0.1 | |
| `weight_decay` | `<class 'float'>` | No | 0.01 | |
| `dataloader_num_workers` | `<class 'int'>` | No | 4 | |
| `dataloader_pin_memory` | `<class 'bool'>` | No | True | |
| `dpo_beta` | `<class 'float'>` | No | 0.1 | |
| `use_reference_model` | `<class 'bool'>` | No | True | |
| `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair'` | No | sigmoid | |
| `dpo_output_dir` | `<class 'str'>` | No | | |
## Sample Configuration
```yaml
checkpoint_format: huggingface
distributed_backend: null
device: cpu
dpo_output_dir: ~/.llama/dummy/dpo_output
```

View file

@ -0,0 +1,20 @@
# inline::torchtune-cpu
## Description
TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `torch_seed` | `int \| None` | No | | |
| `checkpoint_format` | `Literal['meta', 'huggingface'` | No | meta | |
## Sample Configuration
```yaml
checkpoint_format: meta
```

View file

@ -0,0 +1,20 @@
# inline::torchtune-gpu
## Description
TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `torch_seed` | `int \| None` | No | | |
| `checkpoint_format` | `Literal['meta', 'huggingface'` | No | meta | |
## Sample Configuration
```yaml
checkpoint_format: meta
```

View file

@ -623,6 +623,62 @@ class OpenAIResponseObjectStreamResponseMcpCallCompleted(BaseModel):
type: Literal["response.mcp_call.completed"] = "response.mcp_call.completed"
@json_schema_type
class OpenAIResponseContentPartOutputText(BaseModel):
type: Literal["output_text"] = "output_text"
text: str
# TODO: add annotations, logprobs, etc.
@json_schema_type
class OpenAIResponseContentPartRefusal(BaseModel):
type: Literal["refusal"] = "refusal"
refusal: str
OpenAIResponseContentPart = Annotated[
OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal,
Field(discriminator="type"),
]
register_schema(OpenAIResponseContentPart, name="OpenAIResponseContentPart")
@json_schema_type
class OpenAIResponseObjectStreamResponseContentPartAdded(BaseModel):
"""Streaming event for when a new content part is added to a response item.
:param response_id: Unique identifier of the response containing this content
:param item_id: Unique identifier of the output item containing this content part
:param part: The content part that was added
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.content_part.added"
"""
response_id: str
item_id: str
part: OpenAIResponseContentPart
sequence_number: int
type: Literal["response.content_part.added"] = "response.content_part.added"
@json_schema_type
class OpenAIResponseObjectStreamResponseContentPartDone(BaseModel):
"""Streaming event for when a content part is completed.
:param response_id: Unique identifier of the response containing this content
:param item_id: Unique identifier of the output item containing this content part
:param part: The completed content part
:param sequence_number: Sequential number for ordering streaming events
:param type: Event type identifier, always "response.content_part.done"
"""
response_id: str
item_id: str
part: OpenAIResponseContentPart
sequence_number: int
type: Literal["response.content_part.done"] = "response.content_part.done"
OpenAIResponseObjectStream = Annotated[
OpenAIResponseObjectStreamResponseCreated
| OpenAIResponseObjectStreamResponseOutputItemAdded
@ -642,6 +698,8 @@ OpenAIResponseObjectStream = Annotated[
| OpenAIResponseObjectStreamResponseMcpCallInProgress
| OpenAIResponseObjectStreamResponseMcpCallFailed
| OpenAIResponseObjectStreamResponseMcpCallCompleted
| OpenAIResponseObjectStreamResponseContentPartAdded
| OpenAIResponseObjectStreamResponseContentPartDone
| OpenAIResponseObjectStreamResponseCompleted,
Field(discriminator="type"),
]

View file

@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .batches import Batches, BatchObject, ListBatchesResponse
__all__ = ["Batches", "BatchObject", "ListBatchesResponse"]

View file

@ -0,0 +1,95 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Literal, Protocol, runtime_checkable
from pydantic import BaseModel, Field
from llama_stack.schema_utils import json_schema_type, webmethod
try:
from openai.types import Batch as BatchObject
except ImportError as e:
raise ImportError("OpenAI package is required for batches API. Please install it with: pip install openai") from e
@json_schema_type
class ListBatchesResponse(BaseModel):
"""Response containing a list of batch objects."""
object: Literal["list"] = "list"
data: list[BatchObject] = Field(..., description="List of batch objects")
first_id: str | None = Field(default=None, description="ID of the first batch in the list")
last_id: str | None = Field(default=None, description="ID of the last batch in the list")
has_more: bool = Field(default=False, description="Whether there are more batches available")
@runtime_checkable
class Batches(Protocol):
"""
The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.
The API is designed to allow use of openai client libraries for seamless integration.
This API provides the following extensions:
- idempotent batch creation
Note: This API is currently under active development and may undergo changes.
"""
@webmethod(route="/openai/v1/batches", method="POST")
async def create_batch(
self,
input_file_id: str,
endpoint: str,
completion_window: Literal["24h"],
metadata: dict[str, str] | None = None,
idempotency_key: str | None = None,
) -> BatchObject:
"""Create a new batch for processing multiple API requests.
:param input_file_id: The ID of an uploaded file containing requests for the batch.
:param endpoint: The endpoint to be used for all requests in the batch.
:param completion_window: The time window within which the batch should be processed.
:param metadata: Optional metadata for the batch.
:param idempotency_key: Optional idempotency key. When provided, enables idempotent behavior.
:returns: The created batch object.
"""
...
@webmethod(route="/openai/v1/batches/{batch_id}", method="GET")
async def retrieve_batch(self, batch_id: str) -> BatchObject:
"""Retrieve information about a specific batch.
:param batch_id: The ID of the batch to retrieve.
:returns: The batch object.
"""
...
@webmethod(route="/openai/v1/batches/{batch_id}/cancel", method="POST")
async def cancel_batch(self, batch_id: str) -> BatchObject:
"""Cancel a batch that is in progress.
:param batch_id: The ID of the batch to cancel.
:returns: The updated batch object.
"""
...
@webmethod(route="/openai/v1/batches", method="GET")
async def list_batches(
self,
after: str | None = None,
limit: int = 20,
) -> ListBatchesResponse:
"""List all batches for the current user.
:param after: A cursor for pagination; returns batches after this batch ID.
:param limit: Number of batches to return (default 20, max 100).
:returns: A list of batch objects.
"""
...

View file

@ -72,3 +72,10 @@ class ModelTypeError(TypeError):
f"Model '{model_name}' is of type '{model_type}' rather than the expected type '{expected_model_type}'"
)
super().__init__(message)
class ConflictError(ValueError):
"""raised when an operation cannot be performed due to a conflict with the current state"""
def __init__(self, message: str) -> None:
super().__init__(message)

View file

@ -86,6 +86,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
:cvar inference: Text generation, chat completions, and embeddings
:cvar safety: Content moderation and safety shields
:cvar agents: Agent orchestration and execution
:cvar batches: Batch processing for asynchronous API requests
:cvar vector_io: Vector database operations and queries
:cvar datasetio: Dataset input/output operations
:cvar scoring: Model output evaluation and scoring
@ -108,6 +109,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
inference = "inference"
safety = "safety"
agents = "agents"
batches = "batches"
vector_io = "vector_io"
datasetio = "datasetio"
scoring = "scoring"

View file

@ -22,6 +22,7 @@ class OpenAIFilePurpose(StrEnum):
"""
ASSISTANTS = "assistants"
BATCH = "batch"
# TODO: Add other purposes as needed

View file

@ -473,6 +473,28 @@ class EmbeddingsResponse(BaseModel):
embeddings: list[list[float]]
@json_schema_type
class RerankData(BaseModel):
"""A single rerank result from a reranking response.
:param index: The original index of the document in the input list
:param relevance_score: The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance.
"""
index: int
relevance_score: float
@json_schema_type
class RerankResponse(BaseModel):
"""Response from a reranking request.
:param data: List of rerank result objects, sorted by relevance score (descending)
"""
data: list[RerankData]
@json_schema_type
class OpenAIChatCompletionContentPartTextParam(BaseModel):
"""Text content part for OpenAI-compatible chat completion messages.
@ -1046,6 +1068,7 @@ class InferenceProvider(Protocol):
:returns: A BatchCompletionResponse with the full completions.
"""
raise NotImplementedError("Batch completion is not implemented")
return # this is so mypy's safe-super rule will consider the method concrete
@webmethod(route="/inference/chat-completion", method="POST")
async def chat_completion(
@ -1110,6 +1133,7 @@ class InferenceProvider(Protocol):
:returns: A BatchChatCompletionResponse with the full completions.
"""
raise NotImplementedError("Batch chat completion is not implemented")
return # this is so mypy's safe-super rule will consider the method concrete
@webmethod(route="/inference/embeddings", method="POST")
async def embeddings(
@ -1131,6 +1155,25 @@ class InferenceProvider(Protocol):
"""
...
@webmethod(route="/inference/rerank", method="POST", experimental=True)
async def rerank(
self,
model: str,
query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
max_num_results: int | None = None,
) -> RerankResponse:
"""Rerank a list of documents based on their relevance to a query.
:param model: The identifier of the reranking model to use.
:param query: The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length.
:param items: List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length.
:param max_num_results: (Optional) Maximum number of results to return. Default: returns all.
:returns: RerankResponse with indices sorted by relevance score (descending).
"""
raise NotImplementedError("Reranking is not implemented")
return # this is so mypy's safe-super rule will consider the method concrete
@webmethod(route="/openai/v1/completions", method="POST")
async def openai_completion(
self,

View file

@ -386,6 +386,7 @@ class MetricDataPoint(BaseModel):
timestamp: int
value: float
unit: str
@json_schema_type
@ -518,7 +519,7 @@ class Telemetry(Protocol):
metric_name: str,
start_time: int,
end_time: int | None = None,
granularity: str | None = "1d",
granularity: str | None = None,
query_type: MetricQueryType = MetricQueryType.RANGE,
label_matchers: list[MetricLabelMatcher] | None = None,
) -> QueryMetricsResponse:

View file

@ -15,7 +15,7 @@ from llama_stack.log import get_logger
REPO_ROOT = Path(__file__).parent.parent.parent.parent
logger = get_logger(name=__name__, category="server")
logger = get_logger(name=__name__, category="cli")
class StackRun(Subcommand):

View file

@ -5,7 +5,6 @@
# the root directory of this source tree.
import importlib.resources
import logging
import sys
from pydantic import BaseModel
@ -17,9 +16,10 @@ from llama_stack.core.external import load_external_apis
from llama_stack.core.utils.exec import run_command
from llama_stack.core.utils.image_types import LlamaStackImageType
from llama_stack.distributions.template import DistributionTemplate
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import Api
log = logging.getLogger(__name__)
log = get_logger(name=__name__, category="core")
# These are the dependencies needed by the distribution server.
# `llama-stack` is automatically installed by the installation script.

View file

@ -1,207 +0,0 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
PYPI_VERSION=${PYPI_VERSION:-}
# This timeout (in seconds) is necessary when installing PyTorch via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}
set -euo pipefail
# Define color codes
RED='\033[0;31m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
source "$SCRIPT_DIR/common.sh"
# Usage function
usage() {
echo "Usage: $0 --env-name <conda_env_name> --build-file-path <build_file_path> --normal-deps <pip_dependencies> [--external-provider-deps <external_provider_deps>] [--optional-deps <special_pip_deps>]"
echo "Example: $0 --env-name my-conda-env --build-file-path ./my-stack-build.yaml --normal-deps 'numpy pandas scipy' --external-provider-deps 'foo' --optional-deps 'bar'"
exit 1
}
# Parse arguments
env_name=""
build_file_path=""
normal_deps=""
external_provider_deps=""
optional_deps=""
while [[ $# -gt 0 ]]; do
key="$1"
case "$key" in
--env-name)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --env-name requires a string value" >&2
usage
fi
env_name="$2"
shift 2
;;
--build-file-path)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --build-file-path requires a string value" >&2
usage
fi
build_file_path="$2"
shift 2
;;
--normal-deps)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --normal-deps requires a string value" >&2
usage
fi
normal_deps="$2"
shift 2
;;
--external-provider-deps)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --external-provider-deps requires a string value" >&2
usage
fi
external_provider_deps="$2"
shift 2
;;
--optional-deps)
if [[ -z "$2" || "$2" == --* ]]; then
echo "Error: --optional-deps requires a string value" >&2
usage
fi
optional_deps="$2"
shift 2
;;
*)
echo "Unknown option: $1" >&2
usage
;;
esac
done
# Check required arguments
if [[ -z "$env_name" || -z "$build_file_path" || -z "$normal_deps" ]]; then
echo "Error: --env-name, --build-file-path, and --normal-deps are required." >&2
usage
fi
if [ -n "$LLAMA_STACK_DIR" ]; then
echo "Using llama-stack-dir=$LLAMA_STACK_DIR"
fi
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
echo "Using llama-stack-client-dir=$LLAMA_STACK_CLIENT_DIR"
fi
ensure_conda_env_python310() {
# Use only global variables set by flag parser
local python_version="3.12"
if ! is_command_available conda; then
printf "${RED}Error: conda command not found. Is Conda installed and in your PATH?${NC}" >&2
exit 1
fi
if conda env list | grep -q "^${env_name} "; then
printf "Conda environment '${env_name}' exists. Checking Python version...\n"
current_version=$(conda run -n "${env_name}" python --version 2>&1 | cut -d' ' -f2 | cut -d'.' -f1,2)
if [ "$current_version" = "$python_version" ]; then
printf "Environment '${env_name}' already has Python ${python_version}. No action needed.\n"
else
printf "Updating environment '${env_name}' to Python ${python_version}...\n"
conda install -n "${env_name}" python="${python_version}" -y
fi
else
printf "Conda environment '${env_name}' does not exist. Creating with Python ${python_version}...\n"
conda create -n "${env_name}" python="${python_version}" -y
fi
eval "$(conda shell.bash hook)"
conda deactivate && conda activate "${env_name}"
"$CONDA_PREFIX"/bin/pip install uv
if [ -n "$TEST_PYPI_VERSION" ]; then
uv pip install fastapi libcst
uv pip install --extra-index-url https://test.pypi.org/simple/ \
llama-stack=="$TEST_PYPI_VERSION" \
"$normal_deps"
if [ -n "$optional_deps" ]; then
IFS='#' read -ra parts <<<"$optional_deps"
for part in "${parts[@]}"; do
echo "$part"
uv pip install $part
done
fi
if [ -n "$external_provider_deps" ]; then
IFS='#' read -ra parts <<<"$external_provider_deps"
for part in "${parts[@]}"; do
echo "$part"
uv pip install "$part"
done
fi
else
if [ -n "$LLAMA_STACK_DIR" ]; then
if [ ! -d "$LLAMA_STACK_DIR" ]; then
printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: $LLAMA_STACK_DIR${NC}\n" >&2
exit 1
fi
printf "Installing from LLAMA_STACK_DIR: $LLAMA_STACK_DIR\n"
uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"
else
PYPI_VERSION="${PYPI_VERSION:-}"
if [ -n "$PYPI_VERSION" ]; then
SPEC_VERSION="llama-stack==${PYPI_VERSION}"
else
SPEC_VERSION="llama-stack"
fi
uv pip install --no-cache-dir "$SPEC_VERSION"
fi
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: $LLAMA_STACK_CLIENT_DIR${NC}\n" >&2
exit 1
fi
printf "Installing from LLAMA_STACK_CLIENT_DIR: $LLAMA_STACK_CLIENT_DIR\n"
uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
fi
printf "Installing pip dependencies\n"
uv pip install $normal_deps
if [ -n "$optional_deps" ]; then
IFS='#' read -ra parts <<<"$optional_deps"
for part in "${parts[@]}"; do
echo "$part"
uv pip install $part
done
fi
if [ -n "$external_provider_deps" ]; then
IFS='#' read -ra parts <<<"$external_provider_deps"
for part in "${parts[@]}"; do
echo "Getting provider spec for module: $part and installing dependencies"
package_name=$(echo "$part" | sed 's/[<>=!].*//')
python3 -c "
import importlib
import sys
try:
module = importlib.import_module(f'$package_name.provider')
spec = module.get_provider_spec()
if hasattr(spec, 'pip_packages') and spec.pip_packages:
print('\\n'.join(spec.pip_packages))
except Exception as e:
print(f'Error getting provider spec for $package_name: {e}', file=sys.stderr)
" | uv pip install -r -
done
fi
fi
mv "$build_file_path" "$CONDA_PREFIX"/llamastack-build.yaml
echo "Build spec configuration saved at $CONDA_PREFIX/llamastack-build.yaml"
}
ensure_conda_env_python310 "$env_name" "$build_file_path" "$normal_deps" "$optional_deps" "$external_provider_deps"

View file

@ -151,23 +151,37 @@ run() {
fi
else
if [ -n "$LLAMA_STACK_DIR" ]; then
if [ ! -d "$LLAMA_STACK_DIR" ]; then
# only warn if DIR does not start with "git+"
if [ ! -d "$LLAMA_STACK_DIR" ] && [[ "$LLAMA_STACK_DIR" != git+* ]]; then
printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_DIR" >&2
exit 1
fi
printf "Installing from LLAMA_STACK_DIR: %s\n" "$LLAMA_STACK_DIR"
uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"
# editable only if LLAMA_STACK_DIR does not start with "git+"
if [[ "$LLAMA_STACK_DIR" != git+* ]]; then
EDITABLE="-e"
else
EDITABLE=""
fi
uv pip install --no-cache-dir $EDITABLE "$LLAMA_STACK_DIR"
else
uv pip install --no-cache-dir llama-stack
fi
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
# only warn if DIR does not start with "git+"
if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ] && [[ "$LLAMA_STACK_CLIENT_DIR" != git+* ]]; then
printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_CLIENT_DIR" >&2
exit 1
fi
printf "Installing from LLAMA_STACK_CLIENT_DIR: %s\n" "$LLAMA_STACK_CLIENT_DIR"
uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
# editable only if LLAMA_STACK_CLIENT_DIR does not start with "git+"
if [[ "$LLAMA_STACK_CLIENT_DIR" != git+* ]]; then
EDITABLE="-e"
else
EDITABLE=""
fi
uv pip install --no-cache-dir $EDITABLE "$LLAMA_STACK_CLIENT_DIR"
fi
printf "Installing pip dependencies\n"

View file

@ -3,7 +3,6 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import logging
import textwrap
from typing import Any
@ -21,9 +20,10 @@ from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
from llama_stack.core.utils.config_dirs import EXTERNAL_PROVIDERS_DIR
from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.core.utils.prompt_for_config import prompt_for_config
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import Api, ProviderSpec
logger = logging.getLogger(__name__)
logger = get_logger(name=__name__, category="core")
def configure_single_provider(registry: dict[str, ProviderSpec], provider: Provider) -> Provider:

View file

@ -318,6 +318,41 @@ class QuotaConfig(BaseModel):
period: QuotaPeriod = Field(default=QuotaPeriod.DAY, description="Quota period to set")
class CORSConfig(BaseModel):
allow_origins: list[str] = Field(default_factory=list)
allow_origin_regex: str | None = Field(default=None)
allow_methods: list[str] = Field(default=["OPTIONS"])
allow_headers: list[str] = Field(default_factory=list)
allow_credentials: bool = Field(default=False)
expose_headers: list[str] = Field(default_factory=list)
max_age: int = Field(default=600, ge=0)
@model_validator(mode="after")
def validate_credentials_config(self) -> Self:
if self.allow_credentials and (self.allow_origins == ["*"] or "*" in self.allow_origins):
raise ValueError("Cannot use wildcard origins with credentials enabled")
return self
def process_cors_config(cors_config: bool | CORSConfig | None) -> CORSConfig | None:
if cors_config is False or cors_config is None:
return None
if cors_config is True:
# dev mode: allow localhost on any port
return CORSConfig(
allow_origins=[],
allow_origin_regex=r"https?://localhost:\d+",
allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
allow_headers=["Content-Type", "Authorization", "X-Requested-With"],
)
if isinstance(cors_config, CORSConfig):
return cors_config
raise ValueError(f"Expected bool or CORSConfig, got {type(cors_config).__name__}")
class ServerConfig(BaseModel):
port: int = Field(
default=8321,
@ -349,6 +384,12 @@ class ServerConfig(BaseModel):
default=None,
description="Per client quota request configuration",
)
cors: bool | CORSConfig | None = Field(
default=None,
description="CORS configuration for cross-origin requests. Can be:\n"
"- true: Enable localhost CORS for development\n"
"- {allow_origins: [...], allow_methods: [...], ...}: Full configuration",
)
class StackRunConfig(BaseModel):

View file

@ -7,7 +7,7 @@
import asyncio
import inspect
import json
import logging
import logging # allow-direct-logging
import os
import sys
from concurrent.futures import ThreadPoolExecutor
@ -48,6 +48,7 @@ from llama_stack.core.stack import (
from llama_stack.core.utils.config import redact_sensitive_fields
from llama_stack.core.utils.context import preserve_contexts_async_generator
from llama_stack.core.utils.exec import in_notebook
from llama_stack.log import get_logger
from llama_stack.providers.utils.telemetry.tracing import (
CURRENT_TRACE_CONTEXT,
end_trace,
@ -55,7 +56,7 @@ from llama_stack.providers.utils.telemetry.tracing import (
start_trace,
)
logger = logging.getLogger(__name__)
logger = get_logger(name=__name__, category="core")
T = TypeVar("T")
@ -145,39 +146,26 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
):
super().__init__()
self.async_client = AsyncLlamaStackAsLibraryClient(
config_path_or_distro_name, custom_provider_registry, provider_data
config_path_or_distro_name, custom_provider_registry, provider_data, skip_logger_removal
)
self.pool_executor = ThreadPoolExecutor(max_workers=4)
self.skip_logger_removal = skip_logger_removal
self.provider_data = provider_data
self.loop = asyncio.new_event_loop()
def initialize(self):
if in_notebook():
import nest_asyncio
nest_asyncio.apply()
if not self.skip_logger_removal:
self._remove_root_logger_handlers()
# use a new event loop to avoid interfering with the main event loop
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
return loop.run_until_complete(self.async_client.initialize())
loop.run_until_complete(self.async_client.initialize())
finally:
asyncio.set_event_loop(None)
def _remove_root_logger_handlers(self):
def initialize(self):
"""
Remove all handlers from the root logger. Needed to avoid polluting the console with logs.
Deprecated method for backward compatibility.
"""
root_logger = logging.getLogger()
for handler in root_logger.handlers[:]:
root_logger.removeHandler(handler)
logger.info(f"Removed handler {handler.__class__.__name__} from root logger")
pass
def request(self, *args, **kwargs):
loop = self.loop
@ -215,6 +203,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
config_path_or_distro_name: str,
custom_provider_registry: ProviderRegistry | None = None,
provider_data: dict[str, Any] | None = None,
skip_logger_removal: bool = False,
):
super().__init__()
# when using the library client, we should not log to console since many
@ -222,6 +211,13 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
current_sinks = os.environ.get("TELEMETRY_SINKS", "sqlite").split(",")
os.environ["TELEMETRY_SINKS"] = ",".join(sink for sink in current_sinks if sink != "console")
if in_notebook():
import nest_asyncio
nest_asyncio.apply()
if not skip_logger_removal:
self._remove_root_logger_handlers()
if config_path_or_distro_name.endswith(".yaml"):
config_path = Path(config_path_or_distro_name)
if not config_path.exists():
@ -238,7 +234,24 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
self.provider_data = provider_data
self.route_impls: RouteImpls | None = None # Initialize to None to prevent AttributeError
def _remove_root_logger_handlers(self):
"""
Remove all handlers from the root logger. Needed to avoid polluting the console with logs.
"""
root_logger = logging.getLogger()
for handler in root_logger.handlers[:]:
root_logger.removeHandler(handler)
logger.info(f"Removed handler {handler.__class__.__name__} from root logger")
async def initialize(self) -> bool:
"""
Initialize the async client.
Returns:
bool: True if initialization was successful
"""
try:
self.route_impls = None
self.impls = await construct_stack(self.config, self.custom_provider_registry)

View file

@ -6,15 +6,15 @@
import contextvars
import json
import logging
from contextlib import AbstractContextManager
from typing import Any
from llama_stack.core.datatypes import User
from llama_stack.log import get_logger
from .utils.dynamic import instantiate_class_type
log = logging.getLogger(__name__)
log = get_logger(name=__name__, category="core")
# Context variable for request provider data and auth attributes
PROVIDER_DATA_VAR = contextvars.ContextVar("provider_data", default=None)

View file

@ -8,6 +8,7 @@ import inspect
from typing import Any
from llama_stack.apis.agents import Agents
from llama_stack.apis.batches import Batches
from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
@ -75,6 +76,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
Api.agents: Agents,
Api.inference: Inference,
Api.inspect: Inspect,
Api.batches: Batches,
Api.vector_io: VectorIO,
Api.vector_dbs: VectorDBs,
Api.models: Models,

View file

@ -12,7 +12,7 @@ from llama_stack.apis.datasets import DatasetPurpose, DataSource
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import RoutingTable
logger = get_logger(name=__name__, category="core")
logger = get_logger(name=__name__, category="core::routers")
class DatasetIORouter(DatasetIO):

View file

@ -16,7 +16,7 @@ from llama_stack.apis.scoring import (
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import RoutingTable
logger = get_logger(name=__name__, category="core")
logger = get_logger(name=__name__, category="core::routers")
class ScoringRouter(Scoring):

View file

@ -65,7 +65,7 @@ from llama_stack.providers.datatypes import HealthResponse, HealthStatus, Routin
from llama_stack.providers.utils.inference.inference_store import InferenceStore
from llama_stack.providers.utils.telemetry.tracing import get_current_span
logger = get_logger(name=__name__, category="inference")
logger = get_logger(name=__name__, category="core::routers")
class InferenceRouter(Inference):

View file

@ -6,16 +6,14 @@
from typing import Any
from llama_stack.apis.inference import (
Message,
)
from llama_stack.apis.inference import Message
from llama_stack.apis.safety import RunShieldResponse, Safety
from llama_stack.apis.safety.safety import ModerationObject
from llama_stack.apis.shields import Shield
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import RoutingTable
logger = get_logger(name=__name__, category="core")
logger = get_logger(name=__name__, category="core::routers")
class SafetyRouter(Safety):
@ -68,6 +66,7 @@ class SafetyRouter(Safety):
list_shields_response = await self.routing_table.list_shields()
matches = [s.identifier for s in list_shields_response.data if model == s.provider_resource_id]
if not matches:
raise ValueError(f"No shield associated with provider_resource id {model}")
if len(matches) > 1:

View file

@ -22,7 +22,7 @@ from llama_stack.log import get_logger
from ..routing_tables.toolgroups import ToolGroupsRoutingTable
logger = get_logger(name=__name__, category="core")
logger = get_logger(name=__name__, category="core::routers")
class ToolRuntimeRouter(ToolRuntime):

View file

@ -30,7 +30,7 @@ from llama_stack.apis.vector_io import (
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
logger = get_logger(name=__name__, category="core")
logger = get_logger(name=__name__, category="core::routers")
class VectorIORouter(VectorIO):

View file

@ -14,7 +14,7 @@ from llama_stack.log import get_logger
from .common import CommonRoutingTableImpl
logger = get_logger(name=__name__, category="core")
logger = get_logger(name=__name__, category="core::routing_tables")
class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):

View file

@ -23,7 +23,7 @@ from llama_stack.core.store import DistributionRegistry
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import Api, RoutingTable
logger = get_logger(name=__name__, category="core")
logger = get_logger(name=__name__, category="core::routing_tables")
def get_impl_api(p: Any) -> Api:

View file

@ -26,7 +26,7 @@ from llama_stack.log import get_logger
from .common import CommonRoutingTableImpl
logger = get_logger(name=__name__, category="core")
logger = get_logger(name=__name__, category="core::routing_tables")
class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):

View file

@ -17,7 +17,7 @@ from llama_stack.log import get_logger
from .common import CommonRoutingTableImpl, lookup_model
logger = get_logger(name=__name__, category="core")
logger = get_logger(name=__name__, category="core::routing_tables")
class ModelsRoutingTable(CommonRoutingTableImpl, Models):

View file

@ -19,7 +19,7 @@ from llama_stack.log import get_logger
from .common import CommonRoutingTableImpl
logger = get_logger(name=__name__, category="core")
logger = get_logger(name=__name__, category="core::routing_tables")
class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):

View file

@ -15,7 +15,7 @@ from llama_stack.log import get_logger
from .common import CommonRoutingTableImpl
logger = get_logger(name=__name__, category="core")
logger = get_logger(name=__name__, category="core::routing_tables")
class ShieldsRoutingTable(CommonRoutingTableImpl, Shields):

View file

@ -14,7 +14,7 @@ from llama_stack.log import get_logger
from .common import CommonRoutingTableImpl
logger = get_logger(name=__name__, category="core")
logger = get_logger(name=__name__, category="core::routing_tables")
def parse_toolgroup_from_toolgroup_name_pair(toolgroup_name_with_maybe_tool_name: str) -> str | None:

View file

@ -30,7 +30,7 @@ from llama_stack.log import get_logger
from .common import CommonRoutingTableImpl, lookup_model
logger = get_logger(name=__name__, category="core")
logger = get_logger(name=__name__, category="core::routing_tables")
class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):

View file

@ -15,7 +15,7 @@ from llama_stack.core.server.auth_providers import create_auth_provider
from llama_stack.core.server.routes import find_matching_route, initialize_route_impls
from llama_stack.log import get_logger
logger = get_logger(name=__name__, category="auth")
logger = get_logger(name=__name__, category="core::auth")
class AuthenticationMiddleware:

View file

@ -23,7 +23,7 @@ from llama_stack.core.datatypes import (
)
from llama_stack.log import get_logger
logger = get_logger(name=__name__, category="auth")
logger = get_logger(name=__name__, category="core::auth")
class AuthResponse(BaseModel):

View file

@ -15,7 +15,7 @@ from llama_stack.providers.utils.kvstore.api import KVStore
from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
from llama_stack.providers.utils.kvstore.kvstore import kvstore_impl
logger = get_logger(name=__name__, category="quota")
logger = get_logger(name=__name__, category="core::server")
class QuotaMiddleware:

View file

@ -9,7 +9,7 @@ import asyncio
import functools
import inspect
import json
import logging
import logging # allow-direct-logging
import os
import ssl
import sys
@ -28,10 +28,12 @@ from aiohttp import hdrs
from fastapi import Body, FastAPI, HTTPException, Request, Response
from fastapi import Path as FastapiPath
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from openai import BadRequestError
from pydantic import BaseModel, ValidationError
from llama_stack.apis.common.errors import ConflictError, ResourceNotFoundError
from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.cli.utils import add_config_distro_args, get_config_from_args
from llama_stack.core.access_control.access_control import AccessDeniedError
@ -39,6 +41,7 @@ from llama_stack.core.datatypes import (
AuthenticationRequiredError,
LoggingConfig,
StackRunConfig,
process_cors_config,
)
from llama_stack.core.distribution import builtin_automatically_routed_apis
from llama_stack.core.external import ExternalApiSpec, load_external_apis
@ -81,7 +84,7 @@ from .quota import QuotaMiddleware
REPO_ROOT = Path(__file__).parent.parent.parent.parent
logger = get_logger(name=__name__, category="server")
logger = get_logger(name=__name__, category="core::server")
def warn_with_traceback(message, category, filename, lineno, file=None, line=None):
@ -128,6 +131,10 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationErro
]
},
)
elif isinstance(exc, ConflictError):
return HTTPException(status_code=409, detail=str(exc))
elif isinstance(exc, ResourceNotFoundError):
return HTTPException(status_code=404, detail=str(exc))
elif isinstance(exc, ValueError):
return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=f"Invalid value: {str(exc)}")
elif isinstance(exc, BadRequestError):
@ -408,7 +415,7 @@ def main(args: argparse.Namespace | None = None):
config_contents = yaml.safe_load(fp)
if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")):
logger_config = LoggingConfig(**cfg)
logger = get_logger(name=__name__, category="server", config=logger_config)
logger = get_logger(name=__name__, category="core::server", config=logger_config)
if args.env:
for env_pair in args.env:
try:
@ -478,6 +485,12 @@ def main(args: argparse.Namespace | None = None):
window_seconds=window_seconds,
)
if config.server.cors:
logger.info("Enabling CORS")
cors_config = process_cors_config(config.server.cors)
if cors_config:
app.add_middleware(CORSMiddleware, **cors_config.model_dump())
if Api.telemetry in impls:
setup_logger(impls[Api.telemetry])
else:

View file

@ -16,7 +16,7 @@ from llama_stack.log import get_logger
from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl
from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
logger = get_logger(__name__, category="core")
logger = get_logger(__name__, category="core::registry")
class DistributionRegistry(Protocol):

View file

@ -10,7 +10,7 @@ from pathlib import Path
from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR
from llama_stack.log import get_logger
logger = get_logger(name=__name__, category="config_resolution")
logger = get_logger(name=__name__, category="core")
DISTRO_DIR = Path(__file__).parent.parent.parent.parent / "llama_stack" / "distributions"

View file

@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import logging
import importlib
import os
import signal
import subprocess
@ -12,9 +12,9 @@ import sys
from termcolor import cprint
log = logging.getLogger(__name__)
from llama_stack.log import get_logger
import importlib
log = get_logger(name=__name__, category="core")
def formulate_run_args(image_type: str, image_name: str) -> list:

View file

@ -6,7 +6,6 @@
import inspect
import json
import logging
from enum import Enum
from typing import Annotated, Any, Literal, Union, get_args, get_origin
@ -14,7 +13,9 @@ from pydantic import BaseModel
from pydantic.fields import FieldInfo
from pydantic_core import PydanticUndefinedType
log = logging.getLogger(__name__)
from llama_stack.log import get_logger
log = get_logger(name=__name__, category="core")
def is_list_of_primitives(field_type):

View file

@ -28,12 +28,13 @@ distribution_spec:
- provider_type: inline::localfs
safety:
- provider_type: inline::llama-guard
- provider_type: inline::code-scanner
agents:
- provider_type: inline::meta-reference
telemetry:
- provider_type: inline::meta-reference
post_training:
- provider_type: inline::huggingface
- provider_type: inline::huggingface-cpu
eval:
- provider_type: inline::meta-reference
datasetio:
@ -48,6 +49,8 @@ distribution_spec:
- provider_type: remote::tavily-search
- provider_type: inline::rag-runtime
- provider_type: remote::model-context-protocol
batches:
- provider_type: inline::reference
image_type: venv
additional_pip_packages:
- aiosqlite

View file

@ -2,6 +2,7 @@ version: 2
image_name: ci-tests
apis:
- agents
- batches
- datasetio
- eval
- files
@ -134,6 +135,8 @@ providers:
provider_type: inline::llama-guard
config:
excluded_categories: []
- provider_id: code-scanner
provider_type: inline::code-scanner
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
@ -153,8 +156,8 @@ providers:
sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/trace_store.db
otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
post_training:
- provider_id: huggingface
provider_type: inline::huggingface
- provider_id: huggingface-cpu
provider_type: inline::huggingface-cpu
config:
checkpoint_format: huggingface
distributed_backend: null
@ -204,6 +207,13 @@ providers:
provider_type: inline::rag-runtime
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
batches:
- provider_id: reference
provider_type: inline::reference
config:
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/batches.db
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/registry.db
@ -215,6 +225,9 @@ shields:
- shield_id: llama-guard
provider_id: ${env.SAFETY_MODEL:+llama-guard}
provider_shield_id: ${env.SAFETY_MODEL:=}
- shield_id: code-scanner
provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
vector_dbs: []
datasets: []
scoring_fns: []

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .starter_gpu import get_distribution_template # noqa: F401

View file

@ -0,0 +1,59 @@
version: 2
distribution_spec:
description: Quick start template for running Llama Stack with several popular providers.
This distribution is intended for GPU-enabled environments.
providers:
inference:
- provider_type: remote::cerebras
- provider_type: remote::ollama
- provider_type: remote::vllm
- provider_type: remote::tgi
- provider_type: remote::fireworks
- provider_type: remote::together
- provider_type: remote::bedrock
- provider_type: remote::nvidia
- provider_type: remote::openai
- provider_type: remote::anthropic
- provider_type: remote::gemini
- provider_type: remote::vertexai
- provider_type: remote::groq
- provider_type: remote::sambanova
- provider_type: inline::sentence-transformers
vector_io:
- provider_type: inline::faiss
- provider_type: inline::sqlite-vec
- provider_type: inline::milvus
- provider_type: remote::chromadb
- provider_type: remote::pgvector
files:
- provider_type: inline::localfs
safety:
- provider_type: inline::llama-guard
- provider_type: inline::code-scanner
agents:
- provider_type: inline::meta-reference
telemetry:
- provider_type: inline::meta-reference
post_training:
- provider_type: inline::torchtune-gpu
eval:
- provider_type: inline::meta-reference
datasetio:
- provider_type: remote::huggingface
- provider_type: inline::localfs
scoring:
- provider_type: inline::basic
- provider_type: inline::llm-as-judge
- provider_type: inline::braintrust
tool_runtime:
- provider_type: remote::brave-search
- provider_type: remote::tavily-search
- provider_type: inline::rag-runtime
- provider_type: remote::model-context-protocol
batches:
- provider_type: inline::reference
image_type: venv
additional_pip_packages:
- aiosqlite
- asyncpg
- sqlalchemy[asyncio]

View file

@ -0,0 +1,238 @@
version: 2
image_name: starter-gpu
apis:
- agents
- batches
- datasetio
- eval
- files
- inference
- post_training
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
providers:
inference:
- provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
provider_type: remote::cerebras
config:
base_url: https://api.cerebras.ai
api_key: ${env.CEREBRAS_API_KEY:=}
- provider_id: ${env.OLLAMA_URL:+ollama}
provider_type: remote::ollama
config:
url: ${env.OLLAMA_URL:=http://localhost:11434}
- provider_id: ${env.VLLM_URL:+vllm}
provider_type: remote::vllm
config:
url: ${env.VLLM_URL:=}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: ${env.TGI_URL:+tgi}
provider_type: remote::tgi
config:
url: ${env.TGI_URL:=}
- provider_id: fireworks
provider_type: remote::fireworks
config:
url: https://api.fireworks.ai/inference/v1
api_key: ${env.FIREWORKS_API_KEY:=}
- provider_id: together
provider_type: remote::together
config:
url: https://api.together.xyz/v1
api_key: ${env.TOGETHER_API_KEY:=}
- provider_id: bedrock
provider_type: remote::bedrock
- provider_id: ${env.NVIDIA_API_KEY:+nvidia}
provider_type: remote::nvidia
config:
url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
- provider_id: openai
provider_type: remote::openai
config:
api_key: ${env.OPENAI_API_KEY:=}
base_url: ${env.OPENAI_BASE_URL:=https://api.openai.com/v1}
- provider_id: anthropic
provider_type: remote::anthropic
config:
api_key: ${env.ANTHROPIC_API_KEY:=}
- provider_id: gemini
provider_type: remote::gemini
config:
api_key: ${env.GEMINI_API_KEY:=}
- provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}
provider_type: remote::vertexai
config:
project: ${env.VERTEX_AI_PROJECT:=}
location: ${env.VERTEX_AI_LOCATION:=us-central1}
- provider_id: groq
provider_type: remote::groq
config:
url: https://api.groq.com
api_key: ${env.GROQ_API_KEY:=}
- provider_id: sambanova
provider_type: remote::sambanova
config:
url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
vector_io:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db
- provider_id: sqlite-vec
provider_type: inline::sqlite-vec
config:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db
- provider_id: ${env.MILVUS_URL:+milvus}
provider_type: inline::milvus
config:
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db
- provider_id: ${env.CHROMADB_URL:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:=}
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db
- provider_id: ${env.PGVECTOR_DB:+pgvector}
provider_type: remote::pgvector
config:
host: ${env.PGVECTOR_HOST:=localhost}
port: ${env.PGVECTOR_PORT:=5432}
db: ${env.PGVECTOR_DB:=}
user: ${env.PGVECTOR_USER:=}
password: ${env.PGVECTOR_PASSWORD:=}
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
- provider_id: code-scanner
provider_type: inline::code-scanner
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/agents_store.db
responses_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/responses_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/trace_store.db
otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
post_training:
- provider_id: torchtune-gpu
provider_type: inline::torchtune-gpu
config:
checkpoint_format: meta
eval:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/meta_reference_eval.db
datasetio:
- provider_id: huggingface
provider_type: remote::huggingface
config:
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/huggingface_datasetio.db
- provider_id: localfs
provider_type: inline::localfs
config:
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/localfs_datasetio.db
scoring:
- provider_id: basic
provider_type: inline::basic
- provider_id: llm-as-judge
provider_type: inline::llm-as-judge
- provider_id: braintrust
provider_type: inline::braintrust
config:
openai_api_key: ${env.OPENAI_API_KEY:=}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:=}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:=}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
batches:
- provider_id: reference
provider_type: inline::reference
config:
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/batches.db
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/registry.db
inference_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/inference_store.db
models: []
shields:
- shield_id: llama-guard
provider_id: ${env.SAFETY_MODEL:+llama-guard}
provider_shield_id: ${env.SAFETY_MODEL:=}
- shield_id: code-scanner
provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
server:
port: 8321

Some files were not shown because too many files have changed in this diff Show more