Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 20:14:13 +00:00)

Merge branch 'main' into content-extension

This commit is contained in commit 3e11e1472c.

334 changed files with 22841 additions and 8940 deletions
.github/TRIAGERS.md (vendored, 2 changes)

@@ -1,2 +1,2 @@
 # This file documents Triage members in the Llama Stack community
-@bbrowning @franciscojavierarceo @leseb
+@franciscojavierarceo
.github/actions/run-and-record-tests/action.yml (vendored, 22 changes)

@@ -2,9 +2,13 @@ name: 'Run and Record Tests'
 description: 'Run integration tests and handle recording/artifact upload'

 inputs:
-  test-types:
-    description: 'JSON array of test types to run'
+  test-subdirs:
+    description: 'Comma-separated list of test subdirectories to run'
     required: true
+  test-pattern:
+    description: 'Regex pattern to pass to pytest -k'
+    required: false
+    default: ''
   stack-config:
     description: 'Stack configuration to use'
     required: true
@@ -32,12 +36,14 @@ runs:
     - name: Run Integration Tests
       shell: bash
       run: |
-        ./scripts/integration-tests.sh \
+        uv run --no-sync ./scripts/integration-tests.sh \
           --stack-config '${{ inputs.stack-config }}' \
           --provider '${{ inputs.provider }}' \
-          --test-types '${{ inputs.test-types }}' \
+          --test-subdirs '${{ inputs.test-subdirs }}' \
+          --test-pattern '${{ inputs.test-pattern }}' \
           --inference-mode '${{ inputs.inference-mode }}' \
-          ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }}
+          ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \
+          | tee pytest-${{ inputs.inference-mode }}.log


     - name: Commit and push recordings
@@ -57,10 +63,10 @@ runs:
           git commit -m "Recordings update from CI"
         fi

-        git fetch origin ${{ github.event.pull_request.head.ref }}
-        git rebase origin/${{ github.event.pull_request.head.ref }}
+        git fetch origin ${{ github.ref_name }}
+        git rebase origin/${{ github.ref_name }}
         echo "Rebased successfully"
-        git push origin HEAD:${{ github.event.pull_request.head.ref }}
+        git push origin HEAD:${{ github.ref_name }}
         echo "Pushed successfully"
       else
         echo "No recording changes"
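For reference, a local invocation of the updated script might look like the sketch below. This is only an illustration built from the flags visible in the hunk above; the --test-subdirs value and the log file name are example assumptions, not taken from this diff.

  # Illustrative local run mirroring the CI step above (flag values are examples only).
  uv run --no-sync ./scripts/integration-tests.sh \
    --stack-config 'server:ci-tests' \
    --provider 'ollama' \
    --test-subdirs 'inference' \
    --test-pattern '' \
    --inference-mode 'replay' \
    | tee pytest-replay.log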
.github/actions/setup-runner/action.yml (vendored, 9 changes)

@@ -16,14 +16,16 @@ runs:
       uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
       with:
         python-version: ${{ inputs.python-version }}
-        activate-environment: true
         version: 0.7.6

     - name: Install dependencies
       shell: bash
       run: |
+        echo "Updating project dependencies via uv sync"
         uv sync --all-groups
-        uv pip install ollama faiss-cpu
+        echo "Installing ad-hoc dependencies"
+        uv pip install faiss-cpu

         # Install llama-stack-client-python based on the client-version input
         if [ "${{ inputs.client-version }}" = "latest" ]; then
@@ -37,4 +39,5 @@ runs:
           exit 1
         fi

-        uv pip install -e .
+        echo "Installed llama packages"
+        uv pip list | grep llama
@@ -53,7 +53,22 @@ runs:
     - name: Build Llama Stack
       shell: bash
       run: |
-        uv run llama stack build --template ci-tests --image-type venv
+        # Install llama-stack-client-python based on the client-version input
+        if [ "${{ inputs.client-version }}" = "latest" ]; then
+          echo "Installing latest llama-stack-client-python from main branch"
+          export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@main
+        elif [ "${{ inputs.client-version }}" = "published" ]; then
+          echo "Installing published llama-stack-client-python from PyPI"
+          unset LLAMA_STACK_CLIENT_DIR
+        else
+          echo "Invalid client-version: ${{ inputs.client-version }}"
+          exit 1
+        fi
+
+        echo "Building Llama Stack"
+
+        LLAMA_STACK_DIR=. \
+        uv run --no-sync llama stack build --template ci-tests --image-type venv

     - name: Configure git for commits
       shell: bash
.github/dependabot.yml (vendored, 12 changes)

@@ -9,6 +9,7 @@ updates:
       day: "saturday"
     commit-message:
       prefix: chore(github-deps)
+
   - package-ecosystem: "uv"
     directory: "/"
     schedule:
@@ -19,3 +20,14 @@ updates:
       - python
     commit-message:
       prefix: chore(python-deps)
+
+  - package-ecosystem: npm
+    directory: "/llama_stack/ui"
+    schedule:
+      interval: "weekly"
+      day: "saturday"
+    labels:
+      - type/dependencies
+      - javascript
+    commit-message:
+      prefix: chore(ui-deps)
.github/workflows/README.md (vendored, 1 change)

@@ -18,5 +18,6 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
 | Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action |
 | Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module |
 | Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms |
+| UI Tests | [ui-unit-tests.yml](ui-unit-tests.yml) | Run the UI test suite |
 | Unit Tests | [unit-tests.yml](unit-tests.yml) | Run the unit test suite |
 | Update ReadTheDocs | [update-readthedocs.yml](update-readthedocs.yml) | Update the Llama Stack ReadTheDocs site |
.github/workflows/changelog.yml (vendored, 2 changes)

@@ -17,7 +17,7 @@ jobs:
       pull-requests: write # for peter-evans/create-pull-request to create a PR
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           ref: main
           fetch-depth: 0
.github/workflows/install-script-ci.yml (vendored, 7 changes)

@@ -16,21 +16,22 @@ jobs:
   lint:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # 5.0.0
      - name: Run ShellCheck on install.sh
        run: shellcheck scripts/install.sh
   smoke-test-on-dev:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner

       - name: Build a single provider
         run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template starter --image-type container --image-name test
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync \
+            llama stack build --template starter --image-type container --image-name test

       - name: Run installer end-to-end
         run: |
.github/workflows/integration-auth-tests.yml (vendored, 5 changes)

@@ -10,6 +10,7 @@ on:
     paths:
       - 'distributions/**'
       - 'llama_stack/**'
+      - '!llama_stack/ui/**'
       - 'tests/integration/**'
       - 'uv.lock'
       - 'pyproject.toml'
@@ -17,7 +18,7 @@ on:
       - '.github/workflows/integration-auth-tests.yml' # This workflow

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
@@ -30,7 +31,7 @@ jobs:

     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -16,7 +16,7 @@ on:
       - '.github/workflows/integration-sql-store-tests.yml' # This workflow

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
@@ -44,7 +44,7 @@ jobs:

     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
.github/workflows/integration-tests.yml (vendored, 38 changes)

@@ -5,11 +5,12 @@ run-name: Run the integration test suite from tests/integration in replay mode
 on:
   push:
     branches: [ main ]
-  pull_request_target:
+  pull_request:
     branches: [ main ]
     types: [opened, synchronize, reopened]
     paths:
       - 'llama_stack/**'
+      - '!llama_stack/ui/**'
       - 'tests/**'
       - 'uv.lock'
       - 'pyproject.toml'
@@ -31,35 +32,23 @@ on:
       description: 'Test against a specific provider'
       type: string
       default: 'ollama'
+    test-subdirs:
+      description: 'Comma-separated list of test subdirectories to run'
+      type: string
+      default: ''
+    test-pattern:
+      description: 'Regex pattern to pass to pytest -k'
+      type: string
+      default: ''

 concurrency:
   # Skip concurrency for pushes to main - each commit should be tested independently
-  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.event.pull_request.number }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
-  discover-tests:
-    runs-on: ubuntu-latest
-    outputs:
-      test-types: ${{ steps.generate-test-types.outputs.test-types }}
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Generate test types
-        id: generate-test-types
-        run: |
-          # Get test directories dynamically, excluding non-test directories
-          # NOTE: we are excluding post_training since the tests take too long
-          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d |
-            sed 's|tests/integration/||' |
-            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" |
-            sort | jq -R -s -c 'split("\n")[:-1]')
-          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
-
   run-replay-mode-tests:
-    needs: discover-tests
     runs-on: ubuntu-latest
     name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}
@@ -76,7 +65,7 @@ jobs:

     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Setup test environment
         uses: ./.github/actions/setup-test-environment
@@ -90,7 +79,8 @@ jobs:
       - name: Run tests
         uses: ./.github/actions/run-and-record-tests
         with:
-          test-types: ${{ needs.discover-tests.outputs.test-types }}
+          test-subdirs: ${{ inputs.test-subdirs }}
+          test-pattern: ${{ inputs.test-pattern }}
           stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
           provider: ${{ matrix.provider }}
           inference-mode: 'replay'
@@ -9,14 +9,17 @@ on:
     branches: [ main ]
     paths:
       - 'llama_stack/**'
+      - '!llama_stack/ui/**'
       - 'tests/integration/vector_io/**'
       - 'uv.lock'
       - 'pyproject.toml'
       - 'requirements.txt'
       - '.github/workflows/integration-vector-io-tests.yml' # This workflow
+  schedule:
+    - cron: '0 0 * * *' # (test on python 3.13) Daily at 12 AM UTC

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
@@ -25,12 +28,12 @@ jobs:
     strategy:
       matrix:
         vector-io-provider: ["inline::faiss", "inline::sqlite-vec", "inline::milvus", "remote::chromadb", "remote::pgvector", "remote::weaviate", "remote::qdrant"]
-        python-version: ["3.12", "3.13"]
+        python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
       fail-fast: false # we want to run all tests regardless of failure

     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -141,7 +144,7 @@ jobs:

       - name: Build Llama Stack
         run: |
-          uv run llama stack build --template ci-tests --image-type venv
+          uv run --no-sync llama stack build --template ci-tests --image-type venv

       - name: Check Storage and Memory Available Before Tests
         if: ${{ always() }}
@@ -164,7 +167,8 @@ jobs:
           ENABLE_WEAVIATE: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'true' || '' }}
           WEAVIATE_CLUSTER_URL: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'localhost:8080' || '' }}
         run: |
-          uv run pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
+          uv run --no-sync \
+            pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
             tests/integration/vector_io \
             --embedding-model inline::sentence-transformers/all-MiniLM-L6-v2
.github/workflows/pre-commit.yml (vendored, 15 changes)

@@ -8,7 +8,7 @@ on:
     branches: [main]

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
@@ -20,7 +20,7 @@ jobs:

     steps:
       - name: Checkout code
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           # For dependabot PRs, we need to checkout with a token that can push changes
           token: ${{ github.actor == 'dependabot[bot]' && secrets.GITHUB_TOKEN || github.token }}
@@ -36,6 +36,17 @@ jobs:
             **/requirements*.txt
             .pre-commit-config.yaml

+      - name: Set up Node.js
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+        with:
+          node-version: '20'
+          cache: 'npm'
+          cache-dependency-path: 'llama_stack/ui/'
+
+      - name: Install npm dependencies
+        run: npm ci
+        working-directory: llama_stack/ui
+
       - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
         continue-on-error: true
         env:
.github/workflows/providers-build.yml (vendored, 20 changes)

@@ -26,7 +26,7 @@ on:
       - 'pyproject.toml'

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
@@ -36,7 +36,7 @@ jobs:
       distros: ${{ steps.set-matrix.outputs.distros }}
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Generate Distribution List
         id: set-matrix
@@ -55,7 +55,7 @@ jobs:

     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -79,7 +79,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -92,7 +92,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -106,6 +106,10 @@ jobs:
       - name: Inspect the container image entrypoint
         run: |
           IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
+          if [ -z "$IMAGE_ID" ]; then
+            echo "No image found"
+            exit 1
+          fi
           entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
           echo "Entrypoint: $entrypoint"
           if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
@@ -117,7 +121,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -140,6 +144,10 @@ jobs:
       - name: Inspect UBI9 image
         run: |
           IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
+          if [ -z "$IMAGE_ID" ]; then
+            echo "No image found"
+            exit 1
+          fi
           entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
           echo "Entrypoint: $entrypoint"
           if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
.github/workflows/python-build-test.yml (vendored, 6 changes)

@@ -9,6 +9,8 @@ on:
   pull_request:
     branches:
       - main
+    paths-ignore:
+      - 'llama_stack/ui/**'

 jobs:
   build:
@@ -19,10 +21,10 @@ jobs:

     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install uv
-        uses: astral-sh/setup-uv@e92bafb6253dcd438e0484186d7669ea7a8ca1cc # v6.4.3
+        uses: astral-sh/setup-uv@4959332f0f014c5280e7eac8b70c90cb574c9f9b # v6.6.0
         with:
           python-version: ${{ matrix.python-version }}
           activate-environment: true
.github/workflows/record-integration-tests.yml (vendored, 103 changes)

@@ -1,93 +1,53 @@
+# This workflow should be run manually when needing to re-record tests. This happens when you have
+# - added a new test
+# - or changed an existing test such that a new inference call is made
+# You should make a PR and then run this workflow on that PR branch. The workflow will re-record the
+# tests and commit the recordings to the PR branch.
 name: Integration Tests (Record)

 run-name: Run the integration test suite from tests/integration

 on:
-  pull_request:
-    branches: [ main ]
-    types: [opened, synchronize, labeled]
-    paths:
-      - 'llama_stack/**'
-      - 'tests/**'
-      - 'uv.lock'
-      - 'pyproject.toml'
-      - '.github/workflows/record-integration-tests.yml' # This workflow
-      - '.github/actions/setup-ollama/action.yml'
-      - '.github/actions/setup-test-environment/action.yml'
-      - '.github/actions/run-and-record-tests/action.yml'
   workflow_dispatch:
     inputs:
+      test-subdirs:
+        description: 'Comma-separated list of test subdirectories to run'
+        type: string
+        default: ''
       test-provider:
         description: 'Test against a specific provider'
         type: string
         default: 'ollama'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
+      run-vision-tests:
+        description: 'Whether to run vision tests'
+        type: boolean
+        default: false
+      test-pattern:
+        description: 'Regex pattern to pass to pytest -k'
+        type: string
+        default: ''

 jobs:
-  discover-tests:
-    if: contains(github.event.pull_request.labels.*.name, 're-record-tests') ||
-      contains(github.event.pull_request.labels.*.name, 're-record-vision-tests')
-    runs-on: ubuntu-latest
-    outputs:
-      test-types: ${{ steps.generate-test-types.outputs.test-types }}
-      matrix-modes: ${{ steps.generate-test-types.outputs.matrix-modes }}
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Generate test types
-        id: generate-test-types
-        run: |
-          # Get test directories dynamically, excluding non-test directories
-          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
-            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
-            sort | jq -R -s -c 'split("\n")[:-1]')
-          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
-
-          labels=$(gh pr view ${{ github.event.pull_request.number }} --json labels --jq '.labels[].name')
-          echo "labels=$labels"
-
-          modes_array=()
-          if [[ $labels == *"re-record-vision-tests"* ]]; then
-            modes_array+=("vision")
-          fi
-          if [[ $labels == *"re-record-tests"* ]]; then
-            modes_array+=("non-vision")
-          fi
-
-          # Convert to JSON array
-          if [ ${#modes_array[@]} -eq 0 ]; then
-            matrix_modes="[]"
-          else
-            matrix_modes=$(printf '%s\n' "${modes_array[@]}" | jq -R -s -c 'split("\n")[:-1]')
-          fi
-          echo "matrix_modes=$matrix_modes"
-          echo "matrix-modes=$matrix_modes" >> $GITHUB_OUTPUT
-
-        env:
-          GH_TOKEN: ${{ github.token }}
-
   record-tests:
-    needs: discover-tests
     runs-on: ubuntu-latest

     permissions:
       contents: write

-    strategy:
-      fail-fast: false
-      matrix:
-        mode: ${{ fromJSON(needs.discover-tests.outputs.matrix-modes) }}
-
     steps:
+      - name: Echo workflow inputs
+        run: |
+          echo "::group::Workflow Inputs"
+          echo "test-subdirs: ${{ inputs.test-subdirs }}"
+          echo "test-provider: ${{ inputs.test-provider }}"
+          echo "run-vision-tests: ${{ inputs.run-vision-tests }}"
+          echo "test-pattern: ${{ inputs.test-pattern }}"
+          echo "branch: ${{ github.ref_name }}"
+          echo "::endgroup::"
+
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
-          ref: ${{ github.event.pull_request.head.ref }}
           fetch-depth: 0

       - name: Setup test environment
@@ -96,14 +56,15 @@ jobs:
           python-version: "3.12" # Use single Python version for recording
           client-version: "latest"
           provider: ${{ inputs.test-provider || 'ollama' }}
-          run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
+          run-vision-tests: ${{ inputs.run-vision-tests }}
           inference-mode: 'record'

       - name: Run and record tests
         uses: ./.github/actions/run-and-record-tests
         with:
-          test-types: ${{ needs.discover-tests.outputs.test-types }}
+          test-pattern: ${{ inputs.test-pattern }}
+          test-subdirs: ${{ inputs.test-subdirs }}
           stack-config: 'server:ci-tests' # recording must be done with server since more tests are run
           provider: ${{ inputs.test-provider || 'ollama' }}
           inference-mode: 'record'
-          run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
+          run-vision-tests: ${{ inputs.run-vision-tests }}
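Since the recording workflow is now driven purely by workflow_dispatch inputs, it can be triggered from a PR branch roughly as sketched below. The gh CLI invocation and the branch and field values are illustrative assumptions, not part of this diff.

  # Hypothetical manual trigger of the recording workflow from a PR branch
  # (the branch name and input values are examples only).
  gh workflow run record-integration-tests.yml \
    --ref my-feature-branch \
    -f test-subdirs='inference' \
    -f test-provider='ollama' \
    -f run-vision-tests=false \
    -f test-pattern=''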
.github/workflows/semantic-pr.yml (vendored, 2 changes)

@@ -22,6 +22,6 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check PR Title's semantic conformance
-        uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3
+        uses: amannn/action-semantic-pull-request@48f256284bd46cdaab1048c3721360e808335d50 # v6.1.1
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -27,7 +27,7 @@ jobs:
     # container and point 'uv pip install' to the correct path...
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
.github/workflows/test-external.yml (vendored, 7 changes)

@@ -9,6 +9,7 @@ on:
     branches: [ main ]
     paths:
       - 'llama_stack/**'
+      - '!llama_stack/ui/**'
       - 'tests/integration/**'
       - 'uv.lock'
       - 'pyproject.toml'
@@ -26,7 +27,7 @@ jobs:
     # container and point 'uv pip install' to the correct path...
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -43,11 +44,11 @@ jobs:

       - name: Print distro dependencies
         run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/build.yaml --print-deps-only
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml --print-deps-only

       - name: Build distro from config file
         run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/build.yaml
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml

       - name: Start Llama Stack server in background
         if: ${{ matrix.image-type }} == 'venv'
.github/workflows/ui-unit-tests.yml (vendored, new file, 55 lines)

@@ -0,0 +1,55 @@
+name: UI Tests
+
+run-name: Run the UI test suite
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'llama_stack/ui/**'
+      - '.github/workflows/ui-unit-tests.yml' # This workflow
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  ui-tests:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        node-version: [22]
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+
+      - name: Setup Node.js
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+        with:
+          node-version: ${{ matrix.node-version }}
+          cache: 'npm'
+          cache-dependency-path: 'llama_stack/ui/package-lock.json'
+
+      - name: Install dependencies
+        working-directory: llama_stack/ui
+        run: npm ci
+
+      - name: Run linting
+        working-directory: llama_stack/ui
+        run: npm run lint
+
+      - name: Run format check
+        working-directory: llama_stack/ui
+        run: npm run format:check
+
+      - name: Run unit tests
+        working-directory: llama_stack/ui
+        env:
+          CI: true
+
+        run: npm test -- --coverage --watchAll=false --passWithNoTests
.github/workflows/unit-tests.yml (vendored, 5 changes)

@@ -9,6 +9,7 @@ on:
     branches: [ main ]
     paths:
       - 'llama_stack/**'
+      - '!llama_stack/ui/**'
       - 'tests/unit/**'
       - 'uv.lock'
       - 'pyproject.toml'
@@ -17,7 +18,7 @@ on:
   workflow_dispatch:

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
@@ -31,7 +32,7 @@ jobs:
         - "3.13"
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
.github/workflows/update-readthedocs.yml (vendored, 4 changes)

@@ -27,7 +27,7 @@ on:
       - '.github/workflows/update-readthedocs.yml'

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
   cancel-in-progress: true

 jobs:
@@ -37,7 +37,7 @@ jobs:
       TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
     steps:
       - name: Checkout repository
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -2,6 +2,7 @@ exclude: 'build/'

 default_language_version:
   python: python3.12
+  node: "22"

 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
@@ -145,6 +146,32 @@ repos:
         pass_filenames: false
         require_serial: true
         files: ^.github/workflows/.*$
+      - id: ui-linter
+        name: Format & Lint UI
+        entry: bash ./scripts/run-ui-linter.sh
+        language: system
+        files: ^llama_stack/ui/.*\.(ts|tsx)$
+        pass_filenames: false
+        require_serial: true
+
+      - id: check-log-usage
+        name: Ensure 'llama_stack.log' usage for logging
+        entry: bash
+        language: system
+        types: [python]
+        pass_filenames: true
+        args:
+          - -c
+          - |
+            matches=$(grep -EnH '^[^#]*\b(import\s+logging|from\s+logging\b)' "$@" | grep -v -e '#\s*allow-direct-logging' || true)
+            if [ -n "$matches" ]; then
+              # GitHub Actions annotation format
+              while IFS=: read -r file line_num rest; do
+                echo "::error file=$file,line=$line_num::Do not use 'import logging' or 'from logging import' in $file. Use the custom log instead: from llama_stack.log import get_logger; logger = get_logger(). If direct logging is truly needed, add: # allow-direct-logging"
+              done <<< "$matches"
+              exit 1
+            fi
+            exit 0

 ci:
   autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
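As a rough illustration, the grep used by the new check-log-usage hook can be run by hand against a single file; the sketch below reuses the same pattern shown in the hook above, and the file path is a placeholder, not something from this diff.

  # Manual spot-check mirroring the hook above (path/to/module.py is a placeholder).
  matches=$(grep -EnH '^[^#]*\b(import\s+logging|from\s+logging\b)' path/to/module.py \
    | grep -v -e '#\s*allow-direct-logging' || true)
  [ -n "$matches" ] && echo "$matches"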
@@ -1,8 +1,5 @@
 # Llama Stack

-<a href="https://trendshift.io/repositories/11824" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11824" alt="meta-llama%2Fllama-stack | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
-
------
 [](https://pypi.org/project/llama_stack/)
 [](https://pypi.org/project/llama-stack/)
 [](https://github.com/meta-llama/llama-stack/blob/main/LICENSE)
docs/_static/llama-stack-spec.html (vendored, 281 changes)

@@ -4605,6 +4605,49 @@
           }
         }
       },
+    "/v1/inference/rerank": {
+      "post": {
+        "responses": {
+          "200": {
+            "description": "RerankResponse with indices sorted by relevance score (descending).",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/RerankResponse"
+                }
+              }
+            }
+          },
+          "400": {
+            "$ref": "#/components/responses/BadRequest400"
+          },
+          "429": {
+            "$ref": "#/components/responses/TooManyRequests429"
+          },
+          "500": {
+            "$ref": "#/components/responses/InternalServerError500"
+          },
+          "default": {
+            "$ref": "#/components/responses/DefaultError"
+          }
+        },
+        "tags": [
+          "Inference"
+        ],
+        "description": "Rerank a list of documents based on their relevance to a query.",
+        "parameters": [],
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/RerankRequest"
+              }
+            }
+          },
+          "required": true
+        }
+      }
+    },
     "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume": {
       "post": {
         "responses": {
@@ -8821,6 +8864,61 @@
       "title": "OpenAIResponseOutputMessageMCPListTools",
       "description": "MCP list tools output message containing available tools from an MCP server."
     },
+    "OpenAIResponseContentPart": {
+      "oneOf": [
+        {
+          "$ref": "#/components/schemas/OpenAIResponseContentPartOutputText"
+        },
+        {
+          "$ref": "#/components/schemas/OpenAIResponseContentPartRefusal"
+        }
+      ],
+      "discriminator": {
+        "propertyName": "type",
+        "mapping": {
+          "output_text": "#/components/schemas/OpenAIResponseContentPartOutputText",
+          "refusal": "#/components/schemas/OpenAIResponseContentPartRefusal"
+        }
+      }
+    },
+    "OpenAIResponseContentPartOutputText": {
+      "type": "object",
+      "properties": {
+        "type": {
+          "type": "string",
+          "const": "output_text",
+          "default": "output_text"
+        },
+        "text": {
+          "type": "string"
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "type",
+        "text"
+      ],
+      "title": "OpenAIResponseContentPartOutputText"
+    },
+    "OpenAIResponseContentPartRefusal": {
+      "type": "object",
+      "properties": {
+        "type": {
+          "type": "string",
+          "const": "refusal",
+          "default": "refusal"
+        },
+        "refusal": {
+          "type": "string"
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "type",
+        "refusal"
+      ],
+      "title": "OpenAIResponseContentPartRefusal"
+    },
     "OpenAIResponseObjectStream": {
       "oneOf": [
         {
@@ -8877,6 +8975,12 @@
         {
           "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted"
         },
+        {
+          "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded"
+        },
+        {
+          "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone"
+        },
         {
           "$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
         }
@@ -8902,6 +9006,8 @@
           "response.mcp_call.in_progress": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress",
           "response.mcp_call.failed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed",
           "response.mcp_call.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted",
+          "response.content_part.added": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded",
+          "response.content_part.done": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone",
           "response.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
         }
       }
@@ -8928,6 +9034,80 @@
       "title": "OpenAIResponseObjectStreamResponseCompleted",
       "description": "Streaming event indicating a response has been completed."
     },
+    "OpenAIResponseObjectStreamResponseContentPartAdded": {
+      "type": "object",
+      "properties": {
+        "response_id": {
+          "type": "string",
+          "description": "Unique identifier of the response containing this content"
+        },
+        "item_id": {
+          "type": "string",
+          "description": "Unique identifier of the output item containing this content part"
+        },
+        "part": {
+          "$ref": "#/components/schemas/OpenAIResponseContentPart",
+          "description": "The content part that was added"
+        },
+        "sequence_number": {
+          "type": "integer",
+          "description": "Sequential number for ordering streaming events"
+        },
+        "type": {
+          "type": "string",
+          "const": "response.content_part.added",
+          "default": "response.content_part.added",
+          "description": "Event type identifier, always \"response.content_part.added\""
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "response_id",
+        "item_id",
+        "part",
+        "sequence_number",
+        "type"
+      ],
+      "title": "OpenAIResponseObjectStreamResponseContentPartAdded",
+      "description": "Streaming event for when a new content part is added to a response item."
+    },
+    "OpenAIResponseObjectStreamResponseContentPartDone": {
+      "type": "object",
+      "properties": {
+        "response_id": {
+          "type": "string",
+          "description": "Unique identifier of the response containing this content"
+        },
+        "item_id": {
+          "type": "string",
+          "description": "Unique identifier of the output item containing this content part"
+        },
+        "part": {
+          "$ref": "#/components/schemas/OpenAIResponseContentPart",
+          "description": "The completed content part"
+        },
+        "sequence_number": {
+          "type": "integer",
+          "description": "Sequential number for ordering streaming events"
+        },
+        "type": {
+          "type": "string",
+          "const": "response.content_part.done",
+          "default": "response.content_part.done",
+          "description": "Event type identifier, always \"response.content_part.done\""
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "response_id",
+        "item_id",
+        "part",
+        "sequence_number",
+        "type"
+      ],
+      "title": "OpenAIResponseObjectStreamResponseContentPartDone",
+      "description": "Streaming event for when a content part is completed."
+    },
     "OpenAIResponseObjectStreamResponseCreated": {
       "type": "object",
       "properties": {
@@ -14630,7 +14810,8 @@
     "OpenAIFilePurpose": {
       "type": "string",
       "enum": [
-        "assistants"
+        "assistants",
+        "batch"
       ],
       "title": "OpenAIFilePurpose",
       "description": "Valid purpose values for OpenAI Files API."
@@ -14707,7 +14888,8 @@
         "purpose": {
           "type": "string",
           "enum": [
-            "assistants"
+            "assistants",
+            "batch"
           ],
           "description": "The intended purpose of the file"
         }
@@ -15926,12 +16108,16 @@
         "value": {
           "type": "number",
           "description": "The numeric value of the metric at this timestamp"
+        },
+        "unit": {
+          "type": "string"
         }
       },
       "additionalProperties": false,
       "required": [
         "timestamp",
-        "value"
+        "value",
+        "unit"
       ],
       "title": "MetricDataPoint",
       "description": "A single data point in a metric time series."
@@ -16489,6 +16675,95 @@
       ],
       "title": "RegisterVectorDbRequest"
     },
+    "RerankRequest": {
+      "type": "object",
+      "properties": {
+        "model": {
+          "type": "string",
+          "description": "The identifier of the reranking model to use."
+        },
+        "query": {
+          "oneOf": [
+            {
+              "type": "string"
+            },
+            {
+              "$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam"
+            },
+            {
+              "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
+            }
+          ],
+          "description": "The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length."
+        },
+        "items": {
+          "type": "array",
+          "items": {
+            "oneOf": [
+              {
+                "type": "string"
+              },
+              {
+                "$ref": "#/components/schemas/OpenAIChatCompletionContentPartTextParam"
+              },
+              {
+                "$ref": "#/components/schemas/OpenAIChatCompletionContentPartImageParam"
+              }
+            ]
+          },
+          "description": "List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length."
+        },
+        "max_num_results": {
+          "type": "integer",
+          "description": "(Optional) Maximum number of results to return. Default: returns all."
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "model",
+        "query",
+        "items"
+      ],
+      "title": "RerankRequest"
+    },
+    "RerankData": {
+      "type": "object",
+      "properties": {
+        "index": {
+          "type": "integer",
+          "description": "The original index of the document in the input list"
+        },
+        "relevance_score": {
+          "type": "number",
+          "description": "The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance."
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "index",
+        "relevance_score"
+      ],
+      "title": "RerankData",
+      "description": "A single rerank result from a reranking response."
+    },
+    "RerankResponse": {
+      "type": "object",
+      "properties": {
+        "data": {
+          "type": "array",
+          "items": {
+            "$ref": "#/components/schemas/RerankData"
+          },
+          "description": "List of rerank result objects, sorted by relevance score (descending)"
+        }
+      },
+      "additionalProperties": false,
+      "required": [
+        "data"
+      ],
+      "title": "RerankResponse",
+      "description": "Response from a reranking request."
+    },
     "ResumeAgentTurnRequest": {
       "type": "object",
       "properties": {
217
docs/_static/llama-stack-spec.yaml
vendored
217
docs/_static/llama-stack-spec.yaml
vendored
|
@ -3264,6 +3264,37 @@ paths:
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/QueryTracesRequest'
|
$ref: '#/components/schemas/QueryTracesRequest'
|
||||||
required: true
|
required: true
|
||||||
|
/v1/inference/rerank:
|
||||||
|
post:
|
||||||
|
responses:
|
||||||
|
'200':
|
||||||
|
description: >-
|
||||||
|
RerankResponse with indices sorted by relevance score (descending).
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/RerankResponse'
|
||||||
|
'400':
|
||||||
|
$ref: '#/components/responses/BadRequest400'
|
||||||
|
'429':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/TooManyRequests429
|
||||||
|
'500':
|
||||||
|
$ref: >-
|
||||||
|
#/components/responses/InternalServerError500
|
||||||
|
default:
|
||||||
|
$ref: '#/components/responses/DefaultError'
|
||||||
|
tags:
|
||||||
|
- Inference
|
||||||
|
description: >-
|
||||||
|
Rerank a list of documents based on their relevance to a query.
|
||||||
|
parameters: []
|
||||||
|
requestBody:
|
||||||
|
content:
|
||||||
|
application/json:
|
||||||
|
schema:
|
||||||
|
$ref: '#/components/schemas/RerankRequest'
|
||||||
|
required: true
|
||||||
/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume:
|
/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume:
|
||||||
post:
|
post:
|
||||||
responses:
|
responses:
|
||||||
|
@ -6441,6 +6472,43 @@ components:
|
||||||
title: OpenAIResponseOutputMessageMCPListTools
|
title: OpenAIResponseOutputMessageMCPListTools
|
||||||
description: >-
|
description: >-
|
||||||
MCP list tools output message containing available tools from an MCP server.
|
MCP list tools output message containing available tools from an MCP server.
|
||||||
|
OpenAIResponseContentPart:
|
||||||
|
oneOf:
|
||||||
|
- $ref: '#/components/schemas/OpenAIResponseContentPartOutputText'
|
||||||
|
- $ref: '#/components/schemas/OpenAIResponseContentPartRefusal'
|
||||||
|
discriminator:
|
||||||
|
propertyName: type
|
||||||
|
mapping:
|
||||||
|
output_text: '#/components/schemas/OpenAIResponseContentPartOutputText'
|
||||||
|
refusal: '#/components/schemas/OpenAIResponseContentPartRefusal'
|
||||||
|
OpenAIResponseContentPartOutputText:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
type:
|
||||||
|
type: string
|
||||||
|
const: output_text
|
||||||
|
default: output_text
|
||||||
|
text:
|
||||||
|
type: string
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- type
|
||||||
|
- text
|
||||||
|
title: OpenAIResponseContentPartOutputText
|
||||||
|
OpenAIResponseContentPartRefusal:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
type:
|
||||||
|
type: string
|
||||||
|
const: refusal
|
||||||
|
default: refusal
|
||||||
|
refusal:
|
||||||
|
type: string
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- type
|
||||||
|
- refusal
|
||||||
|
title: OpenAIResponseContentPartRefusal
|
||||||
OpenAIResponseObjectStream:
|
OpenAIResponseObjectStream:
|
||||||
oneOf:
|
oneOf:
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
|
||||||
|
@ -6461,6 +6529,8 @@ components:
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
|
||||||
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded'
|
||||||
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone'
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
|
||||||
discriminator:
|
discriminator:
|
||||||
propertyName: type
|
propertyName: type
|
||||||
|
@ -6483,6 +6553,8 @@ components:
|
||||||
response.mcp_call.in_progress: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
|
response.mcp_call.in_progress: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
|
||||||
response.mcp_call.failed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
|
response.mcp_call.failed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
|
||||||
response.mcp_call.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
|
response.mcp_call.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
|
||||||
|
response.content_part.added: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded'
|
||||||
|
response.content_part.done: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone'
|
||||||
response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
|
response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
|
||||||
"OpenAIResponseObjectStreamResponseCompleted":
|
"OpenAIResponseObjectStreamResponseCompleted":
|
||||||
type: object
|
type: object
|
||||||
|
@ -6504,6 +6576,76 @@ components:
|
||||||
OpenAIResponseObjectStreamResponseCompleted
|
OpenAIResponseObjectStreamResponseCompleted
|
||||||
description: >-
|
description: >-
|
||||||
Streaming event indicating a response has been completed.
|
Streaming event indicating a response has been completed.
|
||||||
|
"OpenAIResponseObjectStreamResponseContentPartAdded":
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
response_id:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
Unique identifier of the response containing this content
|
||||||
|
item_id:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
Unique identifier of the output item containing this content part
|
||||||
|
part:
|
||||||
|
$ref: '#/components/schemas/OpenAIResponseContentPart'
|
||||||
|
description: The content part that was added
|
||||||
|
sequence_number:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
Sequential number for ordering streaming events
|
||||||
|
type:
|
||||||
|
type: string
|
||||||
|
const: response.content_part.added
|
||||||
|
default: response.content_part.added
|
||||||
|
description: >-
|
||||||
|
Event type identifier, always "response.content_part.added"
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- response_id
|
||||||
|
- item_id
|
||||||
|
- part
|
||||||
|
- sequence_number
|
||||||
|
- type
|
||||||
|
title: >-
|
||||||
|
OpenAIResponseObjectStreamResponseContentPartAdded
|
||||||
|
description: >-
|
||||||
|
Streaming event for when a new content part is added to a response item.
|
||||||
|
"OpenAIResponseObjectStreamResponseContentPartDone":
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
response_id:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
Unique identifier of the response containing this content
|
||||||
|
item_id:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
Unique identifier of the output item containing this content part
|
||||||
|
part:
|
||||||
|
$ref: '#/components/schemas/OpenAIResponseContentPart'
|
||||||
|
description: The completed content part
|
||||||
|
sequence_number:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
Sequential number for ordering streaming events
|
||||||
|
type:
|
||||||
|
type: string
|
||||||
|
const: response.content_part.done
|
||||||
|
default: response.content_part.done
|
||||||
|
description: >-
|
||||||
|
Event type identifier, always "response.content_part.done"
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- response_id
|
||||||
|
- item_id
|
||||||
|
- part
|
||||||
|
- sequence_number
|
||||||
|
- type
|
||||||
|
title: >-
|
||||||
|
OpenAIResponseObjectStreamResponseContentPartDone
|
||||||
|
description: >-
|
||||||
|
Streaming event for when a content part is completed.
|
||||||
"OpenAIResponseObjectStreamResponseCreated":
|
"OpenAIResponseObjectStreamResponseCreated":
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -10840,6 +10982,7 @@ components:
|
||||||
type: string
|
type: string
|
||||||
enum:
|
enum:
|
||||||
- assistants
|
- assistants
|
||||||
|
- batch
|
||||||
title: OpenAIFilePurpose
|
title: OpenAIFilePurpose
|
||||||
description: >-
|
description: >-
|
||||||
Valid purpose values for OpenAI Files API.
|
Valid purpose values for OpenAI Files API.
|
||||||
|
@ -10908,6 +11051,7 @@ components:
|
||||||
type: string
|
type: string
|
||||||
enum:
|
enum:
|
||||||
- assistants
|
- assistants
|
||||||
|
- batch
|
||||||
description: The intended purpose of the file
|
description: The intended purpose of the file
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
required:
|
required:
|
||||||
|
@ -11838,10 +11982,13 @@ components:
|
||||||
type: number
|
type: number
|
||||||
description: >-
|
description: >-
|
||||||
The numeric value of the metric at this timestamp
|
The numeric value of the metric at this timestamp
|
||||||
|
unit:
|
||||||
|
type: string
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
required:
|
required:
|
||||||
- timestamp
|
- timestamp
|
||||||
- value
|
- value
|
||||||
|
- unit
|
||||||
title: MetricDataPoint
|
title: MetricDataPoint
|
||||||
description: >-
|
description: >-
|
||||||
A single data point in a metric time series.
|
A single data point in a metric time series.
|
||||||
|
@ -12252,6 +12399,76 @@ components:
|
||||||
- vector_db_id
|
- vector_db_id
|
||||||
- embedding_model
|
- embedding_model
|
||||||
title: RegisterVectorDbRequest
|
title: RegisterVectorDbRequest
|
||||||
|
RerankRequest:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
model:
|
||||||
|
type: string
|
||||||
|
description: >-
|
||||||
|
The identifier of the reranking model to use.
|
||||||
|
query:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
|
||||||
|
description: >-
|
||||||
|
The search query to rank items against. Can be a string, text content
|
||||||
|
part, or image content part. The input must not exceed the model's max
|
||||||
|
input token length.
|
||||||
|
items:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
oneOf:
|
||||||
|
- type: string
|
||||||
|
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartTextParam'
|
||||||
|
- $ref: '#/components/schemas/OpenAIChatCompletionContentPartImageParam'
|
||||||
|
description: >-
|
||||||
|
List of items to rerank. Each item can be a string, text content part,
|
||||||
|
or image content part. Each input must not exceed the model's max input
|
||||||
|
token length.
|
||||||
|
max_num_results:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
(Optional) Maximum number of results to return. Default: returns all.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- model
|
||||||
|
- query
|
||||||
|
- items
|
||||||
|
title: RerankRequest
|
||||||
|
RerankData:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
index:
|
||||||
|
type: integer
|
||||||
|
description: >-
|
||||||
|
The original index of the document in the input list
|
||||||
|
relevance_score:
|
||||||
|
type: number
|
||||||
|
description: >-
|
||||||
|
The relevance score from the model output. Values are inverted when applicable
|
||||||
|
so that higher scores indicate greater relevance.
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- index
|
||||||
|
- relevance_score
|
||||||
|
title: RerankData
|
||||||
|
description: >-
|
||||||
|
A single rerank result from a reranking response.
|
||||||
|
RerankResponse:
|
||||||
|
type: object
|
||||||
|
properties:
|
||||||
|
data:
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
$ref: '#/components/schemas/RerankData'
|
||||||
|
description: >-
|
||||||
|
List of rerank result objects, sorted by relevance score (descending)
|
||||||
|
additionalProperties: false
|
||||||
|
required:
|
||||||
|
- data
|
||||||
|
title: RerankResponse
|
||||||
|
description: Response from a reranking request.
|
||||||
ResumeAgentTurnRequest:
|
ResumeAgentTurnRequest:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
|
|
@ -18,3 +18,4 @@ We are working on adding a few more APIs to complete the application lifecycle.
- **Batch Inference**: run inference on a dataset of inputs
- **Batch Agents**: run agents on a dataset of inputs
- **Synthetic Data Generation**: generate synthetic data for model development
- **Batches**: OpenAI-compatible batch management for inference

@ -4,11 +4,11 @@

## Adding a New Provider

See the [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.
See:
- [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.
- [Vector Database Page](new_vector_database.md) which describes how to add a new vector databases with Llama Stack.
- [External Provider Page](../providers/external/index.md) which describes how to add external providers to the Stack.

See the [Vector Database Page](new_vector_database.md) which describes how to add a new vector databases with Llama Stack.

See the [External Provider Page](../providers/external/index.md) which describes how to add external providers to the Stack.

```{toctree}
:maxdepth: 1
:hidden:

@ -19,11 +19,21 @@ new_vector_database

## Testing

See the [Test Page](testing.md) which describes how to test your changes.
```{include} ../../../tests/README.md
```

## Advanced Topics

For developers who need deeper understanding of the testing system internals:

```{toctree}
:maxdepth: 1
:hidden:
:caption: Testing

testing
testing/record-replay
```

### Benchmarking

```{include} ../../../docs/source/distributions/k8s-benchmark/README.md
```

@ -1,8 +0,0 @@
```{include} ../../../tests/README.md
```

```{include} ../../../tests/unit/README.md
```

```{include} ../../../tests/integration/README.md
```

234 docs/source/contributing/testing/record-replay.md Normal file

@ -0,0 +1,234 @@
# Record-Replay System

Understanding how Llama Stack captures and replays API interactions for testing.

## Overview

The record-replay system solves a fundamental challenge in AI testing: how do you test against expensive, non-deterministic APIs without breaking the bank or dealing with flaky tests?

The solution: intercept API calls, store real responses, and replay them later. This gives you real API behavior without the cost or variability.

## How It Works

### Request Hashing

Every API request gets converted to a deterministic hash for lookup:

```python
def normalize_request(method: str, url: str, headers: dict, body: dict) -> str:
    normalized = {
        "method": method.upper(),
        "endpoint": urlparse(url).path,  # Just the path, not full URL
        "body": body,  # Request parameters
    }
    return hashlib.sha256(json.dumps(normalized, sort_keys=True).encode()).hexdigest()
```

**Key insight:** The hashing is intentionally precise. Different whitespace, float precision, or parameter order produces different hashes. This prevents subtle bugs from false cache hits.

```python
# These produce DIFFERENT hashes:
{"content": "Hello world"}
{"content": "Hello world\n"}
{"temperature": 0.7}
{"temperature": 0.7000001}
```

### Client Interception

The system patches OpenAI and Ollama client methods to intercept calls before they leave your application. This happens transparently - your test code doesn't change.
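Conceptually, the interception is ordinary method wrapping. The sketch below is illustrative only; the helper name and handler signature are assumptions, not the actual implementation inside Llama Stack:

```python
# Illustrative sketch of method interception; not the real patching code.
import functools

def patch_method(client_cls, method_name, handler):
    """Replace `client_cls.method_name` with a wrapper that routes through `handler`."""
    original = getattr(client_cls, method_name)

    @functools.wraps(original)
    async def wrapper(self, *args, **kwargs):
        # `handler` decides whether to record, replay, or pass the call through.
        return await handler(original, self, *args, **kwargs)

    setattr(client_cls, method_name, wrapper)
```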
### Storage Architecture

Recordings use a two-tier storage system optimized for both speed and debuggability:

```
recordings/
├── index.sqlite              # Fast lookup by request hash
└── responses/
    ├── abc123def456.json     # Individual response files
    └── def789ghi012.json
```

**SQLite index** enables O(log n) hash lookups and metadata queries without loading response bodies.

**JSON files** store complete request/response pairs in human-readable format for debugging.
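To make the two tiers concrete, a replay lookup boils down to one indexed query plus one file read. This is a minimal sketch; the table and column names are assumptions rather than the actual schema:

```python
# Minimal replay-lookup sketch; schema details below are assumed for illustration.
import json
import sqlite3
from pathlib import Path

def load_recording(storage_dir: str, request_hash: str) -> dict | None:
    index = sqlite3.connect(Path(storage_dir) / "index.sqlite")
    try:
        row = index.execute(
            "SELECT response_file FROM recordings WHERE request_hash = ?",
            (request_hash,),
        ).fetchone()
    finally:
        index.close()
    if row is None:
        return None  # REPLAY mode treats this as a missing recording
    return json.loads((Path(storage_dir) / "responses" / row[0]).read_text())
```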
## Recording Modes

### LIVE Mode

Direct API calls with no recording or replay:

```python
with inference_recording(mode=InferenceMode.LIVE):
    response = await client.chat.completions.create(...)
```

Use for initial development and debugging against real APIs.

### RECORD Mode

Captures API interactions while passing through real responses:

```python
with inference_recording(mode=InferenceMode.RECORD, storage_dir="./recordings"):
    response = await client.chat.completions.create(...)
    # Real API call made, response captured AND returned
```

The recording process:
1. Request intercepted and hashed
2. Real API call executed
3. Response captured and serialized
4. Recording stored to disk
5. Original response returned to caller

### REPLAY Mode

Returns stored responses instead of making API calls:

```python
with inference_recording(mode=InferenceMode.REPLAY, storage_dir="./recordings"):
    response = await client.chat.completions.create(...)
    # No API call made, cached response returned instantly
```

The replay process:
1. Request intercepted and hashed
2. Hash looked up in SQLite index
3. Response loaded from JSON file
4. Response deserialized and returned
5. Error if no recording found

## Streaming Support

Streaming APIs present a unique challenge: how do you capture an async generator?

### The Problem

```python
# How do you record this?
async for chunk in client.chat.completions.create(stream=True):
    process(chunk)
```

### The Solution

The system captures all chunks immediately before yielding any:

```python
async def handle_streaming_record(response):
    # Capture complete stream first
    chunks = []
    async for chunk in response:
        chunks.append(chunk)

    # Store complete recording
    storage.store_recording(
        request_hash, request_data, {"body": chunks, "is_streaming": True}
    )

    # Return generator that replays captured chunks
    async def replay_stream():
        for chunk in chunks:
            yield chunk

    return replay_stream()
```

This ensures:
- **Complete capture** - The entire stream is saved atomically
- **Interface preservation** - The returned object behaves like the original API
- **Deterministic replay** - Same chunks in the same order every time

## Serialization

API responses contain complex Pydantic objects that need careful serialization:

```python
def _serialize_response(response):
    if hasattr(response, "model_dump"):
        # Preserve type information for proper deserialization
        return {
            "__type__": f"{response.__class__.__module__}.{response.__class__.__qualname__}",
            "__data__": response.model_dump(mode="json"),
        }
    return response
```

This preserves type safety - when replayed, you get the same Pydantic objects with all their validation and methods.
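The replay side reverses the envelope. A minimal sketch, assuming the `__type__`/`__data__` format shown above and a top-level (non-nested) Pydantic class:

```python
# Sketch of the inverse of _serialize_response; assumes the envelope above and
# that the recorded class is importable at the same module path.
import importlib

def _deserialize_response(data):
    if isinstance(data, dict) and "__type__" in data and "__data__" in data:
        module_path, _, class_name = data["__type__"].rpartition(".")
        cls = getattr(importlib.import_module(module_path), class_name)
        return cls.model_validate(data["__data__"])  # Pydantic v2
    return data
```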
## Environment Integration

### Environment Variables

Control recording behavior globally:

```bash
export LLAMA_STACK_TEST_INFERENCE_MODE=replay
export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings
pytest tests/integration/
```

### Pytest Integration

The system integrates automatically based on environment variables, requiring no changes to test code.

## Debugging Recordings

### Inspecting Storage

```bash
# See what's recorded
sqlite3 recordings/index.sqlite "SELECT endpoint, model, timestamp FROM recordings LIMIT 10;"

# View specific response
cat recordings/responses/abc123def456.json | jq '.response.body'

# Find recordings by endpoint
sqlite3 recordings/index.sqlite "SELECT * FROM recordings WHERE endpoint='/v1/chat/completions';"
```

### Common Issues

**Hash mismatches:** Request parameters changed slightly between record and replay
```bash
# Compare request details
cat recordings/responses/abc123.json | jq '.request'
```

**Serialization errors:** Response types changed between versions
```bash
# Re-record with updated types
rm recordings/responses/failing_hash.json
LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_failing.py
```

**Missing recordings:** New test or changed parameters
```bash
# Record the missing interaction
LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_new.py
```

## Design Decisions

### Why Not Mocks?

Traditional mocking breaks down with AI APIs because:
- Response structures are complex and evolve frequently
- Streaming behavior is hard to mock correctly
- Edge cases in real APIs get missed
- Mocks become brittle maintenance burdens

### Why Precise Hashing?

Loose hashing (normalizing whitespace, rounding floats) seems convenient but hides bugs. If a test changes slightly, you want to know about it rather than accidentally getting the wrong cached response.
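To see the precision in action, you can re-run the `normalize_request` helper from the hashing section on two nearly identical payloads and compare the digests (the function is repeated here only so the snippet runs on its own):

```python
# Self-contained demo that tiny input differences yield different request hashes.
import hashlib
import json
from urllib.parse import urlparse

def normalize_request(method: str, url: str, headers: dict, body: dict) -> str:
    normalized = {"method": method.upper(), "endpoint": urlparse(url).path, "body": body}
    return hashlib.sha256(json.dumps(normalized, sort_keys=True).encode()).hexdigest()

a = normalize_request("POST", "http://localhost/v1/chat/completions", {}, {"temperature": 0.7})
b = normalize_request("POST", "http://localhost/v1/chat/completions", {}, {"temperature": 0.7000001})
print(a == b)  # False - the hashes differ
```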
### Why JSON + SQLite?

- **JSON** - Human readable, diff-friendly, easy to inspect and modify
- **SQLite** - Fast indexed lookups without loading response bodies
- **Hybrid** - Best of both worlds for different use cases

This system provides reliable, fast testing against real AI APIs while maintaining the ability to debug issues when they arise.
@ -225,8 +225,32 @@ server:
  port: 8321                            # Port to listen on (default: 8321)
  tls_certfile: "/path/to/cert.pem"     # Optional: Path to TLS certificate for HTTPS
  tls_keyfile: "/path/to/key.pem"       # Optional: Path to TLS key for HTTPS
  cors: true                            # Optional: Enable CORS (dev mode) or full config object
```

### CORS Configuration

CORS (Cross-Origin Resource Sharing) can be configured in two ways:

**Local development** (allows localhost origins only):
```yaml
server:
  cors: true
```

**Explicit configuration** (custom origins and settings):
```yaml
server:
  cors:
    allow_origins: ["https://myapp.com", "https://app.example.com"]
    allow_methods: ["GET", "POST", "PUT", "DELETE"]
    allow_headers: ["Content-Type", "Authorization"]
    allow_credentials: true
    max_age: 3600
```

When `cors: true`, the server enables secure localhost-only access for local development. For production, specify exact origins to maintain security.

### Authentication Configuration

> **Breaking Change (v0.2.14)**: The authentication configuration structure has changed. The previous format with `provider_type` and `config` fields has been replaced with a unified `provider_config` field that includes the `type` field. Update your configuration files accordingly.

@ -618,6 +642,54 @@ Content-Type: application/json
}
```

### CORS Configuration

Configure CORS to allow web browsers to make requests from different domains. Disabled by default.

#### Quick Setup

For development, use the simple boolean flag:

```yaml
server:
  cors: true  # Auto-enables localhost with any port
```

This automatically allows `http://localhost:*` and `https://localhost:*` with secure defaults.

#### Custom Configuration

For specific origins and full control:

```yaml
server:
  cors:
    allow_origins: ["https://myapp.com", "https://staging.myapp.com"]
    allow_credentials: true
    allow_methods: ["GET", "POST", "PUT", "DELETE"]
    allow_headers: ["Content-Type", "Authorization"]
    allow_origin_regex: "https://.*\\.example\\.com"  # Optional regex pattern
    expose_headers: ["X-Total-Count"]
    max_age: 86400
```

#### Configuration Options

| Field | Description | Default |
| -------------------- | ---------------------------------------------- | ------- |
| `allow_origins` | List of allowed origins. Use `["*"]` for any. | `["*"]` |
| `allow_origin_regex` | Regex pattern for allowed origins (optional). | `None` |
| `allow_methods` | Allowed HTTP methods. | `["*"]` |
| `allow_headers` | Allowed headers. | `["*"]` |
| `allow_credentials` | Allow credentials (cookies, auth headers). | `false` |
| `expose_headers` | Headers exposed to browser. | `[]` |
| `max_age` | Preflight cache time (seconds). | `600` |

**Security Notes**:
- `allow_credentials: true` requires explicit origins (no wildcards)
- `cors: true` enables localhost access only (secure for development)
- For public APIs, always specify exact allowed origins
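A quick way to sanity-check the settings above is to send a preflight request and inspect the response headers. The sketch below uses only the Python standard library; the server URL, port, origin, and endpoint path are placeholders for your own deployment, not part of the documented configuration:

```python
# Hypothetical CORS preflight check against a locally running server; adjust
# the URL, origin, and path to match your deployment.
import urllib.request

req = urllib.request.Request(
    "http://localhost:8321/v1/models",
    method="OPTIONS",
    headers={
        "Origin": "https://myapp.com",
        "Access-Control-Request-Method": "GET",
    },
)
with urllib.request.urlopen(req) as resp:
    print(resp.status)
    print(resp.headers.get("Access-Control-Allow-Origin"))
    print(resp.headers.get("Access-Control-Allow-Methods"))
```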
## Extending to handle Safety

Configuring Safety can be a little involved so it is instructive to go through an example.

@ -17,7 +17,6 @@ client = LlamaStackAsLibraryClient(
    # provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
    provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
)
client.initialize()
```

This will parse your config and set up any inline implementations and remote clients needed for your implementation.

@ -32,5 +31,4 @@ If you've created a [custom distribution](https://llama-stack.readthedocs.io/en/
```python
client = LlamaStackAsLibraryClient(config_path)
client.initialize()
```

156 docs/source/distributions/k8s-benchmark/README.md Normal file

@ -0,0 +1,156 @@
# Llama Stack Benchmark Suite on Kubernetes

## Motivation

Performance benchmarking is critical for understanding the overhead and characteristics of the Llama Stack abstraction layer compared to direct inference engines like vLLM.

### Why This Benchmark Suite Exists

**Performance Validation**: The Llama Stack provides a unified API layer across multiple inference providers, but this abstraction introduces potential overhead. This benchmark suite quantifies the performance impact by comparing:
- Llama Stack inference (with vLLM backend)
- Direct vLLM inference calls
- Both under identical Kubernetes deployment conditions

**Production Readiness Assessment**: Real-world deployments require understanding performance characteristics under load. This suite simulates concurrent user scenarios with configurable parameters (duration, concurrency, request patterns) to validate production readiness.

**Regression Detection (TODO)**: As the Llama Stack evolves, this benchmark provides automated regression detection for performance changes. CI/CD pipelines can leverage these benchmarks to catch performance degradations before production deployments.

**Resource Planning**: By measuring throughput, latency percentiles, and resource utilization patterns, teams can make informed decisions about:
- Kubernetes resource allocation (CPU, memory, GPU)
- Auto-scaling configurations
- Cost optimization strategies

### Key Metrics Captured

The benchmark suite measures critical performance indicators:
- **Throughput**: Requests per second under sustained load
- **Latency Distribution**: P50, P95, P99 response times
- **Time to First Token (TTFT)**: Critical for streaming applications
- **Error Rates**: Request failures and timeout analysis

This data enables data-driven architectural decisions and performance optimization efforts.
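The latency percentiles listed above are plain order statistics over the per-request timings collected during a run. The following standalone sketch shows one way to derive them; the values are made-up sample data, not benchmark output:

```python
# Toy illustration of P50/P95/P99 computation; timings below are fabricated.
import statistics

response_times = [0.42, 0.51, 0.48, 0.95, 0.47, 0.60, 1.20, 0.44, 0.58, 0.49]

cuts = statistics.quantiles(response_times, n=100)  # 99 cut points
p50, p95, p99 = cuts[49], cuts[94], cuts[98]
print(f"P50={p50:.3f}s  P95={p95:.3f}s  P99={p99:.3f}s")
print(f"mean={statistics.mean(response_times):.3f}s  max={max(response_times):.3f}s")
```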
## Setup

**1. Deploy base k8s infrastructure:**
```bash
cd ../k8s
./apply.sh
```

**2. Deploy benchmark components:**
```bash
cd ../k8s-benchmark
./apply.sh
```

**3. Verify deployment:**
```bash
kubectl get pods
# Should see: llama-stack-benchmark-server, vllm-server, etc.
```

## Quick Start

### Basic Benchmarks

**Benchmark Llama Stack (default):**
```bash
cd docs/source/distributions/k8s-benchmark/
./run-benchmark.sh
```

**Benchmark vLLM direct:**
```bash
./run-benchmark.sh --target vllm
```

### Custom Configuration

**Extended benchmark with high concurrency:**
```bash
./run-benchmark.sh --target vllm --duration 120 --concurrent 20
```

**Short test run:**
```bash
./run-benchmark.sh --target stack --duration 30 --concurrent 5
```

## Command Reference

### run-benchmark.sh Options

```bash
./run-benchmark.sh [options]

Options:
  -t, --target <stack|vllm>     Target to benchmark (default: stack)
  -d, --duration <seconds>      Duration in seconds (default: 60)
  -c, --concurrent <users>      Number of concurrent users (default: 10)
  -h, --help                    Show help message

Examples:
  ./run-benchmark.sh --target vllm              # Benchmark vLLM direct
  ./run-benchmark.sh --target stack             # Benchmark Llama Stack
  ./run-benchmark.sh -t vllm -d 120 -c 20       # vLLM with 120s, 20 users
```

## Local Testing

### Running Benchmark Locally

For local development without Kubernetes:

**1. Start OpenAI mock server:**
```bash
uv run python openai-mock-server.py --port 8080
```

**2. Run benchmark against mock server:**
```bash
uv run python benchmark.py \
  --base-url http://localhost:8080/v1 \
  --model mock-inference \
  --duration 30 \
  --concurrent 5
```

**3. Test against local vLLM server:**
```bash
# If you have vLLM running locally on port 8000
uv run python benchmark.py \
  --base-url http://localhost:8000/v1 \
  --model meta-llama/Llama-3.2-3B-Instruct \
  --duration 30 \
  --concurrent 5
```

**4. Profile the running server:**
```bash
./profile_running_server.sh
```

### OpenAI Mock Server

The `openai-mock-server.py` provides:
- **OpenAI-compatible API** for testing without real models
- **Configurable streaming delay** via `STREAM_DELAY_SECONDS` env var
- **Consistent responses** for reproducible benchmarks
- **Lightweight testing** without GPU requirements

**Mock server usage:**
```bash
uv run python openai-mock-server.py --port 8080
```

The mock server is also deployed in k8s as `openai-mock-service:8080` and can be used by changing the Llama Stack configuration to use the `mock-vllm-inference` provider.

## Files in this Directory

- `benchmark.py` - Core benchmark script with async streaming support
- `run-benchmark.sh` - Main script with target selection and configuration
- `openai-mock-server.py` - Mock OpenAI API server for local testing
- `README.md` - This documentation file

@ -8,7 +8,6 @@

# Deploys the benchmark-specific components on top of the base k8s deployment (../k8s/apply.sh).

export MOCK_INFERENCE_PORT=8080
export STREAM_DELAY_SECONDS=0.005

export POSTGRES_USER=llamastack

@ -20,14 +19,7 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

export MOCK_INFERENCE_MODEL=mock-inference

# Use llama-stack-benchmark-service as the benchmark server
export MOCK_INFERENCE_URL=openai-mock-service:8080
export LOCUST_HOST=http://llama-stack-benchmark-service:8323
export LOCUST_BASE_PATH=/v1/openai/v1

# Use vllm-service as the benchmark server
# export LOCUST_HOST=http://vllm-server:8000
# export LOCUST_BASE_PATH=/v1

export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL

@ -35,13 +27,6 @@ set -euo pipefail
set -x

# Deploy benchmark-specific components
# Deploy OpenAI mock server
kubectl create configmap openai-mock --from-file=openai-mock-server.py \
  --dry-run=client -o yaml | kubectl apply --validate=false -f -

envsubst < openai-mock-deployment.yaml | kubectl apply --validate=false -f -

# Create configmap with our custom stack config
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
  --dry-run=client -o yaml > stack-configmap.yaml

@ -49,9 +34,3 @@ kubectl apply --validate=false -f stack-configmap.yaml

# Deploy our custom llama stack server (overriding the base one)
envsubst < stack-k8s.yaml.template | kubectl apply --validate=false -f -

# Deploy Locust load testing
kubectl create configmap locust-script --from-file=locustfile.py \
  --dry-run=client -o yaml | kubectl apply --validate=false -f -

envsubst < locust-k8s.yaml | kubectl apply --validate=false -f -
267 docs/source/distributions/k8s-benchmark/benchmark.py Normal file

@ -0,0 +1,267 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
Simple benchmark script for Llama Stack with OpenAI API compatibility.
"""

import argparse
import asyncio
import os
import random
import statistics
import time
from typing import Tuple
import aiohttp


class BenchmarkStats:
    def __init__(self):
        self.response_times = []
        self.ttft_times = []
        self.chunks_received = []
        self.errors = []
        self.success_count = 0
        self.total_requests = 0
        self.concurrent_users = 0
        self.start_time = None
        self.end_time = None
        self._lock = asyncio.Lock()

    async def add_result(self, response_time: float, chunks: int, ttft: float = None, error: str = None):
        async with self._lock:
            self.total_requests += 1
            if error:
                self.errors.append(error)
            else:
                self.success_count += 1
                self.response_times.append(response_time)
                self.chunks_received.append(chunks)
                if ttft is not None:
                    self.ttft_times.append(ttft)

    def print_summary(self):
        if not self.response_times:
            print("No successful requests to report")
            if self.errors:
                print(f"Total errors: {len(self.errors)}")
                print("First 5 errors:")
                for error in self.errors[:5]:
                    print(f"  {error}")
            return

        total_time = self.end_time - self.start_time
        success_rate = (self.success_count / self.total_requests) * 100

        print(f"\n{'='*60}")
        print(f"BENCHMARK RESULTS")
        print(f"{'='*60}")
        print(f"Total time: {total_time:.2f}s")
        print(f"Concurrent users: {self.concurrent_users}")
        print(f"Total requests: {self.total_requests}")
        print(f"Successful requests: {self.success_count}")
        print(f"Failed requests: {len(self.errors)}")
        print(f"Success rate: {success_rate:.1f}%")
        print(f"Requests per second: {self.success_count / total_time:.2f}")

        print(f"\nResponse Time Statistics:")
        print(f"  Mean: {statistics.mean(self.response_times):.3f}s")
        print(f"  Median: {statistics.median(self.response_times):.3f}s")
        print(f"  Min: {min(self.response_times):.3f}s")
        print(f"  Max: {max(self.response_times):.3f}s")

        if len(self.response_times) > 1:
            print(f"  Std Dev: {statistics.stdev(self.response_times):.3f}s")

        percentiles = [50, 90, 95, 99]
        sorted_times = sorted(self.response_times)
        print(f"\nPercentiles:")
        for p in percentiles:
            idx = int(len(sorted_times) * p / 100) - 1
            idx = max(0, min(idx, len(sorted_times) - 1))
            print(f"  P{p}: {sorted_times[idx]:.3f}s")

        if self.ttft_times:
            print(f"\nTime to First Token (TTFT) Statistics:")
            print(f"  Mean: {statistics.mean(self.ttft_times):.3f}s")
            print(f"  Median: {statistics.median(self.ttft_times):.3f}s")
            print(f"  Min: {min(self.ttft_times):.3f}s")
            print(f"  Max: {max(self.ttft_times):.3f}s")

            if len(self.ttft_times) > 1:
                print(f"  Std Dev: {statistics.stdev(self.ttft_times):.3f}s")

            sorted_ttft = sorted(self.ttft_times)
            print(f"\nTTFT Percentiles:")
            for p in percentiles:
                idx = int(len(sorted_ttft) * p / 100) - 1
                idx = max(0, min(idx, len(sorted_ttft) - 1))
                print(f"  P{p}: {sorted_ttft[idx]:.3f}s")

        if self.chunks_received:
            print(f"\nStreaming Statistics:")
            print(f"  Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
            print(f"  Total chunks received: {sum(self.chunks_received)}")

        if self.errors:
            print(f"\nErrors (showing first 5):")
            for error in self.errors[:5]:
                print(f"  {error}")


class LlamaStackBenchmark:
    def __init__(self, base_url: str, model_id: str):
        self.base_url = base_url.rstrip('/')
        self.model_id = model_id
        self.headers = {"Content-Type": "application/json"}
        self.test_messages = [
            [{"role": "user", "content": "Hi"}],
            [{"role": "user", "content": "What is the capital of France?"}],
            [{"role": "user", "content": "Explain quantum physics in simple terms."}],
            [{"role": "user", "content": "Write a short story about a robot learning to paint."}],
            [
                {"role": "user", "content": "What is machine learning?"},
                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
                {"role": "user", "content": "Can you give me a practical example?"}
            ]
        ]

    async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
        """Make a single async streaming chat completion request."""
        messages = random.choice(self.test_messages)
        payload = {
            "model": self.model_id,
            "messages": messages,
            "stream": True,
            "max_tokens": 100
        }

        start_time = time.time()
        chunks_received = 0
        ttft = None
        error = None

        session = aiohttp.ClientSession()

        try:
            async with session.post(
                f"{self.base_url}/chat/completions",
                headers=self.headers,
                json=payload,
                timeout=aiohttp.ClientTimeout(total=30)
            ) as response:
                if response.status == 200:
                    async for line in response.content:
                        if line:
                            line_str = line.decode('utf-8').strip()
                            if line_str.startswith('data: '):
                                chunks_received += 1
                                if ttft is None:
                                    ttft = time.time() - start_time
                                if line_str == 'data: [DONE]':
                                    break

                    if chunks_received == 0:
                        error = "No streaming chunks received"
                else:
                    text = await response.text()
                    error = f"HTTP {response.status}: {text[:100]}"

        except Exception as e:
            error = f"Request error: {str(e)}"
        finally:
            await session.close()

        response_time = time.time() - start_time
        return response_time, chunks_received, ttft, error


    async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
        """Run benchmark using async requests for specified duration."""
        stats = BenchmarkStats()
        stats.concurrent_users = concurrent_users
        stats.start_time = time.time()

        print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
        print(f"Target URL: {self.base_url}/chat/completions")
        print(f"Model: {self.model_id}")

        connector = aiohttp.TCPConnector(limit=concurrent_users)
        async with aiohttp.ClientSession(connector=connector) as session:

            async def worker(worker_id: int):
                """Worker that sends requests sequentially until canceled."""
                request_count = 0
                while True:
                    try:
                        response_time, chunks, ttft, error = await self.make_async_streaming_request()
                        await stats.add_result(response_time, chunks, ttft, error)
                        request_count += 1

                    except asyncio.CancelledError:
                        break
                    except Exception as e:
                        await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")

            # Progress reporting task
            async def progress_reporter():
                last_report_time = time.time()
                while True:
                    try:
                        await asyncio.sleep(1)  # Report every second
                        if time.time() >= last_report_time + 10:  # Report every 10 seconds
                            elapsed = time.time() - stats.start_time
                            print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
                            last_report_time = time.time()
                    except asyncio.CancelledError:
                        break

            # Spawn concurrent workers
            tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
            progress_task = asyncio.create_task(progress_reporter())
            tasks.append(progress_task)

            # Wait for duration then cancel all tasks
            await asyncio.sleep(duration)

            for task in tasks:
                task.cancel()

            # Wait for all tasks to complete
            await asyncio.gather(*tasks, return_exceptions=True)

        stats.end_time = time.time()
        return stats


def main():
    parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
    parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
                        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
    parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
                        help="Model ID to use for requests")
    parser.add_argument("--duration", type=int, default=60,
                        help="Duration in seconds to run benchmark (default: 60)")
    parser.add_argument("--concurrent", type=int, default=10,
                        help="Number of concurrent users (default: 10)")

    args = parser.parse_args()

    benchmark = LlamaStackBenchmark(args.base_url, args.model)

    try:
        stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent))
        stats.print_summary()

    except KeyboardInterrupt:
        print("\nBenchmark interrupted by user")
    except Exception as e:
        print(f"Benchmark failed: {e}")


if __name__ == "__main__":
    main()
@ -1,131 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: locust-master
  labels:
    app: locust
    role: master
spec:
  replicas: 1
  selector:
    matchLabels:
      app: locust
      role: master
  template:
    metadata:
      labels:
        app: locust
        role: master
    spec:
      containers:
      - name: locust-master
        image: locustio/locust:2.31.8
        ports:
        - containerPort: 8089  # Web UI
        - containerPort: 5557  # Master communication
        env:
        - name: LOCUST_HOST
          value: "${LOCUST_HOST}"
        - name: LOCUST_LOCUSTFILE
          value: "/locust/locustfile.py"
        - name: LOCUST_WEB_HOST
          value: "0.0.0.0"
        - name: LOCUST_MASTER
          value: "true"
        - name: LOCUST_BASE_PATH
          value: "${LOCUST_BASE_PATH}"
        - name: INFERENCE_MODEL
          value: "${BENCHMARK_INFERENCE_MODEL}"
        volumeMounts:
        - name: locust-script
          mountPath: /locust
        command: ["locust"]
        args:
        - "--master"
        - "--web-host=0.0.0.0"
        - "--web-port=8089"
        - "--host=${LOCUST_HOST}"
        - "--locustfile=/locust/locustfile.py"
      volumes:
      - name: locust-script
        configMap:
          name: locust-script
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: locust-worker
  labels:
    app: locust
    role: worker
spec:
  replicas: 2  # Start with 2 workers, can be scaled up
  selector:
    matchLabels:
      app: locust
      role: worker
  template:
    metadata:
      labels:
        app: locust
        role: worker
    spec:
      containers:
      - name: locust-worker
        image: locustio/locust:2.31.8
        env:
        - name: LOCUST_HOST
          value: "${LOCUST_HOST}"
        - name: LOCUST_LOCUSTFILE
          value: "/locust/locustfile.py"
        - name: LOCUST_MASTER_HOST
          value: "locust-master-service"
        - name: LOCUST_MASTER_PORT
          value: "5557"
        - name: INFERENCE_MODEL
          value: "${BENCHMARK_INFERENCE_MODEL}"
        - name: LOCUST_BASE_PATH
          value: "${LOCUST_BASE_PATH}"
        volumeMounts:
        - name: locust-script
          mountPath: /locust
        command: ["locust"]
        args:
        - "--worker"
        - "--master-host=locust-master-service"
        - "--master-port=5557"
        - "--locustfile=/locust/locustfile.py"
      volumes:
      - name: locust-script
        configMap:
          name: locust-script
---
apiVersion: v1
kind: Service
metadata:
  name: locust-master-service
spec:
  selector:
    app: locust
    role: master
  ports:
  - name: web-ui
    port: 8089
    targetPort: 8089
  - name: master-comm
    port: 5557
    targetPort: 5557
  type: ClusterIP
---
apiVersion: v1
kind: Service
metadata:
  name: locust-web-ui
spec:
  selector:
    app: locust
    role: master
  ports:
  - port: 8089
    targetPort: 8089
  type: ClusterIP  # Keep internal, use port-forward to access
@ -1,78 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
Locust load testing script for Llama Stack with Prism mock OpenAI provider.
"""

import random
from locust import HttpUser, task, between
import os

base_path = os.getenv("LOCUST_BASE_PATH", "/v1/openai/v1")

MODEL_ID = os.getenv("INFERENCE_MODEL")

class LlamaStackUser(HttpUser):
    wait_time = between(0.0, 0.0001)

    def on_start(self):
        """Setup authentication and test data."""
        # No auth required for benchmark server
        self.headers = {
            "Content-Type": "application/json"
        }

        # Test messages of varying lengths
        self.test_messages = [
            [{"role": "user", "content": "Hi"}],
            [{"role": "user", "content": "What is the capital of France?"}],
            [{"role": "user", "content": "Explain quantum physics in simple terms."}],
            [{"role": "user", "content": "Write a short story about a robot learning to paint."}],
            [
                {"role": "user", "content": "What is machine learning?"},
                {"role": "assistant", "content": "Machine learning is a subset of AI..."},
                {"role": "user", "content": "Can you give me a practical example?"}
            ]
        ]

    @task(weight=100)
    def chat_completion_streaming(self):
        """Test streaming chat completion (20% of requests)."""
        messages = random.choice(self.test_messages)
        payload = {
            "model": MODEL_ID,
            "messages": messages,
            "stream": True,
            "max_tokens": 100
        }

        with self.client.post(
            f"{base_path}/chat/completions",
            headers=self.headers,
            json=payload,
            stream=True,
            catch_response=True
        ) as response:
            if response.status_code == 200:
                chunks_received = 0
                try:
                    for line in response.iter_lines():
                        if line:
                            line_str = line.decode('utf-8')
                            if line_str.startswith('data: '):
                                chunks_received += 1
                                if line_str.strip() == 'data: [DONE]':
                                    break

                    if chunks_received > 0:
                        response.success()
                    else:
                        response.failure("No streaming chunks received")
                except Exception as e:
                    response.failure(f"Streaming error: {e}")
            else:
                response.failure(f"HTTP {response.status_code}: {response.text}")
@@ -1,52 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: openai-mock
  labels:
    app: openai-mock
spec:
  replicas: 1
  selector:
    matchLabels:
      app: openai-mock
  template:
    metadata:
      labels:
        app: openai-mock
    spec:
      containers:
      - name: openai-mock
        image: python:3.12-slim
        ports:
        - containerPort: ${MOCK_INFERENCE_PORT}
        env:
        - name: PORT
          value: "${MOCK_INFERENCE_PORT}"
        - name: MOCK_MODELS
          value: "${MOCK_INFERENCE_MODEL}"
        - name: STREAM_DELAY_SECONDS
          value: "${STREAM_DELAY_SECONDS}"
        command: ["sh", "-c"]
        args:
        - |
          pip install flask &&
          python /app/openai-mock-server.py --port ${MOCK_INFERENCE_PORT}
        volumeMounts:
        - name: openai-mock-script
          mountPath: /app
      volumes:
      - name: openai-mock-script
        configMap:
          name: openai-mock
---
apiVersion: v1
kind: Service
metadata:
  name: openai-mock-service
spec:
  selector:
    app: openai-mock
  ports:
    - port: 8080
      targetPort: 8080
  type: ClusterIP
6  docs/source/distributions/k8s-benchmark/openai-mock-server.py  (Normal file → Executable file)

@@ -23,7 +23,7 @@ app = Flask(__name__)

 # Models from environment variables
 def get_models():
-    models_str = os.getenv("MOCK_MODELS", "mock-inference")
+    models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
     model_ids = [m.strip() for m in models_str.split(",") if m.strip()]

     return {

@@ -49,13 +49,13 @@ def generate_random_text(length=50):
     ]
     return " ".join(random.choices(words, k=length))

-@app.route('/models', methods=['GET'])
+@app.route('/v1/models', methods=['GET'])
 def list_models():
     models = get_models()
     print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
     return jsonify(models)

-@app.route('/chat/completions', methods=['POST'])
+@app.route('/v1/chat/completions', methods=['POST'])
 def chat_completions():
     """Return OpenAI-formatted chat completion responses."""
     data = request.get_json()
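The mock now mirrors the OpenAI path layout (`/v1/models`, `/v1/chat/completions`). A quick sketch of exercising it, assuming it is reachable on localhost:8080 (the port the removed mock Deployment exposed); the non-streaming call is an assumption about the mock's behavior:

```python
import requests

BASE = "http://localhost:8080/v1"  # assumed local address for the mock server

models = requests.get(f"{BASE}/models").json()
print([m["id"] for m in models["data"]])

resp = requests.post(
    f"{BASE}/chat/completions",
    json={
        "model": models["data"][0]["id"],
        "messages": [{"role": "user", "content": "Hi"}],
        "stream": False,  # assumed; the mock may also stream SSE chunks
    },
)
print(resp.json()["choices"][0]["message"]["content"])
```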
52  docs/source/distributions/k8s-benchmark/profile_running_server.sh  (Executable file)

@@ -0,0 +1,52 @@
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Script to profile an already running Llama Stack server
# Usage: ./profile_running_server.sh [duration_seconds] [output_file]

DURATION=${1:-60}  # Default 60 seconds
OUTPUT_FILE=${2:-"llama_stack_profile"}  # Default output file

echo "Looking for running Llama Stack server..."

# Find the server PID
SERVER_PID=$(ps aux | grep "llama_stack.core.server.server" | grep -v grep | awk '{print $2}' | head -1)

if [ -z "$SERVER_PID" ]; then
    echo "Error: No running Llama Stack server found"
    echo "Please start your server first with:"
    echo "LLAMA_STACK_LOGGING=\"all=ERROR\" MOCK_INFERENCE_URL=http://localhost:8080 SAFETY_MODEL=llama-guard3:1b uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml"
    exit 1
fi

echo "Found Llama Stack server with PID: $SERVER_PID"

# Start py-spy profiling
echo "Starting py-spy profiling for ${DURATION} seconds..."
echo "Output will be saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "You can now run your load test..."
echo ""

# Get the full path to py-spy
PYSPY_PATH=$(which py-spy)

# Check if running as root, if not, use sudo
if [ "$EUID" -ne 0 ]; then
    echo "py-spy requires root permissions on macOS. Running with sudo..."
    sudo "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
else
    "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
fi

echo ""
echo "Profiling completed! Results saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "To view the flame graph:"
echo "open ${OUTPUT_FILE}.svg"
148  docs/source/distributions/k8s-benchmark/run-benchmark.sh  (Executable file)

@@ -0,0 +1,148 @@
#!/usr/bin/env bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

set -euo pipefail

# Default values
TARGET="stack"
DURATION=60
CONCURRENT=10

# Parse command line arguments
usage() {
    echo "Usage: $0 [options]"
    echo "Options:"
    echo "  -t, --target <stack|vllm>     Target to benchmark (default: stack)"
    echo "  -d, --duration <seconds>      Duration in seconds (default: 60)"
    echo "  -c, --concurrent <users>      Number of concurrent users (default: 10)"
    echo "  -h, --help                    Show this help message"
    echo ""
    echo "Examples:"
    echo "  $0 --target vllm              # Benchmark vLLM direct"
    echo "  $0 --target stack             # Benchmark Llama Stack (default)"
    echo "  $0 -t vllm -d 120 -c 20       # vLLM with 120s duration, 20 users"
}

while [[ $# -gt 0 ]]; do
    case $1 in
        -t|--target)
            TARGET="$2"
            shift 2
            ;;
        -d|--duration)
            DURATION="$2"
            shift 2
            ;;
        -c|--concurrent)
            CONCURRENT="$2"
            shift 2
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            usage
            exit 1
            ;;
    esac
done

# Validate target
if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
    echo "Error: Target must be 'stack' or 'vllm'"
    usage
    exit 1
fi

# Set configuration based on target
if [[ "$TARGET" == "vllm" ]]; then
    BASE_URL="http://vllm-server:8000/v1"
    JOB_NAME="vllm-benchmark-job"
    echo "Benchmarking vLLM direct..."
else
    BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1"
    JOB_NAME="stack-benchmark-job"
    echo "Benchmarking Llama Stack..."
fi

echo "Configuration:"
echo "  Target: $TARGET"
echo "  Base URL: $BASE_URL"
echo "  Duration: ${DURATION}s"
echo "  Concurrent users: $CONCURRENT"
echo ""

# Create temporary job yaml
TEMP_YAML="/tmp/benchmark-job-temp-$(date +%s).yaml"
cat > "$TEMP_YAML" << EOF
apiVersion: batch/v1
kind: Job
metadata:
  name: $JOB_NAME
  namespace: default
spec:
  template:
    spec:
      containers:
      - name: benchmark
        image: python:3.11-slim
        command: ["/bin/bash"]
        args:
        - "-c"
        - |
          pip install aiohttp &&
          python3 /benchmark/benchmark.py \\
            --base-url $BASE_URL \\
            --model \${INFERENCE_MODEL} \\
            --duration $DURATION \\
            --concurrent $CONCURRENT
        env:
        - name: INFERENCE_MODEL
          value: "meta-llama/Llama-3.2-3B-Instruct"
        volumeMounts:
        - name: benchmark-script
          mountPath: /benchmark
        resources:
          requests:
            memory: "256Mi"
            cpu: "250m"
          limits:
            memory: "512Mi"
            cpu: "500m"
      volumes:
      - name: benchmark-script
        configMap:
          name: benchmark-script
      restartPolicy: Never
  backoffLimit: 3
EOF

echo "Creating benchmark ConfigMap..."
kubectl create configmap benchmark-script \
  --from-file=benchmark.py=benchmark.py \
  --dry-run=client -o yaml | kubectl apply -f -

echo "Cleaning up any existing benchmark job..."
kubectl delete job $JOB_NAME 2>/dev/null || true

echo "Deploying benchmark Job..."
kubectl apply -f "$TEMP_YAML"

echo "Waiting for job to start..."
kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=60s

echo "Following benchmark logs..."
kubectl logs -f job/$JOB_NAME

echo "Job completed. Checking final status..."
kubectl get job $JOB_NAME

# Clean up temporary file
rm -f "$TEMP_YAML"
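The job created above installs aiohttp and drives the target with concurrent streaming chat completions. A rough sketch of that request pattern (benchmark.py itself is not shown in this diff, so the structure and names here are assumptions):

```python
import asyncio
import aiohttp

BASE_URL = "http://llama-stack-benchmark-service:8323/v1/openai/v1"  # stack target from run-benchmark.sh
MODEL = "meta-llama/Llama-3.2-3B-Instruct"

async def one_request(session: aiohttp.ClientSession) -> None:
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": "Hi"}],
        "stream": True,
        "max_tokens": 100,
    }
    # Read the SSE stream until the terminating sentinel.
    async with session.post(f"{BASE_URL}/chat/completions", json=payload) as resp:
        async for line in resp.content:
            if line.strip() == b"data: [DONE]":
                break

async def main(concurrent: int = 10) -> None:
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(one_request(session) for _ in range(concurrent)))

asyncio.run(main())
```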
@@ -26,13 +26,6 @@ data:
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-    - provider_id: mock-vllm-inference
-      provider_type: remote::vllm
-      config:
-        url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
-        max_tokens: 4096
-        api_token: fake
-        tls_verify: false
     - provider_id: sentence-transformers
       provider_type: inline::sentence-transformers
       config: {}

@@ -121,9 +114,6 @@ data:
     - model_id: ${env.SAFETY_MODEL}
       provider_id: vllm-safety
       model_type: llm
-    - model_id: ${env.MOCK_INFERENCE_MODEL}
-      provider_id: mock-vllm-inference
-      model_type: llm
     shields:
     - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
     vector_dbs: []
@@ -44,8 +44,6 @@ spec:
           value: "${SAFETY_MODEL}"
         - name: TAVILY_SEARCH_API_KEY
           value: "${TAVILY_SEARCH_API_KEY}"
-        - name: MOCK_INFERENCE_PORT
-          value: "${MOCK_INFERENCE_PORT}"
         - name: VLLM_URL
           value: http://vllm-server.default.svc.cluster.local:8000/v1
        - name: VLLM_MAX_TOKENS

@@ -54,8 +52,6 @@ spec:
           value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
         - name: VLLM_TLS_VERIFY
           value: "false"
-        - name: MOCK_INFERENCE_MODEL
-          value: "${MOCK_INFERENCE_MODEL}"
         command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
         ports:
         - containerPort: 8323
@@ -3,7 +3,6 @@ image_name: kubernetes-benchmark-demo
 apis:
 - agents
 - inference
-- safety
 - telemetry
 - tool_runtime
 - vector_io

@@ -16,20 +15,6 @@ providers:
       max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
       api_token: ${env.VLLM_API_TOKEN:=fake}
       tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-  - provider_id: vllm-safety
-    provider_type: remote::vllm
-    config:
-      url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
-      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-      api_token: ${env.VLLM_API_TOKEN:=fake}
-      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-  - provider_id: mock-vllm-inference
-    provider_type: remote::vllm
-    config:
-      url: http://openai-mock-service:${env.MOCK_INFERENCE_PORT}
-      max_tokens: 4096
-      api_token: fake
-      tls_verify: false
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}

@@ -45,11 +30,6 @@ providers:
         db: ${env.POSTGRES_DB:=llamastack}
         user: ${env.POSTGRES_USER:=llamastack}
         password: ${env.POSTGRES_PASSWORD:=llamastack}
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config:
-      excluded_categories: []
   agents:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -115,14 +95,6 @@ models:
 - model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm-inference
   model_type: llm
-- model_id: ${env.SAFETY_MODEL}
-  provider_id: vllm-safety
-  model_type: llm
-- model_id: ${env.MOCK_INFERENCE_MODEL}
-  provider_id: mock-vllm-inference
-  model_type: llm
-shields:
-- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
 vector_dbs: []
 datasets: []
 scoring_fns: []
@@ -2,6 +2,15 @@

 ## Overview

+Agents API for creating and interacting with agentic systems.
+
+Main functionalities provided by this API:
+- Create agents with specific instructions and ability to use tools.
+- Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
+- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
+- Agents can be provided with various shields (see the Safety API for more details).
+- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.
+
 This section contains documentation for all available providers for the **agents** API.

 ## Providers
24  docs/source/providers/batches/index.md  (Normal file)

@@ -0,0 +1,24 @@
# Batches

## Overview

The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.

The API is designed to allow use of openai client libraries for seamless integration.

This API provides the following extensions:
 - idempotent batch creation

Note: This API is currently under active development and may undergo changes.

This section contains documentation for all available providers for the **batches** API.

## Providers

```{toctree}
:maxdepth: 1

inline_reference
```
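Because the routes are OpenAI-compatible, the stock `openai` Python client can drive them directly. A minimal sketch; the server address, dummy API key, and the JSONL input file are assumptions, not part of this change:

```python
from openai import OpenAI

# Assumed local Llama Stack server exposing the OpenAI-compatible prefix.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# Upload a JSONL file of requests, then create a batch over it.
batch_input = client.files.create(file=open("requests.jsonl", "rb"), purpose="batch")
batch = client.batches.create(
    input_file_id=batch_input.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"run": "docs-example"},
)
print(batch.id, batch.status)
```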
23  docs/source/providers/batches/inline_reference.md  (Normal file)

@@ -0,0 +1,23 @@
# inline::reference

## Description

Reference implementation of batches API with KVStore persistence.

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Configuration for the key-value store backend. |
| `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
| `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. |

## Sample Configuration

```yaml
kvstore:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/batches.db

```
@@ -2,6 +2,8 @@

 ## Overview

+Llama Stack Evaluation API for running evaluations on model and agent candidates.
+
 This section contains documentation for all available providers for the **eval** API.

 ## Providers
@@ -10,4 +10,5 @@ This section contains documentation for all available providers for the **files*
 :maxdepth: 1

 inline_localfs
+remote_s3
 ```
33  docs/source/providers/files/remote_s3.md  (Normal file)

@@ -0,0 +1,33 @@
# remote::s3

## Description

AWS S3-based file storage provider for scalable cloud file management with metadata persistence.

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `bucket_name` | `<class 'str'>` | No |  | S3 bucket name to store files |
| `region` | `<class 'str'>` | No | us-east-1 | AWS region where the bucket is located |
| `aws_access_key_id` | `str \| None` | No |  | AWS access key ID (optional if using IAM roles) |
| `aws_secret_access_key` | `str \| None` | No |  | AWS secret access key (optional if using IAM roles) |
| `endpoint_url` | `str \| None` | No |  | Custom S3 endpoint URL (for MinIO, LocalStack, etc.) |
| `auto_create_bucket` | `<class 'bool'>` | No | False | Automatically create the S3 bucket if it doesn't exist |
| `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata |

## Sample Configuration

```yaml
bucket_name: ${env.S3_BUCKET_NAME}
region: ${env.AWS_REGION:=us-east-1}
aws_access_key_id: ${env.AWS_ACCESS_KEY_ID:=}
aws_secret_access_key: ${env.AWS_SECRET_ACCESS_KEY:=}
endpoint_url: ${env.S3_ENDPOINT_URL:=}
auto_create_bucket: ${env.S3_AUTO_CREATE_BUCKET:=false}
metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/s3_files_metadata.db

```
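With this provider configured, files uploaded through the stack's OpenAI-compatible Files API land in the configured S3 bucket while their metadata goes to the metadata_store. A hedged sketch using the openai client; the server address and prefix are assumptions:

```python
from openai import OpenAI

# Assumed local Llama Stack server exposing the OpenAI-compatible prefix.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# Upload a file; bytes are written to the S3 bucket, metadata to the SQL store.
uploaded = client.files.create(file=open("notes.txt", "rb"), purpose="assistants")
print(uploaded.id, uploaded.filename)
```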
@@ -2,6 +2,12 @@

 ## Overview

+Llama Stack Inference API for generating completions, chat completions, and embeddings.
+
+This API provides the raw interface to the underlying models. Two kinds of models are supported:
+- LLM models: these models generate "raw" and "chat" (conversational) completions.
+- Embedding models: these models generate embeddings to be used for semantic search.
+
 This section contains documentation for all available providers for the **inference** API.

 ## Providers
@@ -9,7 +9,9 @@ This section contains documentation for all available providers for the **post_t
 ```{toctree}
 :maxdepth: 1

-inline_huggingface
-inline_torchtune
+inline_huggingface-cpu
+inline_huggingface-gpu
+inline_torchtune-cpu
+inline_torchtune-gpu
 remote_nvidia
 ```
41  docs/source/providers/post_training/inline_huggingface-cpu.md  (Normal file)

@@ -0,0 +1,41 @@
# inline::huggingface-cpu

## Description

HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `device` | `<class 'str'>` | No | cuda | |
| `distributed_backend` | `Literal['fsdp', 'deepspeed'` | No | | |
| `checkpoint_format` | `Literal['full_state', 'huggingface'` | No | huggingface | |
| `chat_template` | `<class 'str'>` | No | <|user|>
{input}
<|assistant|>
{output} | |
| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} | |
| `max_seq_length` | `<class 'int'>` | No | 2048 | |
| `gradient_checkpointing` | `<class 'bool'>` | No | False | |
| `save_total_limit` | `<class 'int'>` | No | 3 | |
| `logging_steps` | `<class 'int'>` | No | 10 | |
| `warmup_ratio` | `<class 'float'>` | No | 0.1 | |
| `weight_decay` | `<class 'float'>` | No | 0.01 | |
| `dataloader_num_workers` | `<class 'int'>` | No | 4 | |
| `dataloader_pin_memory` | `<class 'bool'>` | No | True | |
| `dpo_beta` | `<class 'float'>` | No | 0.1 | |
| `use_reference_model` | `<class 'bool'>` | No | True | |
| `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair'` | No | sigmoid | |
| `dpo_output_dir` | `<class 'str'>` | No | | |

## Sample Configuration

```yaml
checkpoint_format: huggingface
distributed_backend: null
device: cpu
dpo_output_dir: ~/.llama/dummy/dpo_output

```
41  docs/source/providers/post_training/inline_huggingface-gpu.md  (Normal file)

@@ -0,0 +1,41 @@
# inline::huggingface-gpu

## Description

HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `device` | `<class 'str'>` | No | cuda | |
| `distributed_backend` | `Literal['fsdp', 'deepspeed'` | No | | |
| `checkpoint_format` | `Literal['full_state', 'huggingface'` | No | huggingface | |
| `chat_template` | `<class 'str'>` | No | <|user|>
{input}
<|assistant|>
{output} | |
| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} | |
| `max_seq_length` | `<class 'int'>` | No | 2048 | |
| `gradient_checkpointing` | `<class 'bool'>` | No | False | |
| `save_total_limit` | `<class 'int'>` | No | 3 | |
| `logging_steps` | `<class 'int'>` | No | 10 | |
| `warmup_ratio` | `<class 'float'>` | No | 0.1 | |
| `weight_decay` | `<class 'float'>` | No | 0.01 | |
| `dataloader_num_workers` | `<class 'int'>` | No | 4 | |
| `dataloader_pin_memory` | `<class 'bool'>` | No | True | |
| `dpo_beta` | `<class 'float'>` | No | 0.1 | |
| `use_reference_model` | `<class 'bool'>` | No | True | |
| `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair'` | No | sigmoid | |
| `dpo_output_dir` | `<class 'str'>` | No | | |

## Sample Configuration

```yaml
checkpoint_format: huggingface
distributed_backend: null
device: cpu
dpo_output_dir: ~/.llama/dummy/dpo_output

```
20  docs/source/providers/post_training/inline_torchtune-cpu.md  (Normal file)

@@ -0,0 +1,20 @@
# inline::torchtune-cpu

## Description

TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `torch_seed` | `int \| None` | No | | |
| `checkpoint_format` | `Literal['meta', 'huggingface'` | No | meta | |

## Sample Configuration

```yaml
checkpoint_format: meta

```
20  docs/source/providers/post_training/inline_torchtune-gpu.md  (Normal file)

@@ -0,0 +1,20 @@
# inline::torchtune-gpu

## Description

TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `torch_seed` | `int \| None` | No | | |
| `checkpoint_format` | `Literal['meta', 'huggingface'` | No | meta | |

## Sample Configuration

```yaml
checkpoint_format: meta

```
@@ -623,6 +623,62 @@ class OpenAIResponseObjectStreamResponseMcpCallCompleted(BaseModel):
     type: Literal["response.mcp_call.completed"] = "response.mcp_call.completed"


+@json_schema_type
+class OpenAIResponseContentPartOutputText(BaseModel):
+    type: Literal["output_text"] = "output_text"
+    text: str
+    # TODO: add annotations, logprobs, etc.
+
+
+@json_schema_type
+class OpenAIResponseContentPartRefusal(BaseModel):
+    type: Literal["refusal"] = "refusal"
+    refusal: str
+
+
+OpenAIResponseContentPart = Annotated[
+    OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal,
+    Field(discriminator="type"),
+]
+register_schema(OpenAIResponseContentPart, name="OpenAIResponseContentPart")
+
+
+@json_schema_type
+class OpenAIResponseObjectStreamResponseContentPartAdded(BaseModel):
+    """Streaming event for when a new content part is added to a response item.
+
+    :param response_id: Unique identifier of the response containing this content
+    :param item_id: Unique identifier of the output item containing this content part
+    :param part: The content part that was added
+    :param sequence_number: Sequential number for ordering streaming events
+    :param type: Event type identifier, always "response.content_part.added"
+    """
+
+    response_id: str
+    item_id: str
+    part: OpenAIResponseContentPart
+    sequence_number: int
+    type: Literal["response.content_part.added"] = "response.content_part.added"
+
+
+@json_schema_type
+class OpenAIResponseObjectStreamResponseContentPartDone(BaseModel):
+    """Streaming event for when a content part is completed.
+
+    :param response_id: Unique identifier of the response containing this content
+    :param item_id: Unique identifier of the output item containing this content part
+    :param part: The completed content part
+    :param sequence_number: Sequential number for ordering streaming events
+    :param type: Event type identifier, always "response.content_part.done"
+    """
+
+    response_id: str
+    item_id: str
+    part: OpenAIResponseContentPart
+    sequence_number: int
+    type: Literal["response.content_part.done"] = "response.content_part.done"
+
+
 OpenAIResponseObjectStream = Annotated[
     OpenAIResponseObjectStreamResponseCreated
     | OpenAIResponseObjectStreamResponseOutputItemAdded

@@ -642,6 +698,8 @@ OpenAIResponseObjectStream = Annotated[
     | OpenAIResponseObjectStreamResponseMcpCallInProgress
     | OpenAIResponseObjectStreamResponseMcpCallFailed
     | OpenAIResponseObjectStreamResponseMcpCallCompleted
+    | OpenAIResponseObjectStreamResponseContentPartAdded
+    | OpenAIResponseObjectStreamResponseContentPartDone
     | OpenAIResponseObjectStreamResponseCompleted,
     Field(discriminator="type"),
 ]
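A hedged sketch of a client-side consumer that reacts to the two new content-part events; treating the streamed events as plain dicts keyed by `type` is an assumption about how a caller deserializes the stream:

```python
def completed_text_parts(events) -> list[str]:
    """Collect finished content parts from a Responses stream."""
    texts: list[str] = []
    for event in events:
        if event.get("type") != "response.content_part.done":
            continue
        part = event["part"]
        if part["type"] == "output_text":
            texts.append(part["text"])
        elif part["type"] == "refusal":
            texts.append(f"[refusal] {part['refusal']}")
    return texts
```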
9  llama_stack/apis/batches/__init__.py  (Normal file)

@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .batches import Batches, BatchObject, ListBatchesResponse

__all__ = ["Batches", "BatchObject", "ListBatchesResponse"]
95  llama_stack/apis/batches/batches.py  (Normal file)

@@ -0,0 +1,95 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Literal, Protocol, runtime_checkable

from pydantic import BaseModel, Field

from llama_stack.schema_utils import json_schema_type, webmethod

try:
    from openai.types import Batch as BatchObject
except ImportError as e:
    raise ImportError("OpenAI package is required for batches API. Please install it with: pip install openai") from e


@json_schema_type
class ListBatchesResponse(BaseModel):
    """Response containing a list of batch objects."""

    object: Literal["list"] = "list"
    data: list[BatchObject] = Field(..., description="List of batch objects")
    first_id: str | None = Field(default=None, description="ID of the first batch in the list")
    last_id: str | None = Field(default=None, description="ID of the last batch in the list")
    has_more: bool = Field(default=False, description="Whether there are more batches available")


@runtime_checkable
class Batches(Protocol):
    """
    The Batches API enables efficient processing of multiple requests in a single operation,
    particularly useful for processing large datasets, batch evaluation workflows, and
    cost-effective inference at scale.

    The API is designed to allow use of openai client libraries for seamless integration.

    This API provides the following extensions:
     - idempotent batch creation

    Note: This API is currently under active development and may undergo changes.
    """

    @webmethod(route="/openai/v1/batches", method="POST")
    async def create_batch(
        self,
        input_file_id: str,
        endpoint: str,
        completion_window: Literal["24h"],
        metadata: dict[str, str] | None = None,
        idempotency_key: str | None = None,
    ) -> BatchObject:
        """Create a new batch for processing multiple API requests.

        :param input_file_id: The ID of an uploaded file containing requests for the batch.
        :param endpoint: The endpoint to be used for all requests in the batch.
        :param completion_window: The time window within which the batch should be processed.
        :param metadata: Optional metadata for the batch.
        :param idempotency_key: Optional idempotency key. When provided, enables idempotent behavior.
        :returns: The created batch object.
        """
        ...

    @webmethod(route="/openai/v1/batches/{batch_id}", method="GET")
    async def retrieve_batch(self, batch_id: str) -> BatchObject:
        """Retrieve information about a specific batch.

        :param batch_id: The ID of the batch to retrieve.
        :returns: The batch object.
        """
        ...

    @webmethod(route="/openai/v1/batches/{batch_id}/cancel", method="POST")
    async def cancel_batch(self, batch_id: str) -> BatchObject:
        """Cancel a batch that is in progress.

        :param batch_id: The ID of the batch to cancel.
        :returns: The updated batch object.
        """
        ...

    @webmethod(route="/openai/v1/batches", method="GET")
    async def list_batches(
        self,
        after: str | None = None,
        limit: int = 20,
    ) -> ListBatchesResponse:
        """List all batches for the current user.

        :param after: A cursor for pagination; returns batches after this batch ID.
        :param limit: Number of batches to return (default 20, max 100).
        :returns: A list of batch objects.
        """
        ...
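A sketch of the idempotent-creation extension described in the docstring, using plain httpx against the routes declared above; the base URL and the exact placement of `idempotency_key` in the JSON body are assumptions:

```python
import httpx

BASE = "http://localhost:8321/v1/openai/v1"  # assumed server address

body = {
    "input_file_id": "file-abc123",  # placeholder file id
    "endpoint": "/v1/chat/completions",
    "completion_window": "24h",
    "idempotency_key": "nightly-eval-001",
}
first = httpx.post(f"{BASE}/batches", json=body).json()
second = httpx.post(f"{BASE}/batches", json=body).json()
# With the same key and identical parameters, the second call should return the same batch.
print(first["id"] == second["id"])
```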
@@ -72,3 +72,10 @@ class ModelTypeError(TypeError):
             f"Model '{model_name}' is of type '{model_type}' rather than the expected type '{expected_model_type}'"
         )
         super().__init__(message)
+
+
+class ConflictError(ValueError):
+    """raised when an operation cannot be performed due to a conflict with the current state"""
+
+    def __init__(self, message: str) -> None:
+        super().__init__(message)
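Illustrative only: one way a provider could use ConflictError, for example when an idempotency key is reused with different parameters; the import path is assumed from this diff:

```python
from llama_stack.apis.common.errors import ConflictError  # assumed module path

def check_idempotent_reuse(key: str, existing_params: dict, new_params: dict) -> None:
    # Reusing a key with different parameters conflicts with the existing state.
    if existing_params != new_params:
        raise ConflictError(f"idempotency key '{key}' was already used with different parameters")
```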
@@ -86,6 +86,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
     :cvar inference: Text generation, chat completions, and embeddings
     :cvar safety: Content moderation and safety shields
     :cvar agents: Agent orchestration and execution
+    :cvar batches: Batch processing for asynchronous API requests
     :cvar vector_io: Vector database operations and queries
     :cvar datasetio: Dataset input/output operations
     :cvar scoring: Model output evaluation and scoring

@@ -108,6 +109,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
     inference = "inference"
     safety = "safety"
     agents = "agents"
+    batches = "batches"
     vector_io = "vector_io"
     datasetio = "datasetio"
     scoring = "scoring"
@@ -22,6 +22,7 @@ class OpenAIFilePurpose(StrEnum):
     """

     ASSISTANTS = "assistants"
+    BATCH = "batch"
     # TODO: Add other purposes as needed
@@ -473,6 +473,28 @@ class EmbeddingsResponse(BaseModel):
     embeddings: list[list[float]]


+@json_schema_type
+class RerankData(BaseModel):
+    """A single rerank result from a reranking response.
+
+    :param index: The original index of the document in the input list
+    :param relevance_score: The relevance score from the model output. Values are inverted when applicable so that higher scores indicate greater relevance.
+    """
+
+    index: int
+    relevance_score: float
+
+
+@json_schema_type
+class RerankResponse(BaseModel):
+    """Response from a reranking request.
+
+    :param data: List of rerank result objects, sorted by relevance score (descending)
+    """
+
+    data: list[RerankData]
+
+
 @json_schema_type
 class OpenAIChatCompletionContentPartTextParam(BaseModel):
     """Text content part for OpenAI-compatible chat completion messages.

@@ -1046,6 +1068,7 @@ class InferenceProvider(Protocol):
         :returns: A BatchCompletionResponse with the full completions.
         """
         raise NotImplementedError("Batch completion is not implemented")
+        return  # this is so mypy's safe-super rule will consider the method concrete

     @webmethod(route="/inference/chat-completion", method="POST")
     async def chat_completion(

@@ -1110,6 +1133,7 @@ class InferenceProvider(Protocol):
         :returns: A BatchChatCompletionResponse with the full completions.
         """
         raise NotImplementedError("Batch chat completion is not implemented")
+        return  # this is so mypy's safe-super rule will consider the method concrete

     @webmethod(route="/inference/embeddings", method="POST")
     async def embeddings(

@@ -1131,6 +1155,25 @@ class InferenceProvider(Protocol):
         """
         ...

+    @webmethod(route="/inference/rerank", method="POST", experimental=True)
+    async def rerank(
+        self,
+        model: str,
+        query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
+        items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
+        max_num_results: int | None = None,
+    ) -> RerankResponse:
+        """Rerank a list of documents based on their relevance to a query.
+
+        :param model: The identifier of the reranking model to use.
+        :param query: The search query to rank items against. Can be a string, text content part, or image content part. The input must not exceed the model's max input token length.
+        :param items: List of items to rerank. Each item can be a string, text content part, or image content part. Each input must not exceed the model's max input token length.
+        :param max_num_results: (Optional) Maximum number of results to return. Default: returns all.
+        :returns: RerankResponse with indices sorted by relevance score (descending).
+        """
+        raise NotImplementedError("Reranking is not implemented")
+        return  # this is so mypy's safe-super rule will consider the method concrete
+
     @webmethod(route="/openai/v1/completions", method="POST")
     async def openai_completion(
         self,
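A hedged sketch of calling the new experimental rerank route over HTTP. The request and response shapes follow the signature above; the server address, the wire format, and the model id are assumptions:

```python
import httpx

resp = httpx.post(
    "http://localhost:8321/v1/inference/rerank",  # assumed server address
    json={
        "model": "example/reranker",  # placeholder model id
        "query": "How do I restart the server?",
        "items": [
            "Restart with `llama stack run` after editing the config.",
            "The moon is about 384,000 km from Earth.",
        ],
        "max_num_results": 1,
    },
)
# Results come back sorted by relevance score, highest first.
for item in resp.json()["data"]:
    print(item["index"], item["relevance_score"])
```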
@@ -386,6 +386,7 @@ class MetricDataPoint(BaseModel):

     timestamp: int
     value: float
+    unit: str


 @json_schema_type

@@ -518,7 +519,7 @@ class Telemetry(Protocol):
         metric_name: str,
         start_time: int,
         end_time: int | None = None,
-        granularity: str | None = "1d",
+        granularity: str | None = None,
         query_type: MetricQueryType = MetricQueryType.RANGE,
         label_matchers: list[MetricLabelMatcher] | None = None,
     ) -> QueryMetricsResponse:
@@ -15,7 +15,7 @@ from llama_stack.log import get_logger

 REPO_ROOT = Path(__file__).parent.parent.parent.parent

-logger = get_logger(name=__name__, category="server")
+logger = get_logger(name=__name__, category="cli")


 class StackRun(Subcommand):
@@ -5,7 +5,6 @@
 # the root directory of this source tree.

 import importlib.resources
-import logging
 import sys

 from pydantic import BaseModel

@@ -17,9 +16,10 @@ from llama_stack.core.external import load_external_apis
 from llama_stack.core.utils.exec import run_command
 from llama_stack.core.utils.image_types import LlamaStackImageType
 from llama_stack.distributions.template import DistributionTemplate
+from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import Api

-log = logging.getLogger(__name__)
+log = get_logger(name=__name__, category="core")

 # These are the dependencies needed by the distribution server.
 # `llama-stack` is automatically installed by the installation script.
@ -1,207 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
|
||||||
# the root directory of this source tree.
|
|
||||||
|
|
||||||
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
|
|
||||||
LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
|
|
||||||
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
|
|
||||||
PYPI_VERSION=${PYPI_VERSION:-}
|
|
||||||
# This timeout (in seconds) is necessary when installing PyTorch via uv since it's likely to time out
|
|
||||||
# Reference: https://github.com/astral-sh/uv/pull/1694
|
|
||||||
UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Define color codes
|
|
||||||
RED='\033[0;31m'
|
|
||||||
GREEN='\033[0;32m'
|
|
||||||
NC='\033[0m' # No Color
|
|
||||||
|
|
||||||
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
|
|
||||||
source "$SCRIPT_DIR/common.sh"
|
|
||||||
|
|
||||||
# Usage function
|
|
||||||
usage() {
|
|
||||||
echo "Usage: $0 --env-name <conda_env_name> --build-file-path <build_file_path> --normal-deps <pip_dependencies> [--external-provider-deps <external_provider_deps>] [--optional-deps <special_pip_deps>]"
|
|
||||||
echo "Example: $0 --env-name my-conda-env --build-file-path ./my-stack-build.yaml --normal-deps 'numpy pandas scipy' --external-provider-deps 'foo' --optional-deps 'bar'"
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
|
|
||||||
# Parse arguments
|
|
||||||
env_name=""
|
|
||||||
build_file_path=""
|
|
||||||
normal_deps=""
|
|
||||||
external_provider_deps=""
|
|
||||||
optional_deps=""
|
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
|
||||||
key="$1"
|
|
||||||
case "$key" in
|
|
||||||
--env-name)
|
|
||||||
if [[ -z "$2" || "$2" == --* ]]; then
|
|
||||||
echo "Error: --env-name requires a string value" >&2
|
|
||||||
usage
|
|
||||||
fi
|
|
||||||
env_name="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
--build-file-path)
|
|
||||||
if [[ -z "$2" || "$2" == --* ]]; then
|
|
||||||
echo "Error: --build-file-path requires a string value" >&2
|
|
||||||
usage
|
|
||||||
fi
|
|
||||||
build_file_path="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
--normal-deps)
|
|
||||||
if [[ -z "$2" || "$2" == --* ]]; then
|
|
||||||
echo "Error: --normal-deps requires a string value" >&2
|
|
||||||
usage
|
|
||||||
fi
|
|
||||||
normal_deps="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
--external-provider-deps)
|
|
||||||
if [[ -z "$2" || "$2" == --* ]]; then
|
|
||||||
echo "Error: --external-provider-deps requires a string value" >&2
|
|
||||||
usage
|
|
||||||
fi
|
|
||||||
external_provider_deps="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
--optional-deps)
|
|
||||||
if [[ -z "$2" || "$2" == --* ]]; then
|
|
||||||
echo "Error: --optional-deps requires a string value" >&2
|
|
||||||
usage
|
|
||||||
fi
|
|
||||||
optional_deps="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "Unknown option: $1" >&2
|
|
||||||
usage
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
|
|
||||||
# Check required arguments
|
|
||||||
if [[ -z "$env_name" || -z "$build_file_path" || -z "$normal_deps" ]]; then
|
|
||||||
echo "Error: --env-name, --build-file-path, and --normal-deps are required." >&2
|
|
||||||
usage
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -n "$LLAMA_STACK_DIR" ]; then
|
|
||||||
echo "Using llama-stack-dir=$LLAMA_STACK_DIR"
|
|
||||||
fi
|
|
||||||
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
|
|
||||||
echo "Using llama-stack-client-dir=$LLAMA_STACK_CLIENT_DIR"
|
|
||||||
fi
|
|
||||||
|
|
||||||
ensure_conda_env_python310() {
|
|
||||||
# Use only global variables set by flag parser
|
|
||||||
local python_version="3.12"
|
|
||||||
|
|
||||||
if ! is_command_available conda; then
|
|
||||||
printf "${RED}Error: conda command not found. Is Conda installed and in your PATH?${NC}" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if conda env list | grep -q "^${env_name} "; then
|
|
||||||
printf "Conda environment '${env_name}' exists. Checking Python version...\n"
|
|
||||||
current_version=$(conda run -n "${env_name}" python --version 2>&1 | cut -d' ' -f2 | cut -d'.' -f1,2)
|
|
||||||
if [ "$current_version" = "$python_version" ]; then
|
|
||||||
printf "Environment '${env_name}' already has Python ${python_version}. No action needed.\n"
|
|
||||||
else
|
|
||||||
printf "Updating environment '${env_name}' to Python ${python_version}...\n"
|
|
||||||
conda install -n "${env_name}" python="${python_version}" -y
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
printf "Conda environment '${env_name}' does not exist. Creating with Python ${python_version}...\n"
|
|
||||||
conda create -n "${env_name}" python="${python_version}" -y
|
|
||||||
fi
|
|
||||||
|
|
||||||
eval "$(conda shell.bash hook)"
|
|
||||||
conda deactivate && conda activate "${env_name}"
|
|
||||||
"$CONDA_PREFIX"/bin/pip install uv
|
|
||||||
|
|
||||||
if [ -n "$TEST_PYPI_VERSION" ]; then
|
|
||||||
uv pip install fastapi libcst
|
|
||||||
uv pip install --extra-index-url https://test.pypi.org/simple/ \
|
|
||||||
llama-stack=="$TEST_PYPI_VERSION" \
|
|
||||||
"$normal_deps"
|
|
||||||
if [ -n "$optional_deps" ]; then
|
|
||||||
IFS='#' read -ra parts <<<"$optional_deps"
|
|
||||||
for part in "${parts[@]}"; do
|
|
||||||
echo "$part"
|
|
||||||
uv pip install $part
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
if [ -n "$external_provider_deps" ]; then
|
|
||||||
IFS='#' read -ra parts <<<"$external_provider_deps"
|
|
||||||
for part in "${parts[@]}"; do
|
|
||||||
echo "$part"
|
|
||||||
uv pip install "$part"
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
if [ -n "$LLAMA_STACK_DIR" ]; then
|
|
||||||
if [ ! -d "$LLAMA_STACK_DIR" ]; then
|
|
||||||
printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: $LLAMA_STACK_DIR${NC}\n" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
printf "Installing from LLAMA_STACK_DIR: $LLAMA_STACK_DIR\n"
|
|
||||||
uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"
|
|
||||||
else
|
|
||||||
PYPI_VERSION="${PYPI_VERSION:-}"
|
|
||||||
if [ -n "$PYPI_VERSION" ]; then
|
|
||||||
SPEC_VERSION="llama-stack==${PYPI_VERSION}"
|
|
||||||
else
|
|
||||||
SPEC_VERSION="llama-stack"
|
|
||||||
fi
|
|
||||||
uv pip install --no-cache-dir "$SPEC_VERSION"
|
|
||||||
fi
|
|
||||||
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
|
|
||||||
if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
|
|
||||||
printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: $LLAMA_STACK_CLIENT_DIR${NC}\n" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
printf "Installing from LLAMA_STACK_CLIENT_DIR: $LLAMA_STACK_CLIENT_DIR\n"
|
|
||||||
uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
|
|
||||||
fi
|
|
||||||
printf "Installing pip dependencies\n"
|
|
||||||
uv pip install $normal_deps
|
|
||||||
if [ -n "$optional_deps" ]; then
|
|
||||||
IFS='#' read -ra parts <<<"$optional_deps"
|
|
||||||
for part in "${parts[@]}"; do
|
|
||||||
echo "$part"
|
|
||||||
uv pip install $part
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
if [ -n "$external_provider_deps" ]; then
|
|
||||||
IFS='#' read -ra parts <<<"$external_provider_deps"
|
|
||||||
for part in "${parts[@]}"; do
|
|
||||||
echo "Getting provider spec for module: $part and installing dependencies"
|
|
||||||
package_name=$(echo "$part" | sed 's/[<>=!].*//')
|
|
||||||
python3 -c "
|
|
||||||
import importlib
|
|
||||||
import sys
|
|
||||||
try:
|
|
||||||
module = importlib.import_module(f'$package_name.provider')
|
|
||||||
spec = module.get_provider_spec()
|
|
||||||
if hasattr(spec, 'pip_packages') and spec.pip_packages:
|
|
||||||
print('\\n'.join(spec.pip_packages))
|
|
||||||
except Exception as e:
|
|
||||||
print(f'Error getting provider spec for $package_name: {e}', file=sys.stderr)
|
|
||||||
" | uv pip install -r -
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
mv "$build_file_path" "$CONDA_PREFIX"/llamastack-build.yaml
|
|
||||||
echo "Build spec configuration saved at $CONDA_PREFIX/llamastack-build.yaml"
|
|
||||||
}
|
|
||||||
|
|
||||||
ensure_conda_env_python310 "$env_name" "$build_file_path" "$normal_deps" "$optional_deps" "$external_provider_deps"
|
|
|
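For context, the external-provider branch above relies on exactly two things from each module listed in external_provider_deps: an importable `<package>.provider` submodule and a `get_provider_spec()` function whose return value exposes a `pip_packages` list. A minimal sketch of such a module follows; the package name and the spec stand-in are illustrative only, not the real ProviderSpec constructor.

# my_custom_provider/provider.py  (hypothetical external provider package)
from dataclasses import dataclass, field


@dataclass
class _Spec:
    # stand-in for llama_stack.providers.datatypes.ProviderSpec;
    # only pip_packages is read by the build script above
    pip_packages: list[str] = field(default_factory=list)


def get_provider_spec() -> _Spec:
    # the build script prints these and pipes them into `uv pip install -r -`
    return _Spec(pip_packages=["httpx>=0.27", "pydantic>=2"])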
@@ -151,23 +151,37 @@ run() {
     fi
   else
     if [ -n "$LLAMA_STACK_DIR" ]; then
-      if [ ! -d "$LLAMA_STACK_DIR" ]; then
+      # only warn if DIR does not start with "git+"
+      if [ ! -d "$LLAMA_STACK_DIR" ] && [[ "$LLAMA_STACK_DIR" != git+* ]]; then
         printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_DIR" >&2
         exit 1
       fi
       printf "Installing from LLAMA_STACK_DIR: %s\n" "$LLAMA_STACK_DIR"
-      uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"
+      # editable only if LLAMA_STACK_DIR does not start with "git+"
+      if [[ "$LLAMA_STACK_DIR" != git+* ]]; then
+        EDITABLE="-e"
+      else
+        EDITABLE=""
+      fi
+      uv pip install --no-cache-dir $EDITABLE "$LLAMA_STACK_DIR"
     else
       uv pip install --no-cache-dir llama-stack
     fi

     if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
-      if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
+      # only warn if DIR does not start with "git+"
+      if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ] && [[ "$LLAMA_STACK_CLIENT_DIR" != git+* ]]; then
         printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_CLIENT_DIR" >&2
         exit 1
       fi
       printf "Installing from LLAMA_STACK_CLIENT_DIR: %s\n" "$LLAMA_STACK_CLIENT_DIR"
-      uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
+      # editable only if LLAMA_STACK_CLIENT_DIR does not start with "git+"
+      if [[ "$LLAMA_STACK_CLIENT_DIR" != git+* ]]; then
+        EDITABLE="-e"
+      else
+        EDITABLE=""
+      fi
+      uv pip install --no-cache-dir $EDITABLE "$LLAMA_STACK_CLIENT_DIR"
     fi

     printf "Installing pip dependencies\n"
@@ -3,7 +3,6 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-import logging
 import textwrap
 from typing import Any

@@ -21,9 +20,10 @@ from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
 from llama_stack.core.utils.config_dirs import EXTERNAL_PROVIDERS_DIR
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.core.utils.prompt_for_config import prompt_for_config
+from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import Api, ProviderSpec

-logger = logging.getLogger(__name__)
+logger = get_logger(name=__name__, category="core")


 def configure_single_provider(registry: dict[str, ProviderSpec], provider: Provider) -> Provider:
@@ -318,6 +318,41 @@ class QuotaConfig(BaseModel):
     period: QuotaPeriod = Field(default=QuotaPeriod.DAY, description="Quota period to set")


+class CORSConfig(BaseModel):
+    allow_origins: list[str] = Field(default_factory=list)
+    allow_origin_regex: str | None = Field(default=None)
+    allow_methods: list[str] = Field(default=["OPTIONS"])
+    allow_headers: list[str] = Field(default_factory=list)
+    allow_credentials: bool = Field(default=False)
+    expose_headers: list[str] = Field(default_factory=list)
+    max_age: int = Field(default=600, ge=0)
+
+    @model_validator(mode="after")
+    def validate_credentials_config(self) -> Self:
+        if self.allow_credentials and (self.allow_origins == ["*"] or "*" in self.allow_origins):
+            raise ValueError("Cannot use wildcard origins with credentials enabled")
+        return self
+
+
+def process_cors_config(cors_config: bool | CORSConfig | None) -> CORSConfig | None:
+    if cors_config is False or cors_config is None:
+        return None
+
+    if cors_config is True:
+        # dev mode: allow localhost on any port
+        return CORSConfig(
+            allow_origins=[],
+            allow_origin_regex=r"https?://localhost:\d+",
+            allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
+            allow_headers=["Content-Type", "Authorization", "X-Requested-With"],
+        )
+
+    if isinstance(cors_config, CORSConfig):
+        return cors_config
+
+    raise ValueError(f"Expected bool or CORSConfig, got {type(cors_config).__name__}")
+
+
 class ServerConfig(BaseModel):
     port: int = Field(
         default=8321,
@@ -349,6 +384,12 @@ class ServerConfig(BaseModel):
         default=None,
         description="Per client quota request configuration",
     )
+    cors: bool | CORSConfig | None = Field(
+        default=None,
+        description="CORS configuration for cross-origin requests. Can be:\n"
+        "- true: Enable localhost CORS for development\n"
+        "- {allow_origins: [...], allow_methods: [...], ...}: Full configuration",
+    )


 class StackRunConfig(BaseModel):
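A quick sketch of how the new config type behaves, based only on the code in the hunk above (the import path is confirmed by the server.py hunk later in this diff; run it against a build that includes this change):

from llama_stack.core.datatypes import CORSConfig, process_cors_config

# `cors: true` in run.yaml -> permissive localhost-only defaults for development
dev = process_cors_config(True)
assert dev is not None and dev.allow_origin_regex == r"https?://localhost:\d+"

# `cors: false` or unset -> no middleware is installed
assert process_cors_config(False) is None

# a full configuration passes through unchanged
explicit = CORSConfig(allow_origins=["https://example.com"], allow_methods=["GET"])
assert process_cors_config(explicit) is explicit

# wildcard origins together with credentials are rejected by the validator
try:
    CORSConfig(allow_origins=["*"], allow_credentials=True)
except Exception as e:
    print("rejected:", e)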
@@ -7,7 +7,7 @@
 import asyncio
 import inspect
 import json
-import logging
+import logging  # allow-direct-logging
 import os
 import sys
 from concurrent.futures import ThreadPoolExecutor
@@ -48,6 +48,7 @@ from llama_stack.core.stack import (
 from llama_stack.core.utils.config import redact_sensitive_fields
 from llama_stack.core.utils.context import preserve_contexts_async_generator
 from llama_stack.core.utils.exec import in_notebook
+from llama_stack.log import get_logger
 from llama_stack.providers.utils.telemetry.tracing import (
     CURRENT_TRACE_CONTEXT,
     end_trace,
@@ -55,7 +56,7 @@ from llama_stack.providers.utils.telemetry.tracing import (
     start_trace,
 )

-logger = logging.getLogger(__name__)
+logger = get_logger(name=__name__, category="core")

 T = TypeVar("T")

@@ -145,39 +146,26 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
     ):
         super().__init__()
         self.async_client = AsyncLlamaStackAsLibraryClient(
-            config_path_or_distro_name, custom_provider_registry, provider_data
+            config_path_or_distro_name, custom_provider_registry, provider_data, skip_logger_removal
         )
         self.pool_executor = ThreadPoolExecutor(max_workers=4)
-        self.skip_logger_removal = skip_logger_removal
         self.provider_data = provider_data

         self.loop = asyncio.new_event_loop()

-    def initialize(self):
-        if in_notebook():
-            import nest_asyncio
-
-            nest_asyncio.apply()
-        if not self.skip_logger_removal:
-            self._remove_root_logger_handlers()
-
         # use a new event loop to avoid interfering with the main event loop
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
         try:
-            return loop.run_until_complete(self.async_client.initialize())
+            loop.run_until_complete(self.async_client.initialize())
         finally:
             asyncio.set_event_loop(None)

-    def _remove_root_logger_handlers(self):
+    def initialize(self):
         """
-        Remove all handlers from the root logger. Needed to avoid polluting the console with logs.
+        Deprecated method for backward compatibility.
         """
-        root_logger = logging.getLogger()
-
-        for handler in root_logger.handlers[:]:
-            root_logger.removeHandler(handler)
-            logger.info(f"Removed handler {handler.__class__.__name__} from root logger")
+        pass

     def request(self, *args, **kwargs):
         loop = self.loop
@@ -215,6 +203,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         config_path_or_distro_name: str,
         custom_provider_registry: ProviderRegistry | None = None,
         provider_data: dict[str, Any] | None = None,
+        skip_logger_removal: bool = False,
     ):
         super().__init__()
         # when using the library client, we should not log to console since many
@@ -222,6 +211,13 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         current_sinks = os.environ.get("TELEMETRY_SINKS", "sqlite").split(",")
         os.environ["TELEMETRY_SINKS"] = ",".join(sink for sink in current_sinks if sink != "console")

+        if in_notebook():
+            import nest_asyncio
+
+            nest_asyncio.apply()
+        if not skip_logger_removal:
+            self._remove_root_logger_handlers()
+
         if config_path_or_distro_name.endswith(".yaml"):
             config_path = Path(config_path_or_distro_name)
             if not config_path.exists():
@@ -238,7 +234,24 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         self.provider_data = provider_data
         self.route_impls: RouteImpls | None = None  # Initialize to None to prevent AttributeError

+    def _remove_root_logger_handlers(self):
+        """
+        Remove all handlers from the root logger. Needed to avoid polluting the console with logs.
+        """
+        root_logger = logging.getLogger()
+
+        for handler in root_logger.handlers[:]:
+            root_logger.removeHandler(handler)
+            logger.info(f"Removed handler {handler.__class__.__name__} from root logger")
+
     async def initialize(self) -> bool:
+        """
+        Initialize the async client.
+
+        Returns:
+            bool: True if initialization was successful
+        """
+
         try:
             self.route_impls = None
             self.impls = await construct_stack(self.config, self.custom_provider_registry)
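With this change the sync wrapper is fully initialized by its constructor, and the old two-step construct-then-initialize() pattern survives only as a no-op. A minimal usage sketch follows; the import path is assumed from the core package layout seen elsewhere in this diff, and the distribution name "starter" is a placeholder rather than something this hunk mandates. The listing call at the end uses the llama-stack-client surface the wrapper inherits.

from llama_stack.core.library_client import LlamaStackAsLibraryClient

# construction now builds the stack eagerly on a private event loop;
# pass skip_logger_removal=True to keep your own root-logger handlers
client = LlamaStackAsLibraryClient("starter", skip_logger_removal=True)

# still allowed for backward compatibility, but it is a no-op now
client.initialize()

models = client.models.list()
print([m.identifier for m in models])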
@@ -6,15 +6,15 @@

 import contextvars
 import json
-import logging
 from contextlib import AbstractContextManager
 from typing import Any

 from llama_stack.core.datatypes import User
+from llama_stack.log import get_logger

 from .utils.dynamic import instantiate_class_type

-log = logging.getLogger(__name__)
+log = get_logger(name=__name__, category="core")

 # Context variable for request provider data and auth attributes
 PROVIDER_DATA_VAR = contextvars.ContextVar("provider_data", default=None)
@@ -8,6 +8,7 @@ import inspect
 from typing import Any

 from llama_stack.apis.agents import Agents
+from llama_stack.apis.batches import Batches
 from llama_stack.apis.benchmarks import Benchmarks
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
@@ -75,6 +76,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
         Api.agents: Agents,
         Api.inference: Inference,
         Api.inspect: Inspect,
+        Api.batches: Batches,
         Api.vector_io: VectorIO,
         Api.vector_dbs: VectorDBs,
         Api.models: Models,
@@ -12,7 +12,7 @@ from llama_stack.apis.datasets import DatasetPurpose, DataSource
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import RoutingTable

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routers")


 class DatasetIORouter(DatasetIO):
@@ -16,7 +16,7 @@ from llama_stack.apis.scoring import (
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import RoutingTable

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routers")


 class ScoringRouter(Scoring):
@@ -65,7 +65,7 @@ from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
 from llama_stack.providers.utils.inference.inference_store import InferenceStore
 from llama_stack.providers.utils.telemetry.tracing import get_current_span

-logger = get_logger(name=__name__, category="inference")
+logger = get_logger(name=__name__, category="core::routers")


 class InferenceRouter(Inference):
@@ -6,16 +6,14 @@

 from typing import Any

-from llama_stack.apis.inference import (
-    Message,
-)
+from llama_stack.apis.inference import Message
 from llama_stack.apis.safety import RunShieldResponse, Safety
 from llama_stack.apis.safety.safety import ModerationObject
 from llama_stack.apis.shields import Shield
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import RoutingTable

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routers")


 class SafetyRouter(Safety):
@@ -68,6 +66,7 @@ class SafetyRouter(Safety):
         list_shields_response = await self.routing_table.list_shields()

         matches = [s.identifier for s in list_shields_response.data if model == s.provider_resource_id]
+
         if not matches:
             raise ValueError(f"No shield associated with provider_resource id {model}")
         if len(matches) > 1:
@@ -22,7 +22,7 @@ from llama_stack.log import get_logger

 from ..routing_tables.toolgroups import ToolGroupsRoutingTable

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routers")


 class ToolRuntimeRouter(ToolRuntime):
@@ -30,7 +30,7 @@ from llama_stack.apis.vector_io import (
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routers")


 class VectorIORouter(VectorIO):
@@ -14,7 +14,7 @@ from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routing_tables")


 class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
@@ -23,7 +23,7 @@ from llama_stack.core.store import DistributionRegistry
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import Api, RoutingTable

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routing_tables")


 def get_impl_api(p: Any) -> Api:
@@ -26,7 +26,7 @@ from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routing_tables")


 class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
@@ -17,7 +17,7 @@ from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl, lookup_model

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routing_tables")


 class ModelsRoutingTable(CommonRoutingTableImpl, Models):
@@ -19,7 +19,7 @@ from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routing_tables")


 class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
@@ -15,7 +15,7 @@ from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routing_tables")


 class ShieldsRoutingTable(CommonRoutingTableImpl, Shields):
@@ -14,7 +14,7 @@ from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routing_tables")


 def parse_toolgroup_from_toolgroup_name_pair(toolgroup_name_with_maybe_tool_name: str) -> str | None:
@@ -30,7 +30,7 @@ from llama_stack.log import get_logger

 from .common import CommonRoutingTableImpl, lookup_model

-logger = get_logger(name=__name__, category="core")
+logger = get_logger(name=__name__, category="core::routing_tables")


 class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
@@ -15,7 +15,7 @@ from llama_stack.core.server.auth_providers import create_auth_provider
 from llama_stack.core.server.routes import find_matching_route, initialize_route_impls
 from llama_stack.log import get_logger

-logger = get_logger(name=__name__, category="auth")
+logger = get_logger(name=__name__, category="core::auth")


 class AuthenticationMiddleware:
@@ -23,7 +23,7 @@ from llama_stack.core.datatypes import (
 )
 from llama_stack.log import get_logger

-logger = get_logger(name=__name__, category="auth")
+logger = get_logger(name=__name__, category="core::auth")


 class AuthResponse(BaseModel):
@@ -15,7 +15,7 @@ from llama_stack.providers.utils.kvstore.api import KVStore
 from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig
 from llama_stack.providers.utils.kvstore.kvstore import kvstore_impl

-logger = get_logger(name=__name__, category="quota")
+logger = get_logger(name=__name__, category="core::server")


 class QuotaMiddleware:
@@ -9,7 +9,7 @@ import asyncio
 import functools
 import inspect
 import json
-import logging
+import logging  # allow-direct-logging
 import os
 import ssl
 import sys
@@ -28,10 +28,12 @@ from aiohttp import hdrs
 from fastapi import Body, FastAPI, HTTPException, Request, Response
 from fastapi import Path as FastapiPath
 from fastapi.exceptions import RequestValidationError
+from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse
 from openai import BadRequestError
 from pydantic import BaseModel, ValidationError

+from llama_stack.apis.common.errors import ConflictError, ResourceNotFoundError
 from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.cli.utils import add_config_distro_args, get_config_from_args
 from llama_stack.core.access_control.access_control import AccessDeniedError
@@ -39,6 +41,7 @@ from llama_stack.core.datatypes import (
     AuthenticationRequiredError,
     LoggingConfig,
     StackRunConfig,
+    process_cors_config,
 )
 from llama_stack.core.distribution import builtin_automatically_routed_apis
 from llama_stack.core.external import ExternalApiSpec, load_external_apis
@@ -81,7 +84,7 @@ from .quota import QuotaMiddleware

 REPO_ROOT = Path(__file__).parent.parent.parent.parent

-logger = get_logger(name=__name__, category="server")
+logger = get_logger(name=__name__, category="core::server")


 def warn_with_traceback(message, category, filename, lineno, file=None, line=None):
@@ -128,6 +131,10 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationError:
                 ]
             },
         )
+    elif isinstance(exc, ConflictError):
+        return HTTPException(status_code=409, detail=str(exc))
+    elif isinstance(exc, ResourceNotFoundError):
+        return HTTPException(status_code=404, detail=str(exc))
     elif isinstance(exc, ValueError):
         return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=f"Invalid value: {str(exc)}")
     elif isinstance(exc, BadRequestError):
@@ -408,7 +415,7 @@ def main(args: argparse.Namespace | None = None):
         config_contents = yaml.safe_load(fp)
         if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")):
             logger_config = LoggingConfig(**cfg)
-    logger = get_logger(name=__name__, category="server", config=logger_config)
+    logger = get_logger(name=__name__, category="core::server", config=logger_config)
     if args.env:
         for env_pair in args.env:
             try:
@@ -478,6 +485,12 @@ def main(args: argparse.Namespace | None = None):
             window_seconds=window_seconds,
         )

+    if config.server.cors:
+        logger.info("Enabling CORS")
+        cors_config = process_cors_config(config.server.cors)
+        if cors_config:
+            app.add_middleware(CORSMiddleware, **cors_config.model_dump())
+
     if Api.telemetry in impls:
         setup_logger(impls[Api.telemetry])
     else:
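Taken together with the datatypes change earlier in this diff, enabling `cors: true` in a run config hands the dev-mode CORSConfig to FastAPI's CORSMiddleware as keyword arguments. A small standalone sketch of what that wiring amounts to, using a throwaway FastAPI app; the keyword names come straight from CORSConfig.model_dump():

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from llama_stack.core.datatypes import process_cors_config

app = FastAPI()

cors = process_cors_config(True)  # same call server.main() now makes
if cors:
    # keys: allow_origins, allow_origin_regex, allow_methods, allow_headers,
    # allow_credentials, expose_headers, max_age
    app.add_middleware(CORSMiddleware, **cors.model_dump())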
@@ -16,7 +16,7 @@ from llama_stack.log import get_logger
 from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl
 from llama_stack.providers.utils.kvstore.config import KVStoreConfig, SqliteKVStoreConfig

-logger = get_logger(__name__, category="core")
+logger = get_logger(__name__, category="core::registry")


 class DistributionRegistry(Protocol):
@@ -10,7 +10,7 @@ from pathlib import Path
 from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR
 from llama_stack.log import get_logger

-logger = get_logger(name=__name__, category="config_resolution")
+logger = get_logger(name=__name__, category="core")


 DISTRO_DIR = Path(__file__).parent.parent.parent.parent / "llama_stack" / "distributions"
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-import logging
+import importlib
 import os
 import signal
 import subprocess
@@ -12,9 +12,9 @@ import sys

 from termcolor import cprint

-log = logging.getLogger(__name__)
+from llama_stack.log import get_logger

-import importlib
+log = get_logger(name=__name__, category="core")


 def formulate_run_args(image_type: str, image_name: str) -> list:
@@ -6,7 +6,6 @@

 import inspect
 import json
-import logging
 from enum import Enum
 from typing import Annotated, Any, Literal, Union, get_args, get_origin

@@ -14,7 +13,9 @@ from pydantic import BaseModel
 from pydantic.fields import FieldInfo
 from pydantic_core import PydanticUndefinedType

-log = logging.getLogger(__name__)
+from llama_stack.log import get_logger
+
+log = get_logger(name=__name__, category="core")


 def is_list_of_primitives(field_type):
@@ -28,12 +28,13 @@ distribution_spec:
     - provider_type: inline::localfs
     safety:
     - provider_type: inline::llama-guard
+    - provider_type: inline::code-scanner
     agents:
     - provider_type: inline::meta-reference
     telemetry:
     - provider_type: inline::meta-reference
     post_training:
-    - provider_type: inline::huggingface
+    - provider_type: inline::huggingface-cpu
     eval:
     - provider_type: inline::meta-reference
     datasetio:
@@ -48,6 +49,8 @@ distribution_spec:
     - provider_type: remote::tavily-search
     - provider_type: inline::rag-runtime
     - provider_type: remote::model-context-protocol
+    batches:
+    - provider_type: inline::reference
   image_type: venv
   additional_pip_packages:
   - aiosqlite
@@ -2,6 +2,7 @@ version: 2
 image_name: ci-tests
 apis:
 - agents
+- batches
 - datasetio
 - eval
 - files
@@ -134,6 +135,8 @@ providers:
     provider_type: inline::llama-guard
     config:
       excluded_categories: []
+  - provider_id: code-scanner
+    provider_type: inline::code-scanner
   agents:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -153,8 +156,8 @@ providers:
       sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/trace_store.db
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
   post_training:
-  - provider_id: huggingface
-    provider_type: inline::huggingface
+  - provider_id: huggingface-cpu
+    provider_type: inline::huggingface-cpu
     config:
       checkpoint_format: huggingface
       distributed_backend: null
@@ -204,6 +207,13 @@ providers:
     provider_type: inline::rag-runtime
   - provider_id: model-context-protocol
     provider_type: remote::model-context-protocol
+  batches:
+  - provider_id: reference
+    provider_type: inline::reference
+    config:
+      kvstore:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/batches.db
 metadata_store:
   type: sqlite
   db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/registry.db
@@ -215,6 +225,9 @@ shields:
 - shield_id: llama-guard
   provider_id: ${env.SAFETY_MODEL:+llama-guard}
   provider_shield_id: ${env.SAFETY_MODEL:=}
+- shield_id: code-scanner
+  provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
+  provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
 vector_dbs: []
 datasets: []
 scoring_fns: []
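The shield and provider entries above lean on the stack's env-substitution forms: `${env.X:=default}` resolves to the variable or a default, while `${env.X:+value}` yields the value only when the variable is set. A rough Python sketch of that behavior as read from these configs; it is an illustration, not the stack's actual resolver, and the real implementation may treat unset versus empty variables differently.

import os
import re

_PAT = re.compile(r"\$\{env\.(\w+):(=|\+)([^}]*)\}")


def substitute(text: str) -> str:
    # ${env.X:=default} -> value of X, or the default when X is missing/empty
    # ${env.X:+value}   -> "value" when X is set, otherwise an empty string
    def repl(m: re.Match) -> str:
        var, op, arg = m.group(1), m.group(2), m.group(3)
        val = os.environ.get(var, "")
        if op == "=":
            return val if val else arg
        return arg if val else ""

    return _PAT.sub(repl, text)


# with CODE_SCANNER_MODEL unset, the shield's provider_id resolves to ""
print(substitute("${env.CODE_SCANNER_MODEL:+code-scanner}"))
print(substitute("${env.OLLAMA_URL:=http://localhost:11434}"))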
llama_stack/distributions/starter-gpu/__init__.py (new file)
@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .starter_gpu import get_distribution_template  # noqa: F401

llama_stack/distributions/starter-gpu/build.yaml (new file)
@@ -0,0 +1,59 @@
version: 2
distribution_spec:
  description: Quick start template for running Llama Stack with several popular providers.
    This distribution is intended for GPU-enabled environments.
  providers:
    inference:
    - provider_type: remote::cerebras
    - provider_type: remote::ollama
    - provider_type: remote::vllm
    - provider_type: remote::tgi
    - provider_type: remote::fireworks
    - provider_type: remote::together
    - provider_type: remote::bedrock
    - provider_type: remote::nvidia
    - provider_type: remote::openai
    - provider_type: remote::anthropic
    - provider_type: remote::gemini
    - provider_type: remote::vertexai
    - provider_type: remote::groq
    - provider_type: remote::sambanova
    - provider_type: inline::sentence-transformers
    vector_io:
    - provider_type: inline::faiss
    - provider_type: inline::sqlite-vec
    - provider_type: inline::milvus
    - provider_type: remote::chromadb
    - provider_type: remote::pgvector
    files:
    - provider_type: inline::localfs
    safety:
    - provider_type: inline::llama-guard
    - provider_type: inline::code-scanner
    agents:
    - provider_type: inline::meta-reference
    telemetry:
    - provider_type: inline::meta-reference
    post_training:
    - provider_type: inline::torchtune-gpu
    eval:
    - provider_type: inline::meta-reference
    datasetio:
    - provider_type: remote::huggingface
    - provider_type: inline::localfs
    scoring:
    - provider_type: inline::basic
    - provider_type: inline::llm-as-judge
    - provider_type: inline::braintrust
    tool_runtime:
    - provider_type: remote::brave-search
    - provider_type: remote::tavily-search
    - provider_type: inline::rag-runtime
    - provider_type: remote::model-context-protocol
    batches:
    - provider_type: inline::reference
  image_type: venv
  additional_pip_packages:
  - aiosqlite
  - asyncpg
  - sqlalchemy[asyncio]

llama_stack/distributions/starter-gpu/run.yaml (new file)
@@ -0,0 +1,238 @@
version: 2
image_name: starter-gpu
apis:
- agents
- batches
- datasetio
- eval
- files
- inference
- post_training
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: ${env.CEREBRAS_API_KEY:+cerebras}
    provider_type: remote::cerebras
    config:
      base_url: https://api.cerebras.ai
      api_key: ${env.CEREBRAS_API_KEY:=}
  - provider_id: ${env.OLLAMA_URL:+ollama}
    provider_type: remote::ollama
    config:
      url: ${env.OLLAMA_URL:=http://localhost:11434}
  - provider_id: ${env.VLLM_URL:+vllm}
    provider_type: remote::vllm
    config:
      url: ${env.VLLM_URL:=}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
  - provider_id: ${env.TGI_URL:+tgi}
    provider_type: remote::tgi
    config:
      url: ${env.TGI_URL:=}
  - provider_id: fireworks
    provider_type: remote::fireworks
    config:
      url: https://api.fireworks.ai/inference/v1
      api_key: ${env.FIREWORKS_API_KEY:=}
  - provider_id: together
    provider_type: remote::together
    config:
      url: https://api.together.xyz/v1
      api_key: ${env.TOGETHER_API_KEY:=}
  - provider_id: bedrock
    provider_type: remote::bedrock
  - provider_id: ${env.NVIDIA_API_KEY:+nvidia}
    provider_type: remote::nvidia
    config:
      url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}
      api_key: ${env.NVIDIA_API_KEY:=}
      append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
  - provider_id: openai
    provider_type: remote::openai
    config:
      api_key: ${env.OPENAI_API_KEY:=}
      base_url: ${env.OPENAI_BASE_URL:=https://api.openai.com/v1}
  - provider_id: anthropic
    provider_type: remote::anthropic
    config:
      api_key: ${env.ANTHROPIC_API_KEY:=}
  - provider_id: gemini
    provider_type: remote::gemini
    config:
      api_key: ${env.GEMINI_API_KEY:=}
  - provider_id: ${env.VERTEX_AI_PROJECT:+vertexai}
    provider_type: remote::vertexai
    config:
      project: ${env.VERTEX_AI_PROJECT:=}
      location: ${env.VERTEX_AI_LOCATION:=us-central1}
  - provider_id: groq
    provider_type: remote::groq
    config:
      url: https://api.groq.com
      api_key: ${env.GROQ_API_KEY:=}
  - provider_id: sambanova
    provider_type: remote::sambanova
    config:
      url: https://api.sambanova.ai/v1
      api_key: ${env.SAMBANOVA_API_KEY:=}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db
  - provider_id: sqlite-vec
    provider_type: inline::sqlite-vec
    config:
      db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db
  - provider_id: ${env.MILVUS_URL:+milvus}
    provider_type: inline::milvus
    config:
      db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db
  - provider_id: ${env.CHROMADB_URL:+chromadb}
    provider_type: remote::chromadb
    config:
      url: ${env.CHROMADB_URL:=}
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db
  - provider_id: ${env.PGVECTOR_DB:+pgvector}
    provider_type: remote::pgvector
    config:
      host: ${env.PGVECTOR_HOST:=localhost}
      port: ${env.PGVECTOR_PORT:=5432}
      db: ${env.PGVECTOR_DB:=}
      user: ${env.PGVECTOR_USER:=}
      password: ${env.PGVECTOR_PASSWORD:=}
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db
  files:
  - provider_id: meta-reference-files
    provider_type: inline::localfs
    config:
      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
      metadata_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  - provider_id: code-scanner
    provider_type: inline::code-scanner
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/agents_store.db
      responses_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/responses_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
      sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
      sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/trace_store.db
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
  post_training:
  - provider_id: torchtune-gpu
    provider_type: inline::torchtune-gpu
    config:
      checkpoint_format: meta
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/meta_reference_eval.db
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/huggingface_datasetio.db
  - provider_id: localfs
    provider_type: inline::localfs
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/localfs_datasetio.db
  scoring:
  - provider_id: basic
    provider_type: inline::basic
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:=}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:=}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
  batches:
  - provider_id: reference
    provider_type: inline::reference
    config:
      kvstore:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/batches.db
metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/registry.db
inference_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/inference_store.db
models: []
shields:
- shield_id: llama-guard
  provider_id: ${env.SAFETY_MODEL:+llama-guard}
  provider_shield_id: ${env.SAFETY_MODEL:=}
- shield_id: code-scanner
  provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
  provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
  provider_id: tavily-search
- toolgroup_id: builtin::rag
  provider_id: rag-runtime
server:
  port: 8321
Some files were not shown because too many files have changed in this diff.