Mirror of https://github.com/meta-llama/llama-stack.git
Synced 2025-08-21 17:33:12 +00:00

Compare commits: main ... v0.2.17rc1 (1 commit, e136739a7f)
386 changed files with 22764 additions and 24955 deletions

In the per-file hunks below, "-" lines show the file as it appears on main (the comparison base) and "+" lines show it as it appears on v0.2.17rc1.
.github/TRIAGERS.md (vendored): 2 changed lines

@@ -1,2 +1,2 @@
 # This file documents Triage members in the Llama Stack community
-@franciscojavierarceo
+@bbrowning @franciscojavierarceo @leseb
.github/actions/run-and-record-tests/action.yml (vendored): 22 changed lines

@@ -2,13 +2,9 @@ name: 'Run and Record Tests'
 description: 'Run integration tests and handle recording/artifact upload'

 inputs:
-  test-subdirs:
-    description: 'Comma-separated list of test subdirectories to run'
+  test-types:
+    description: 'JSON array of test types to run'
     required: true
-  test-pattern:
-    description: 'Regex pattern to pass to pytest -k'
-    required: false
-    default: ''
   stack-config:
     description: 'Stack configuration to use'
     required: true

@@ -36,14 +32,12 @@ runs:
   - name: Run Integration Tests
     shell: bash
     run: |
-      uv run --no-sync ./scripts/integration-tests.sh \
+      ./scripts/integration-tests.sh \
         --stack-config '${{ inputs.stack-config }}' \
         --provider '${{ inputs.provider }}' \
-        --test-subdirs '${{ inputs.test-subdirs }}' \
-        --test-pattern '${{ inputs.test-pattern }}' \
+        --test-types '${{ inputs.test-types }}' \
         --inference-mode '${{ inputs.inference-mode }}' \
-        ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \
-        | tee pytest-${{ inputs.inference-mode }}.log
+        ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }}

   - name: Commit and push recordings

@@ -63,10 +57,10 @@ runs:
         git commit -m "Recordings update from CI"
       fi

-      git fetch origin ${{ github.ref_name }}
-      git rebase origin/${{ github.ref_name }}
+      git fetch origin ${{ github.event.pull_request.head.ref }}
+      git rebase origin/${{ github.event.pull_request.head.ref }}
       echo "Rebased successfully"
-      git push origin HEAD:${{ github.ref_name }}
+      git push origin HEAD:${{ github.event.pull_request.head.ref }}
       echo "Pushed successfully"
     else
       echo "No recording changes"
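On the v0.2.17rc1 side this action passes a JSON array via --test-types rather than main's comma-separated --test-subdirs list. The internals of scripts/integration-tests.sh are not part of this diff; the following is only a minimal sketch, assuming a jq-style expansion, of how a JSON-array argument like that can be turned into per-directory pytest targets:

```bash
#!/usr/bin/env bash
# Hypothetical illustration only: expand a JSON array like the workflow's
# test-types input into individual test directories. The real
# scripts/integration-tests.sh may handle this differently.
set -euo pipefail

TEST_TYPES='["agents","inference","vector_io"]'   # example value of --test-types

# jq -r '.[]' prints one array element per line
for t in $(echo "$TEST_TYPES" | jq -r '.[]'); do
  echo "would run: pytest -sv tests/integration/${t}"
done
```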
.github/actions/setup-runner/action.yml (vendored): 11 changed lines

@@ -16,21 +16,19 @@ runs:
     uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
     with:
       python-version: ${{ inputs.python-version }}
+      activate-environment: true
       version: 0.7.6

   - name: Install dependencies
     shell: bash
     run: |
-      echo "Updating project dependencies via uv sync"
       uv sync --all-groups
-      echo "Installing ad-hoc dependencies"
-      uv pip install faiss-cpu
+      uv pip install ollama faiss-cpu

       # Install llama-stack-client-python based on the client-version input
       if [ "${{ inputs.client-version }}" = "latest" ]; then
         echo "Installing latest llama-stack-client-python from main branch"
-        uv pip install git+https://github.com/llamastack/llama-stack-client-python.git@main
+        uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
       elif [ "${{ inputs.client-version }}" = "published" ]; then
         echo "Installing published llama-stack-client-python from PyPI"
         uv pip install llama-stack-client

@@ -39,5 +37,4 @@ runs:
         exit 1
       fi

-      echo "Installed llama packages"
-      uv pip list | grep llama
+      uv pip install -e .
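Both sides of these hunks drive the same basic flow: sync the project dependencies with uv, add a couple of ad-hoc packages, then install llama-stack-client either from git or from PyPI. A rough local equivalent, a sketch only (the exact package set and client source differ between the two sides as shown above), would be:

```bash
# Approximate local reproduction of the CI runner setup, published-client variant.
# Adjust the ad-hoc packages to whichever side of the diff you are matching.
uv sync --all-groups
uv pip install faiss-cpu   # v0.2.17rc1 additionally installs ollama at this step

# pick one, mirroring the action's client-version input:
uv pip install llama-stack-client   # "published": from PyPI
# uv pip install git+https://github.com/llamastack/llama-stack-client-python.git@main   # "latest" (URL used on main)
```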
(file name not preserved in this capture): hunk from a composite action that builds Llama Stack

@@ -42,22 +42,7 @@ runs:
   - name: Build Llama Stack
     shell: bash
     run: |
-      # Install llama-stack-client-python based on the client-version input
-      if [ "${{ inputs.client-version }}" = "latest" ]; then
-        echo "Installing latest llama-stack-client-python from main branch"
-        export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@main
-      elif [ "${{ inputs.client-version }}" = "published" ]; then
-        echo "Installing published llama-stack-client-python from PyPI"
-        unset LLAMA_STACK_CLIENT_DIR
-      else
-        echo "Invalid client-version: ${{ inputs.client-version }}"
-        exit 1
-      fi
-
-      echo "Building Llama Stack"
-
-      LLAMA_STACK_DIR=. \
-      uv run --no-sync llama stack build --template ci-tests --image-type venv
+      uv run llama stack build --template ci-tests --image-type venv

   - name: Configure git for commits
     shell: bash
.github/dependabot.yml (vendored): 12 changed lines

@@ -9,7 +9,6 @@ updates:
       day: "saturday"
     commit-message:
       prefix: chore(github-deps)

   - package-ecosystem: "uv"
     directory: "/"
     schedule:

@@ -20,14 +19,3 @@ updates:
       - python
     commit-message:
       prefix: chore(python-deps)

-  - package-ecosystem: npm
-    directory: "/llama_stack/ui"
-    schedule:
-      interval: "weekly"
-      day: "saturday"
-    labels:
-      - type/dependencies
-      - javascript
-    commit-message:
-      prefix: chore(ui-deps)
.github/workflows/README.md (vendored): 1 changed line

@@ -18,6 +18,5 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a table
 | Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action |
 | Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module |
 | Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms |
-| UI Tests | [ui-unit-tests.yml](ui-unit-tests.yml) | Run the UI test suite |
 | Unit Tests | [unit-tests.yml](unit-tests.yml) | Run the unit test suite |
 | Update ReadTheDocs | [update-readthedocs.yml](update-readthedocs.yml) | Update the Llama Stack ReadTheDocs site |
.github/workflows/changelog.yml (vendored): 2 changed lines

@@ -17,7 +17,7 @@ jobs:
     pull-requests: write  # for peter-evans/create-pull-request to create a PR
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: main
          fetch-depth: 0
.github/workflows/install-script-ci.yml (vendored): 7 changed lines

@@ -16,22 +16,21 @@ jobs:
   lint:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # 5.0.0
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
      - name: Run ShellCheck on install.sh
        run: shellcheck scripts/install.sh
  smoke-test-on-dev:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

      - name: Build a single provider
        run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync \
-            llama stack build --template starter --image-type container --image-name test
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template starter --image-type container --image-name test

      - name: Run installer end-to-end
        run: |
.github/workflows/integration-auth-tests.yml (vendored): 3 changed lines

@@ -10,7 +10,6 @@ on:
     paths:
       - 'distributions/**'
       - 'llama_stack/**'
-      - '!llama_stack/ui/**'
       - 'tests/integration/**'
       - 'uv.lock'
       - 'pyproject.toml'

@@ -31,7 +30,7 @@ jobs:
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
(file name not preserved in this capture): hunk applying the same checkout pin change in another job

@@ -44,7 +44,7 @@
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
.github/workflows/integration-tests.yml (vendored): 33 changed lines

@@ -10,7 +10,6 @@ on:
     types: [opened, synchronize, reopened]
     paths:
       - 'llama_stack/**'
-      - '!llama_stack/ui/**'
       - 'tests/**'
       - 'uv.lock'
       - 'pyproject.toml'

@@ -32,14 +31,6 @@ on:
       description: 'Test against a specific provider'
       type: string
       default: 'ollama'
-    test-subdirs:
-      description: 'Comma-separated list of test subdirectories to run'
-      type: string
-      default: ''
-    test-pattern:
-      description: 'Regex pattern to pass to pytest -k'
-      type: string
-      default: ''

 concurrency:
   # Skip concurrency for pushes to main - each commit should be tested independently

@@ -47,8 +38,27 @@
   cancel-in-progress: true

 jobs:
+  discover-tests:
+    runs-on: ubuntu-latest
+    outputs:
+      test-types: ${{ steps.generate-test-types.outputs.test-types }}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Generate test types
+        id: generate-test-types
+        run: |
+          # Get test directories dynamically, excluding non-test directories
+          # NOTE: we are excluding post_training since the tests take too long
+          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
+            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
+            sort | jq -R -s -c 'split("\n")[:-1]')
+          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
+
   run-replay-mode-tests:
+    needs: discover-tests
     runs-on: ubuntu-latest
     name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}

@@ -65,7 +75,7 @@ jobs:
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Setup test environment
        uses: ./.github/actions/setup-test-environment

@@ -79,8 +89,7 @@ jobs:
      - name: Run tests
        uses: ./.github/actions/run-and-record-tests
        with:
-          test-subdirs: ${{ inputs.test-subdirs }}
-          test-pattern: ${{ inputs.test-pattern }}
+          test-types: ${{ needs.discover-tests.outputs.test-types }}
          stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
          provider: ${{ matrix.provider }}
          inference-mode: 'replay'
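The discover-tests job on the v0.2.17rc1 side builds the test matrix by scanning tests/integration and emitting a JSON array. Run locally from a checkout, the same pipeline looks roughly like this; the directory names in the sample output are illustrative, not taken from this diff:

```bash
# Minimal sketch of the discovery pipeline used in the workflow above.
cd llama-stack   # assumes a local checkout of the repository

TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
  grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
  sort | jq -R -s -c 'split("\n")[:-1]')

echo "$TEST_TYPES"
# e.g. ["agents","inference","safety","vector_io"]   (illustrative output)
```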
.github/workflows/integration-vector-io-tests.yml: hunks for this file (its header line did not survive the capture; the path is taken from the workflow's own trigger list)

@@ -9,17 +9,14 @@ on:
     branches: [ main ]
     paths:
       - 'llama_stack/**'
-      - '!llama_stack/ui/**'
       - 'tests/integration/vector_io/**'
       - 'uv.lock'
       - 'pyproject.toml'
       - 'requirements.txt'
       - '.github/workflows/integration-vector-io-tests.yml' # This workflow
-  schedule:
-    - cron: '0 0 * * *'  # (test on python 3.13) Daily at 12 AM UTC

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true

 jobs:

@@ -28,12 +25,12 @@ jobs:
     strategy:
       matrix:
         vector-io-provider: ["inline::faiss", "inline::sqlite-vec", "inline::milvus", "remote::chromadb", "remote::pgvector", "remote::weaviate", "remote::qdrant"]
-        python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
+        python-version: ["3.12", "3.13"]
       fail-fast: false  # we want to run all tests regardless of failure

     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

@@ -144,7 +141,7 @@ jobs:
     - name: Build Llama Stack
       run: |
-        uv run --no-sync llama stack build --template ci-tests --image-type venv
+        uv run llama stack build --template ci-tests --image-type venv

     - name: Check Storage and Memory Available Before Tests
       if: ${{ always() }}

@@ -167,10 +164,9 @@ jobs:
         ENABLE_WEAVIATE: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'true' || '' }}
         WEAVIATE_CLUSTER_URL: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'localhost:8080' || '' }}
       run: |
-        uv run --no-sync \
-          pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
+        uv run pytest -sv --stack-config="inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
          tests/integration/vector_io \
-          --embedding-model inline::sentence-transformers/all-MiniLM-L6-v2
+          --embedding-model sentence-transformers/all-MiniLM-L6-v2

     - name: Check Storage and Memory Available After Tests
       if: ${{ always() }}
.github/workflows/pre-commit.yml (vendored): 17 changed lines

@@ -20,7 +20,7 @@ jobs:
     steps:
       - name: Checkout code
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          # For dependabot PRs, we need to checkout with a token that can push changes
          token: ${{ github.actor == 'dependabot[bot]' && secrets.GITHUB_TOKEN || github.token }}

@@ -36,21 +36,6 @@ jobs:
            **/requirements*.txt
            .pre-commit-config.yaml

-      # npm ci may fail -
-      # npm error `npm ci` can only install packages when your package.json and package-lock.json or npm-shrinkwrap.json are in sync. Please update your lock file with `npm install` before continuing.
-      # npm error Invalid: lock file's llama-stack-client@0.2.17 does not satisfy llama-stack-client@0.2.18
-
-      # - name: Set up Node.js
-      #   uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
-      #   with:
-      #     node-version: '20'
-      #     cache: 'npm'
-      #     cache-dependency-path: 'llama_stack/ui/'
-
-      # - name: Install npm dependencies
-      #   run: npm ci
-      #   working-directory: llama_stack/ui
-
      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
        continue-on-error: true
        env:
.github/workflows/providers-build.yml (vendored): 10 changed lines

@@ -36,7 +36,7 @@ jobs:
       distros: ${{ steps.set-matrix.outputs.distros }}
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Generate Distribution List
        id: set-matrix

The remaining hunks (@@ -55,7 +55,7 @@, @@ -79,7 +79,7 @@, @@ -92,7 +92,7 @@ and @@ -117,7 +117,7 @@) apply the same checkout pin change in the other jobs; each is followed by the unchanged "Install dependencies" step that uses ./.github/actions/setup-runner.
.github/workflows/python-build-test.yml (vendored): 6 changed lines

@@ -9,8 +9,6 @@ on:
   pull_request:
     branches:
       - main
-    paths-ignore:
-      - 'llama_stack/ui/**'

 jobs:
   build:

@@ -21,10 +19,10 @@ jobs:
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install uv
-        uses: astral-sh/setup-uv@d9e0f98d3fc6adb07d1e3d37f3043649ddad06a1 # v6.5.0
+        uses: astral-sh/setup-uv@e92bafb6253dcd438e0484186d7669ea7a8ca1cc # v6.4.3
        with:
          python-version: ${{ matrix.python-version }}
          activate-environment: true
.github/workflows/record-integration-tests.yml (vendored): 103 changed lines

@@ -1,53 +1,93 @@
-# This workflow should be run manually when needing to re-record tests. This happens when you have
-# - added a new test
-# - or changed an existing test such that a new inference call is made
-# You should make a PR and then run this workflow on that PR branch. The workflow will re-record the
-# tests and commit the recordings to the PR branch.
 name: Integration Tests (Record)

 run-name: Run the integration test suite from tests/integration

 on:
+  pull_request:
+    branches: [ main ]
+    types: [opened, synchronize, labeled]
+    paths:
+      - 'llama_stack/**'
+      - 'tests/**'
+      - 'uv.lock'
+      - 'pyproject.toml'
+      - '.github/workflows/record-integration-tests.yml' # This workflow
+      - '.github/actions/setup-ollama/action.yml'
+      - '.github/actions/setup-test-environment/action.yml'
+      - '.github/actions/run-and-record-tests/action.yml'
   workflow_dispatch:
     inputs:
-      test-subdirs:
-        description: 'Comma-separated list of test subdirectories to run'
-        type: string
-        default: ''
       test-provider:
         description: 'Test against a specific provider'
         type: string
         default: 'ollama'
-      run-vision-tests:
-        description: 'Whether to run vision tests'
-        type: boolean
-        default: false
-      test-pattern:
-        description: 'Regex pattern to pass to pytest -k'
-        type: string
-        default: ''
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true

 jobs:
+  discover-tests:
+    if: contains(github.event.pull_request.labels.*.name, 're-record-tests') ||
+      contains(github.event.pull_request.labels.*.name, 're-record-vision-tests')
+    runs-on: ubuntu-latest
+    outputs:
+      test-types: ${{ steps.generate-test-types.outputs.test-types }}
+      matrix-modes: ${{ steps.generate-test-types.outputs.matrix-modes }}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Generate test types
+        id: generate-test-types
+        run: |
+          # Get test directories dynamically, excluding non-test directories
+          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
+            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
+            sort | jq -R -s -c 'split("\n")[:-1]')
+          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
+
+          labels=$(gh pr view ${{ github.event.pull_request.number }} --json labels --jq '.labels[].name')
+          echo "labels=$labels"
+
+          modes_array=()
+          if [[ $labels == *"re-record-vision-tests"* ]]; then
+            modes_array+=("vision")
+          fi
+          if [[ $labels == *"re-record-tests"* ]]; then
+            modes_array+=("non-vision")
+          fi
+
+          # Convert to JSON array
+          if [ ${#modes_array[@]} -eq 0 ]; then
+            matrix_modes="[]"
+          else
+            matrix_modes=$(printf '%s\n' "${modes_array[@]}" | jq -R -s -c 'split("\n")[:-1]')
+          fi
+          echo "matrix_modes=$matrix_modes"
+          echo "matrix-modes=$matrix_modes" >> $GITHUB_OUTPUT
+        env:
+          GH_TOKEN: ${{ github.token }}
+
   record-tests:
+    needs: discover-tests
     runs-on: ubuntu-latest

     permissions:
       contents: write

-    steps:
-      - name: Echo workflow inputs
-        run: |
-          echo "::group::Workflow Inputs"
-          echo "test-subdirs: ${{ inputs.test-subdirs }}"
-          echo "test-provider: ${{ inputs.test-provider }}"
-          echo "run-vision-tests: ${{ inputs.run-vision-tests }}"
-          echo "test-pattern: ${{ inputs.test-pattern }}"
-          echo "branch: ${{ github.ref_name }}"
-          echo "::endgroup::"
-
+    strategy:
+      fail-fast: false
+      matrix:
+        mode: ${{ fromJSON(needs.discover-tests.outputs.matrix-modes) }}
+
+    steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
+          ref: ${{ github.event.pull_request.head.ref }}
          fetch-depth: 0

      - name: Setup test environment

@@ -56,15 +96,14 @@ jobs:
         python-version: "3.12"  # Use single Python version for recording
         client-version: "latest"
         provider: ${{ inputs.test-provider || 'ollama' }}
-        run-vision-tests: ${{ inputs.run-vision-tests }}
+        run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
         inference-mode: 'record'

     - name: Run and record tests
       uses: ./.github/actions/run-and-record-tests
       with:
-        test-pattern: ${{ inputs.test-pattern }}
-        test-subdirs: ${{ inputs.test-subdirs }}
+        test-types: ${{ needs.discover-tests.outputs.test-types }}
         stack-config: 'server:ci-tests' # recording must be done with server since more tests are run
         provider: ${{ inputs.test-provider || 'ollama' }}
         inference-mode: 'record'
-        run-vision-tests: ${{ inputs.run-vision-tests }}
+        run-vision-tests: ${{ matrix.mode == 'vision' && 'true' || 'false' }}
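On the v0.2.17rc1 side, the discover-tests job turns PR labels into the recording matrix. A quick illustration of the shell logic above (the label names come from the workflow itself; the labels variable value is an example):

```bash
# Sketch of the label-to-matrix conversion from the workflow above.
labels="re-record-tests re-record-vision-tests"   # example of `gh pr view ... --jq '.labels[].name'` output

modes_array=()
[[ $labels == *"re-record-vision-tests"* ]] && modes_array+=("vision")
[[ $labels == *"re-record-tests"* ]] && modes_array+=("non-vision")

if [ ${#modes_array[@]} -eq 0 ]; then
  matrix_modes="[]"
else
  # one element per line, then pack into a compact JSON array
  matrix_modes=$(printf '%s\n' "${modes_array[@]}" | jq -R -s -c 'split("\n")[:-1]')
fi
echo "$matrix_modes"   # -> ["vision","non-vision"]
```

That JSON array is what feeds fromJSON(needs.discover-tests.outputs.matrix-modes) in the record-tests matrix.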
.github/workflows/semantic-pr.yml (vendored): 4 changed lines

@@ -11,7 +11,7 @@ on:
   - synchronize

 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
+  group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true

 permissions:

@@ -22,6 +22,6 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check PR Title's semantic conformance
-        uses: amannn/action-semantic-pull-request@7f33ba792281b034f64e96f4c0b5496782dd3b37 # v6.1.0
+        uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
(file name not preserved in this capture): hunk applying the same checkout pin change in another external-provider test workflow

@@ -27,7 +27,7 @@ jobs:
     # container and point 'uv pip install' to the correct path...
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
.github/workflows/test-external.yml (vendored): 7 changed lines

@@ -9,7 +9,6 @@ on:
     branches: [ main ]
     paths:
       - 'llama_stack/**'
-      - '!llama_stack/ui/**'
       - 'tests/integration/**'
       - 'uv.lock'
       - 'pyproject.toml'

@@ -27,7 +26,7 @@ jobs:
     # container and point 'uv pip install' to the correct path...
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

@@ -44,11 +43,11 @@ jobs:

     - name: Print distro dependencies
       run: |
-        USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml --print-deps-only
+        USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/build.yaml --print-deps-only

     - name: Build distro from config file
       run: |
-        USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml
+        USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/build.yaml

     - name: Start Llama Stack server in background
       if: ${{ matrix.image-type }} == 'venv'
.github/workflows/ui-unit-tests.yml (vendored): 55 changed lines

@@ -1,55 +0,0 @@ (the whole workflow exists on main and is absent on v0.2.17rc1):

name: UI Tests

run-name: Run the UI test suite

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    paths:
      - 'llama_stack/ui/**'
      - '.github/workflows/ui-unit-tests.yml' # This workflow
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  ui-tests:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        node-version: [22]

    steps:
      - name: Checkout repository
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

      - name: Setup Node.js
        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
        with:
          node-version: ${{ matrix.node-version }}
          cache: 'npm'
          cache-dependency-path: 'llama_stack/ui/package-lock.json'

      - name: Install dependencies
        working-directory: llama_stack/ui
        run: npm ci

      - name: Run linting
        working-directory: llama_stack/ui
        run: npm run lint

      - name: Run format check
        working-directory: llama_stack/ui
        run: npm run format:check

      - name: Run unit tests
        working-directory: llama_stack/ui
        env:
          CI: true
        run: npm test -- --coverage --watchAll=false --passWithNoTests
.github/workflows/unit-tests.yml (vendored): 3 changed lines

@@ -9,7 +9,6 @@ on:
     branches: [ main ]
     paths:
       - 'llama_stack/**'
-      - '!llama_stack/ui/**'
       - 'tests/unit/**'
       - 'uv.lock'
       - 'pyproject.toml'

@@ -32,7 +31,7 @@ jobs:
       - "3.13"
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
.github/workflows/update-readthedocs.yml (vendored): 2 changed lines

@@ -37,7 +37,7 @@ jobs:
     TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
.pre-commit-config.yaml: hunks for this file (its header line did not survive the capture; the path follows from the hook configuration shown)

@@ -2,7 +2,6 @@ exclude: 'build/'

 default_language_version:
   python: python3.12
-  node: "22"

 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks

@@ -146,50 +145,6 @@ repos:
       pass_filenames: false
       require_serial: true
       files: ^.github/workflows/.*$
-  # ui-prettier and ui-eslint are disabled until we can avoid `npm ci`, which is slow and may fail -
-  # npm error `npm ci` can only install packages when your package.json and package-lock.json or npm-shrinkwrap.json are in sync. Please update your lock file with `npm install` before continuing.
-  # npm error Invalid: lock file's llama-stack-client@0.2.17 does not satisfy llama-stack-client@0.2.18
-  # and until we have infra for installing prettier and next via npm -
-  # Lint UI code with ESLint.....................................................Failed
-  # - hook id: ui-eslint
-  # - exit code: 127
-  # > ui@0.1.0 lint
-  # > next lint --fix --quiet
-  # sh: line 1: next: command not found
-  #
-  # - id: ui-prettier
-  #   name: Format UI code with Prettier
-  #   entry: bash -c 'cd llama_stack/ui && npm ci && npm run format'
-  #   language: system
-  #   files: ^llama_stack/ui/.*\.(ts|tsx)$
-  #   pass_filenames: false
-  #   require_serial: true
-  # - id: ui-eslint
-  #   name: Lint UI code with ESLint
-  #   entry: bash -c 'cd llama_stack/ui && npm run lint -- --fix --quiet'
-  #   language: system
-  #   files: ^llama_stack/ui/.*\.(ts|tsx)$
-  #   pass_filenames: false
-  #   require_serial: true
-
-  - id: check-log-usage
-    name: Ensure 'llama_stack.log' usage for logging
-    entry: bash
-    language: system
-    types: [python]
-    pass_filenames: true
-    args:
-      - -c
-      - |
-        matches=$(grep -EnH '^[^#]*\b(import\s+logging|from\s+logging\b)' "$@" | grep -v -e '#\s*allow-direct-logging' || true)
-        if [ -n "$matches" ]; then
-          # GitHub Actions annotation format
-          while IFS=: read -r file line_num rest; do
-            echo "::error file=$file,line=$line_num::Do not use 'import logging' or 'from logging import' in $file. Use the custom log instead: from llama_stack.log import get_logger; logger = get_logger(). If direct logging is truly needed, add: # allow-direct-logging"
-          done <<< "$matches"
-          exit 1
-        fi
-        exit 0

 ci:
   autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
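The check-log-usage hook that main adds is a plain grep over the staged Python files. A quick way to see what it flags, using a made-up file name purely for illustration:

```bash
# Illustration of the check-log-usage grep from the hook above.
# 'example_module.py' is a hypothetical file created only for this sketch.
cat > example_module.py <<'EOF'
import logging  # direct logging, would be flagged
from llama_stack.log import get_logger  # custom logger, passes
EOF

grep -EnH '^[^#]*\b(import\s+logging|from\s+logging\b)' example_module.py \
  | grep -v -e '#\s*allow-direct-logging' || true
# prints: example_module.py:1:import logging  # direct logging, would be flagged
```

Appending the marker comment "# allow-direct-logging" to a line is what the second grep looks for when deciding to suppress the error.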
CONTRIBUTING.md: 170 changed lines. Most of this diff reorders the same sections and swaps admonition syntax (MyST ```{note} / ```{tip} / ```{caution} blocks on main, GitHub-style > [!NOTE] / > [!TIP] / > [!CAUTION] quotes on v0.2.17rc1); the text itself is largely shared.

@@ -1,82 +1,13 @@
-# Contributing to Llama Stack
+# Contributing to Llama-Stack
 We want to make contributing to this project as easy and transparent as
 possible.

On main, the following sections appear here, before "## Discussions -> Issues -> Pull Requests"; on v0.2.17rc1 the same material appears further down (shown in full in the second hunk):
- "## Set up your development environment": uv installation, the `cd llama-stack; uv sync --group dev; uv pip install -e .; source .venv/bin/activate` block, a ```{note} admonition about `--python <version>` and `requires-python`, the `.env` dotenv example, and the `uv run --env-file .env -- pytest ...` invocation.
- "### Pre-commit Hooks": `uv run pre-commit install`, `uv run pre-commit run --all-files`, and a ```{caution} admonition about making sure the hooks pass before pushing.
- "### Issues" and '### Contributor License Agreement ("CLA")': the GitHub-issues / bounty-program text and the CLA text (<https://code.facebook.com/cla>).

 ## Discussions -> Issues -> Pull Requests

 We actively welcome your pull requests. However, please read the following. This is heavily inspired by [Ghostty](https://github.com/ghostty-org/ghostty/blob/main/CONTRIBUTING.md).

 If in doubt, please open a [discussion](https://github.com/meta-llama/llama-stack/discussions); we can always convert that to an issue later.

 **I'd like to contribute!**

 If you are new to the project, start by looking at the issues tagged with "good first issue". If you're interested

@@ -120,15 +51,93 @@ Please avoid picking up too many issues at once. This helps you stay focused and
 Please keep pull requests (PRs) small and focused. If you have a large set of changes, consider splitting them into logically grouped, smaller PRs to facilitate review and testing.

-```{tip}
-As a general guideline:
-- Experienced contributors should try to keep no more than 5 open PRs at a time.
-- New contributors are encouraged to have only one open PR at a time until they’re familiar with the codebase and process.
-```
+> [!TIP]
+> As a general guideline:
+> - Experienced contributors should try to keep no more than 5 open PRs at a time.
+> - New contributors are encouraged to have only one open PR at a time until they’re familiar with the codebase and process.
+
+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Meta's open source projects.
+
+Complete your CLA here: <https://code.facebook.com/cla>
+
+## Issues
+We use GitHub issues to track public bugs. Please ensure your description is
+clear and has sufficient instructions to be able to reproduce the issue.
+
+Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
+disclosure of security bugs. In those cases, please go through the process
+outlined on that page and do not file a public issue.
+
+## Set up your development environment
+
+We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
+You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
+
+You can install the dependencies by running:
+
+```bash
+cd llama-stack
+uv sync --group dev
+uv pip install -e .
+source .venv/bin/activate
+```
+
+> [!NOTE]
+> You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`)
+> Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
+> For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
+
+Note that you can create a dotenv file `.env` that includes necessary environment variables:
+```
+LLAMA_STACK_BASE_URL=http://localhost:8321
+LLAMA_STACK_CLIENT_LOG=debug
+LLAMA_STACK_PORT=8321
+LLAMA_STACK_CONFIG=<provider-name>
+TAVILY_SEARCH_API_KEY=
+BRAVE_SEARCH_API_KEY=
+```
+
+And then use this dotenv file when running client SDK tests via the following:
+```bash
+uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
+```
+
+## Pre-commit Hooks
+
+We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
+
+```bash
+uv run pre-commit install
+```
+
+After that, pre-commit hooks will run automatically before each commit.
+
+Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
+
+```bash
+uv run pre-commit run --all-files
+```
+
+> [!CAUTION]
+> Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
+
+## Running tests
+
+You can find the Llama Stack testing documentation [here](https://github.com/meta-llama/llama-stack/blob/main/tests/README.md).
+
+## Adding a new dependency to the project
+
+To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run:
+
+```bash
+uv add foo
+uv sync
+```

Main keeps "## Repository guidelines" with a "### Coding Style" subsection here; v0.2.17rc1 uses a top-level "## Coding Style" heading instead. The list itself is shared:

 * Comments should provide meaningful insights into the code. Avoid filler comments that simply
   describe the next step, as they create unnecessary clutter, same goes for docstrings.

@@ -148,11 +157,6 @@ As a general guideline:
   that describes the configuration. These descriptions will be used to generate the provider
   documentation.
 * When possible, use keyword arguments only when calling functions.
-* Llama Stack utilizes [custom Exception classes](llama_stack/apis/common/errors.py) for certain Resources that should be used where applicable.
-
-### License
-By contributing to Llama, you agree that your contributions will be licensed
-under the LICENSE file in the root directory of this source tree.

 ## Common Tasks

@@ -206,3 +210,7 @@ uv run ./docs/openapi_generator/run_openapi_generator.sh
 ```

 The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.
+
+## License
+By contributing to Llama, you agree that your contributions will be licensed
+under the LICENSE file in the root directory of this source tree.
README.md: 15 changed lines

@@ -9,7 +9,6 @@
 [**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)

 ### ✨🎉 Llama 4 Support 🎉✨
 We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.

@@ -180,17 +179,3 @@ Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest
 Check out our client SDKs for connecting to a Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [typescript](https://github.com/meta-llama/llama-stack-client-typescript), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.

 You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.

-## 🌟 GitHub Star History
-[](https://www.star-history.com/#meta-llama/llama-stack&Date)
-
-## ✨ Contributors
-
-Thanks to all of our amazing contributors!
-
-<a href="https://github.com/meta-llama/llama-stack/graphs/contributors">
-  <img src="https://contrib.rocks/image?repo=meta-llama/llama-stack" />
-</a>
+## Star History

(The Star History chart image reference inside the link on main was stripped by the capture; only the link target survives.)
docs/_static/js/keyboard_shortcuts.js (vendored): 14 changed lines

@@ -1,14 +0,0 @@ (the whole file exists on main and is absent on v0.2.17rc1):

document.addEventListener('keydown', function(event) {
  // command+K or ctrl+K
  if ((event.metaKey || event.ctrlKey) && event.key === 'k') {
    event.preventDefault();
    document.querySelector('.search-input, .search-field, input[name="q"]').focus();
  }

  // forward slash
  if (event.key === '/' &&
      !event.target.matches('input, textarea, select')) {
    event.preventDefault();
    document.querySelector('.search-input, .search-field, input[name="q"]').focus();
  }
});
426 docs/_static/llama-stack-spec.html (vendored)
|
@ -1452,40 +1452,6 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
|
||||||
"delete": {
|
|
||||||
"responses": {
|
|
||||||
"200": {
|
|
||||||
"description": "OK"
|
|
||||||
},
|
|
||||||
"400": {
|
|
||||||
"$ref": "#/components/responses/BadRequest400"
|
|
||||||
},
|
|
||||||
"429": {
|
|
||||||
"$ref": "#/components/responses/TooManyRequests429"
|
|
||||||
},
|
|
||||||
"500": {
|
|
||||||
"$ref": "#/components/responses/InternalServerError500"
|
|
||||||
},
|
|
||||||
"default": {
|
|
||||||
"$ref": "#/components/responses/DefaultError"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"tags": [
|
|
||||||
"Shields"
|
|
||||||
],
|
|
||||||
"description": "Unregister a shield.",
|
|
||||||
"parameters": [
|
|
||||||
{
|
|
||||||
"name": "identifier",
|
|
||||||
"in": "path",
|
|
||||||
"description": "The identifier of the shield to unregister.",
|
|
||||||
"required": true,
|
|
||||||
"schema": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"/v1/telemetry/traces/{trace_id}/spans/{span_id}": {
|
"/v1/telemetry/traces/{trace_id}/spans/{span_id}": {
|
||||||
|
@ -4734,49 +4700,6 @@
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"/v1/openai/v1/moderations": {
|
|
||||||
"post": {
|
|
||||||
"responses": {
|
|
||||||
"200": {
|
|
||||||
"description": "A moderation object.",
|
|
||||||
"content": {
|
|
||||||
"application/json": {
|
|
||||||
"schema": {
|
|
||||||
"$ref": "#/components/schemas/ModerationObject"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"400": {
|
|
||||||
"$ref": "#/components/responses/BadRequest400"
|
|
||||||
},
|
|
||||||
"429": {
|
|
||||||
"$ref": "#/components/responses/TooManyRequests429"
|
|
||||||
},
|
|
||||||
"500": {
|
|
||||||
"$ref": "#/components/responses/InternalServerError500"
|
|
||||||
},
|
|
||||||
"default": {
|
|
||||||
"$ref": "#/components/responses/DefaultError"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"tags": [
|
|
||||||
"Safety"
|
|
||||||
],
|
|
||||||
"description": "Classifies if text and/or image inputs are potentially harmful.",
|
|
||||||
"parameters": [],
|
|
||||||
"requestBody": {
|
|
||||||
"content": {
|
|
||||||
"application/json": {
|
|
||||||
"schema": {
|
|
||||||
"$ref": "#/components/schemas/RunModerationRequest"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"/v1/safety/run-shield": {
|
"/v1/safety/run-shield": {
|
||||||
"post": {
|
"post": {
|
||||||
"responses": {
|
"responses": {
|
||||||
|
@ -8293,60 +8216,28 @@
|
||||||
"type": "array",
|
"type": "array",
|
||||||
"items": {
|
"items": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"additionalProperties": {
|
||||||
"attributes": {
|
"oneOf": [
|
||||||
"type": "object",
|
{
|
||||||
"additionalProperties": {
|
"type": "null"
|
||||||
"oneOf": [
|
|
||||||
{
|
|
||||||
"type": "null"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "boolean"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "number"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "array"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "object"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
},
|
||||||
"description": "(Optional) Key-value attributes associated with the file"
|
{
|
||||||
},
|
"type": "boolean"
|
||||||
"file_id": {
|
},
|
||||||
"type": "string",
|
{
|
||||||
"description": "Unique identifier of the file containing the result"
|
"type": "number"
|
||||||
},
|
},
|
||||||
"filename": {
|
{
|
||||||
"type": "string",
|
"type": "string"
|
||||||
"description": "Name of the file containing the result"
|
},
|
||||||
},
|
{
|
||||||
"score": {
|
"type": "array"
|
||||||
"type": "number",
|
},
|
||||||
"description": "Relevance score for this search result (between 0 and 1)"
|
{
|
||||||
},
|
"type": "object"
|
||||||
"text": {
|
}
|
||||||
"type": "string",
|
]
|
||||||
"description": "Text content of the search result"
|
}
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"attributes",
|
|
||||||
"file_id",
|
|
||||||
"filename",
|
|
||||||
"score",
|
|
||||||
"text"
|
|
||||||
],
|
|
||||||
"title": "OpenAIResponseOutputMessageFileSearchToolCallResults",
|
|
||||||
"description": "Search results returned by the file search operation."
|
|
||||||
},
|
},
|
||||||
"description": "(Optional) Search results returned by the file search operation"
|
"description": "(Optional) Search results returned by the file search operation"
|
||||||
}
|
}
|
||||||
|
@ -8547,13 +8438,6 @@
|
||||||
"$ref": "#/components/schemas/OpenAIResponseInputTool"
|
"$ref": "#/components/schemas/OpenAIResponseInputTool"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"include": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"description": "(Optional) Additional fields to include in the response."
|
|
||||||
},
|
|
||||||
"max_infer_iters": {
|
"max_infer_iters": {
|
||||||
"type": "integer"
|
"type": "integer"
|
||||||
}
|
}
|
||||||
|
@ -8821,61 +8705,6 @@
|
||||||
"title": "OpenAIResponseOutputMessageMCPListTools",
|
"title": "OpenAIResponseOutputMessageMCPListTools",
|
||||||
"description": "MCP list tools output message containing available tools from an MCP server."
|
"description": "MCP list tools output message containing available tools from an MCP server."
|
||||||
},
|
},
|
||||||
"OpenAIResponseContentPart": {
|
|
||||||
"oneOf": [
|
|
||||||
{
|
|
||||||
"$ref": "#/components/schemas/OpenAIResponseContentPartOutputText"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"$ref": "#/components/schemas/OpenAIResponseContentPartRefusal"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"discriminator": {
|
|
||||||
"propertyName": "type",
|
|
||||||
"mapping": {
|
|
||||||
"output_text": "#/components/schemas/OpenAIResponseContentPartOutputText",
|
|
||||||
"refusal": "#/components/schemas/OpenAIResponseContentPartRefusal"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"OpenAIResponseContentPartOutputText": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"type": {
|
|
||||||
"type": "string",
|
|
||||||
"const": "output_text",
|
|
||||||
"default": "output_text"
|
|
||||||
},
|
|
||||||
"text": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"type",
|
|
||||||
"text"
|
|
||||||
],
|
|
||||||
"title": "OpenAIResponseContentPartOutputText"
|
|
||||||
},
|
|
||||||
"OpenAIResponseContentPartRefusal": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"type": {
|
|
||||||
"type": "string",
|
|
||||||
"const": "refusal",
|
|
||||||
"default": "refusal"
|
|
||||||
},
|
|
||||||
"refusal": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"type",
|
|
||||||
"refusal"
|
|
||||||
],
|
|
||||||
"title": "OpenAIResponseContentPartRefusal"
|
|
||||||
},
|
|
||||||
"OpenAIResponseObjectStream": {
|
"OpenAIResponseObjectStream": {
|
||||||
"oneOf": [
|
"oneOf": [
|
||||||
{
|
{
|
||||||
|
@ -8932,12 +8761,6 @@
|
||||||
{
|
{
|
||||||
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted"
|
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted"
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
|
"$ref": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
|
||||||
}
|
}
|
||||||
|
@ -8963,8 +8786,6 @@
|
||||||
"response.mcp_call.in_progress": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress",
|
"response.mcp_call.in_progress": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress",
|
||||||
"response.mcp_call.failed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed",
|
"response.mcp_call.failed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed",
|
||||||
"response.mcp_call.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted",
|
"response.mcp_call.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted",
|
||||||
"response.content_part.added": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded",
|
|
||||||
"response.content_part.done": "#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone",
|
|
||||||
"response.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
|
"response.completed": "#/components/schemas/OpenAIResponseObjectStreamResponseCompleted"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -8991,80 +8812,6 @@
|
||||||
"title": "OpenAIResponseObjectStreamResponseCompleted",
|
"title": "OpenAIResponseObjectStreamResponseCompleted",
|
||||||
"description": "Streaming event indicating a response has been completed."
|
"description": "Streaming event indicating a response has been completed."
|
||||||
},
|
},
|
||||||
"OpenAIResponseObjectStreamResponseContentPartAdded": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"response_id": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Unique identifier of the response containing this content"
|
|
||||||
},
|
|
||||||
"item_id": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Unique identifier of the output item containing this content part"
|
|
||||||
},
|
|
||||||
"part": {
|
|
||||||
"$ref": "#/components/schemas/OpenAIResponseContentPart",
|
|
||||||
"description": "The content part that was added"
|
|
||||||
},
|
|
||||||
"sequence_number": {
|
|
||||||
"type": "integer",
|
|
||||||
"description": "Sequential number for ordering streaming events"
|
|
||||||
},
|
|
||||||
"type": {
|
|
||||||
"type": "string",
|
|
||||||
"const": "response.content_part.added",
|
|
||||||
"default": "response.content_part.added",
|
|
||||||
"description": "Event type identifier, always \"response.content_part.added\""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"response_id",
|
|
||||||
"item_id",
|
|
||||||
"part",
|
|
||||||
"sequence_number",
|
|
||||||
"type"
|
|
||||||
],
|
|
||||||
"title": "OpenAIResponseObjectStreamResponseContentPartAdded",
|
|
||||||
"description": "Streaming event for when a new content part is added to a response item."
|
|
||||||
},
|
|
||||||
"OpenAIResponseObjectStreamResponseContentPartDone": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"response_id": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Unique identifier of the response containing this content"
|
|
||||||
},
|
|
||||||
"item_id": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "Unique identifier of the output item containing this content part"
|
|
||||||
},
|
|
||||||
"part": {
|
|
||||||
"$ref": "#/components/schemas/OpenAIResponseContentPart",
|
|
||||||
"description": "The completed content part"
|
|
||||||
},
|
|
||||||
"sequence_number": {
|
|
||||||
"type": "integer",
|
|
||||||
"description": "Sequential number for ordering streaming events"
|
|
||||||
},
|
|
||||||
"type": {
|
|
||||||
"type": "string",
|
|
||||||
"const": "response.content_part.done",
|
|
||||||
"default": "response.content_part.done",
|
|
||||||
"description": "Event type identifier, always \"response.content_part.done\""
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"response_id",
|
|
||||||
"item_id",
|
|
||||||
"part",
|
|
||||||
"sequence_number",
|
|
||||||
"type"
|
|
||||||
],
|
|
||||||
"title": "OpenAIResponseObjectStreamResponseContentPartDone",
|
|
||||||
"description": "Streaming event for when a content part is completed."
|
|
||||||
},
|
|
||||||
"OpenAIResponseObjectStreamResponseCreated": {
|
"OpenAIResponseObjectStreamResponseCreated": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
@ -14767,8 +14514,7 @@
|
||||||
"OpenAIFilePurpose": {
|
"OpenAIFilePurpose": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"enum": [
|
"enum": [
|
||||||
"assistants",
|
"assistants"
|
||||||
"batch"
|
|
||||||
],
|
],
|
||||||
"title": "OpenAIFilePurpose",
|
"title": "OpenAIFilePurpose",
|
||||||
"description": "Valid purpose values for OpenAI Files API."
|
"description": "Valid purpose values for OpenAI Files API."
|
||||||
|
@ -14845,8 +14591,7 @@
|
||||||
"purpose": {
|
"purpose": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"enum": [
|
"enum": [
|
||||||
"assistants",
|
"assistants"
|
||||||
"batch"
|
|
||||||
],
|
],
|
||||||
"description": "The intended purpose of the file"
|
"description": "The intended purpose of the file"
|
||||||
}
|
}
|
||||||
|
@ -16622,131 +16367,6 @@
|
||||||
],
|
],
|
||||||
"title": "RunEvalRequest"
|
"title": "RunEvalRequest"
|
||||||
},
|
},
|
||||||
"RunModerationRequest": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"input": {
|
|
||||||
"oneOf": [
|
|
||||||
{
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"description": "Input (or inputs) to classify. Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models."
|
|
||||||
},
|
|
||||||
"model": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "The content moderation model you would like to use."
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"input",
|
|
||||||
"model"
|
|
||||||
],
|
|
||||||
"title": "RunModerationRequest"
|
|
||||||
},
|
|
||||||
"ModerationObject": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"id": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "The unique identifier for the moderation request."
|
|
||||||
},
|
|
||||||
"model": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "The model used to generate the moderation results."
|
|
||||||
},
|
|
||||||
"results": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"$ref": "#/components/schemas/ModerationObjectResults"
|
|
||||||
},
|
|
||||||
"description": "A list of moderation objects"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"id",
|
|
||||||
"model",
|
|
||||||
"results"
|
|
||||||
],
|
|
||||||
"title": "ModerationObject",
|
|
||||||
"description": "A moderation object."
|
|
||||||
},
|
|
||||||
"ModerationObjectResults": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"flagged": {
|
|
||||||
"type": "boolean",
|
|
||||||
"description": "Whether any of the below categories are flagged."
|
|
||||||
},
|
|
||||||
"categories": {
|
|
||||||
"type": "object",
|
|
||||||
"additionalProperties": {
|
|
||||||
"type": "boolean"
|
|
||||||
},
|
|
||||||
"description": "A list of the categories, and whether they are flagged or not."
|
|
||||||
},
|
|
||||||
"category_applied_input_types": {
|
|
||||||
"type": "object",
|
|
||||||
"additionalProperties": {
|
|
||||||
"type": "array",
|
|
||||||
"items": {
|
|
||||||
"type": "string"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"description": "A list of the categories along with the input type(s) that the score applies to."
|
|
||||||
},
|
|
||||||
"category_scores": {
|
|
||||||
"type": "object",
|
|
||||||
"additionalProperties": {
|
|
||||||
"type": "number"
|
|
||||||
},
|
|
||||||
"description": "A list of the categories along with their scores as predicted by model."
|
|
||||||
},
|
|
||||||
"user_message": {
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
"metadata": {
|
|
||||||
"type": "object",
|
|
||||||
"additionalProperties": {
|
|
||||||
"oneOf": [
|
|
||||||
{
|
|
||||||
"type": "null"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "boolean"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "number"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "string"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "array"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "object"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"additionalProperties": false,
|
|
||||||
"required": [
|
|
||||||
"flagged",
|
|
||||||
"metadata"
|
|
||||||
],
|
|
||||||
"title": "ModerationObjectResults",
|
|
||||||
"description": "A moderation object."
|
|
||||||
},
|
|
||||||
"RunShieldRequest": {
|
"RunShieldRequest": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
|
310 docs/_static/llama-stack-spec.yaml (vendored)
|
@ -999,31 +999,6 @@ paths:
|
||||||
required: true
|
required: true
|
||||||
schema:
|
schema:
|
||||||
type: string
|
type: string
|
||||||
delete:
|
|
||||||
responses:
|
|
||||||
'200':
|
|
||||||
description: OK
|
|
||||||
'400':
|
|
||||||
$ref: '#/components/responses/BadRequest400'
|
|
||||||
'429':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/TooManyRequests429
|
|
||||||
'500':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/InternalServerError500
|
|
||||||
default:
|
|
||||||
$ref: '#/components/responses/DefaultError'
|
|
||||||
tags:
|
|
||||||
- Shields
|
|
||||||
description: Unregister a shield.
|
|
||||||
parameters:
|
|
||||||
- name: identifier
|
|
||||||
in: path
|
|
||||||
description: >-
|
|
||||||
The identifier of the shield to unregister.
|
|
||||||
required: true
|
|
||||||
schema:
|
|
||||||
type: string
|
|
||||||
/v1/telemetry/traces/{trace_id}/spans/{span_id}:
|
/v1/telemetry/traces/{trace_id}/spans/{span_id}:
|
||||||
get:
|
get:
|
||||||
responses:
|
responses:
|
||||||
|
@ -3358,36 +3333,6 @@ paths:
|
||||||
schema:
|
schema:
|
||||||
$ref: '#/components/schemas/RunEvalRequest'
|
$ref: '#/components/schemas/RunEvalRequest'
|
||||||
required: true
|
required: true
|
||||||
/v1/openai/v1/moderations:
|
|
||||||
post:
|
|
||||||
responses:
|
|
||||||
'200':
|
|
||||||
description: A moderation object.
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: '#/components/schemas/ModerationObject'
|
|
||||||
'400':
|
|
||||||
$ref: '#/components/responses/BadRequest400'
|
|
||||||
'429':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/TooManyRequests429
|
|
||||||
'500':
|
|
||||||
$ref: >-
|
|
||||||
#/components/responses/InternalServerError500
|
|
||||||
default:
|
|
||||||
$ref: '#/components/responses/DefaultError'
|
|
||||||
tags:
|
|
||||||
- Safety
|
|
||||||
description: >-
|
|
||||||
Classifies if text and/or image inputs are potentially harmful.
|
|
||||||
parameters: []
|
|
||||||
requestBody:
|
|
||||||
content:
|
|
||||||
application/json:
|
|
||||||
schema:
|
|
||||||
$ref: '#/components/schemas/RunModerationRequest'
|
|
||||||
required: true
|
|
||||||
/v1/safety/run-shield:
|
/v1/safety/run-shield:
|
||||||
post:
|
post:
|
||||||
responses:
|
responses:
|
||||||
|
@ -6021,44 +5966,14 @@ components:
|
||||||
type: array
|
type: array
|
||||||
items:
|
items:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
additionalProperties:
|
||||||
attributes:
|
oneOf:
|
||||||
type: object
|
- type: 'null'
|
||||||
additionalProperties:
|
- type: boolean
|
||||||
oneOf:
|
- type: number
|
||||||
- type: 'null'
|
- type: string
|
||||||
- type: boolean
|
- type: array
|
||||||
- type: number
|
- type: object
|
||||||
- type: string
|
|
||||||
- type: array
|
|
||||||
- type: object
|
|
||||||
description: >-
|
|
||||||
(Optional) Key-value attributes associated with the file
|
|
||||||
file_id:
|
|
||||||
type: string
|
|
||||||
description: >-
|
|
||||||
Unique identifier of the file containing the result
|
|
||||||
filename:
|
|
||||||
type: string
|
|
||||||
description: Name of the file containing the result
|
|
||||||
score:
|
|
||||||
type: number
|
|
||||||
description: >-
|
|
||||||
Relevance score for this search result (between 0 and 1)
|
|
||||||
text:
|
|
||||||
type: string
|
|
||||||
description: Text content of the search result
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- attributes
|
|
||||||
- file_id
|
|
||||||
- filename
|
|
||||||
- score
|
|
||||||
- text
|
|
||||||
title: >-
|
|
||||||
OpenAIResponseOutputMessageFileSearchToolCallResults
|
|
||||||
description: >-
|
|
||||||
Search results returned by the file search operation.
|
|
||||||
description: >-
|
description: >-
|
||||||
(Optional) Search results returned by the file search operation
|
(Optional) Search results returned by the file search operation
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
|
@ -6218,12 +6133,6 @@ components:
|
||||||
type: array
|
type: array
|
||||||
items:
|
items:
|
||||||
$ref: '#/components/schemas/OpenAIResponseInputTool'
|
$ref: '#/components/schemas/OpenAIResponseInputTool'
|
||||||
include:
|
|
||||||
type: array
|
|
||||||
items:
|
|
||||||
type: string
|
|
||||||
description: >-
|
|
||||||
(Optional) Additional fields to include in the response.
|
|
||||||
max_infer_iters:
|
max_infer_iters:
|
||||||
type: integer
|
type: integer
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
|
@ -6441,43 +6350,6 @@ components:
|
||||||
title: OpenAIResponseOutputMessageMCPListTools
|
title: OpenAIResponseOutputMessageMCPListTools
|
||||||
description: >-
|
description: >-
|
||||||
MCP list tools output message containing available tools from an MCP server.
|
MCP list tools output message containing available tools from an MCP server.
|
||||||
OpenAIResponseContentPart:
|
|
||||||
oneOf:
|
|
||||||
- $ref: '#/components/schemas/OpenAIResponseContentPartOutputText'
|
|
||||||
- $ref: '#/components/schemas/OpenAIResponseContentPartRefusal'
|
|
||||||
discriminator:
|
|
||||||
propertyName: type
|
|
||||||
mapping:
|
|
||||||
output_text: '#/components/schemas/OpenAIResponseContentPartOutputText'
|
|
||||||
refusal: '#/components/schemas/OpenAIResponseContentPartRefusal'
|
|
||||||
OpenAIResponseContentPartOutputText:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
type:
|
|
||||||
type: string
|
|
||||||
const: output_text
|
|
||||||
default: output_text
|
|
||||||
text:
|
|
||||||
type: string
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- type
|
|
||||||
- text
|
|
||||||
title: OpenAIResponseContentPartOutputText
|
|
||||||
OpenAIResponseContentPartRefusal:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
type:
|
|
||||||
type: string
|
|
||||||
const: refusal
|
|
||||||
default: refusal
|
|
||||||
refusal:
|
|
||||||
type: string
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- type
|
|
||||||
- refusal
|
|
||||||
title: OpenAIResponseContentPartRefusal
|
|
||||||
OpenAIResponseObjectStream:
|
OpenAIResponseObjectStream:
|
||||||
oneOf:
|
oneOf:
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
|
||||||
|
@ -6498,8 +6370,6 @@ components:
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded'
|
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone'
|
|
||||||
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
|
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
|
||||||
discriminator:
|
discriminator:
|
||||||
propertyName: type
|
propertyName: type
|
||||||
|
@ -6522,8 +6392,6 @@ components:
|
||||||
response.mcp_call.in_progress: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
|
response.mcp_call.in_progress: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallInProgress'
|
||||||
response.mcp_call.failed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
|
response.mcp_call.failed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallFailed'
|
||||||
response.mcp_call.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
|
response.mcp_call.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseMcpCallCompleted'
|
||||||
response.content_part.added: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartAdded'
|
|
||||||
response.content_part.done: '#/components/schemas/OpenAIResponseObjectStreamResponseContentPartDone'
|
|
||||||
response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
|
response.completed: '#/components/schemas/OpenAIResponseObjectStreamResponseCompleted'
|
||||||
"OpenAIResponseObjectStreamResponseCompleted":
|
"OpenAIResponseObjectStreamResponseCompleted":
|
||||||
type: object
|
type: object
|
||||||
|
@ -6545,76 +6413,6 @@ components:
|
||||||
OpenAIResponseObjectStreamResponseCompleted
|
OpenAIResponseObjectStreamResponseCompleted
|
||||||
description: >-
|
description: >-
|
||||||
Streaming event indicating a response has been completed.
|
Streaming event indicating a response has been completed.
|
||||||
"OpenAIResponseObjectStreamResponseContentPartAdded":
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
response_id:
|
|
||||||
type: string
|
|
||||||
description: >-
|
|
||||||
Unique identifier of the response containing this content
|
|
||||||
item_id:
|
|
||||||
type: string
|
|
||||||
description: >-
|
|
||||||
Unique identifier of the output item containing this content part
|
|
||||||
part:
|
|
||||||
$ref: '#/components/schemas/OpenAIResponseContentPart'
|
|
||||||
description: The content part that was added
|
|
||||||
sequence_number:
|
|
||||||
type: integer
|
|
||||||
description: >-
|
|
||||||
Sequential number for ordering streaming events
|
|
||||||
type:
|
|
||||||
type: string
|
|
||||||
const: response.content_part.added
|
|
||||||
default: response.content_part.added
|
|
||||||
description: >-
|
|
||||||
Event type identifier, always "response.content_part.added"
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- response_id
|
|
||||||
- item_id
|
|
||||||
- part
|
|
||||||
- sequence_number
|
|
||||||
- type
|
|
||||||
title: >-
|
|
||||||
OpenAIResponseObjectStreamResponseContentPartAdded
|
|
||||||
description: >-
|
|
||||||
Streaming event for when a new content part is added to a response item.
|
|
||||||
"OpenAIResponseObjectStreamResponseContentPartDone":
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
response_id:
|
|
||||||
type: string
|
|
||||||
description: >-
|
|
||||||
Unique identifier of the response containing this content
|
|
||||||
item_id:
|
|
||||||
type: string
|
|
||||||
description: >-
|
|
||||||
Unique identifier of the output item containing this content part
|
|
||||||
part:
|
|
||||||
$ref: '#/components/schemas/OpenAIResponseContentPart'
|
|
||||||
description: The completed content part
|
|
||||||
sequence_number:
|
|
||||||
type: integer
|
|
||||||
description: >-
|
|
||||||
Sequential number for ordering streaming events
|
|
||||||
type:
|
|
||||||
type: string
|
|
||||||
const: response.content_part.done
|
|
||||||
default: response.content_part.done
|
|
||||||
description: >-
|
|
||||||
Event type identifier, always "response.content_part.done"
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- response_id
|
|
||||||
- item_id
|
|
||||||
- part
|
|
||||||
- sequence_number
|
|
||||||
- type
|
|
||||||
title: >-
|
|
||||||
OpenAIResponseObjectStreamResponseContentPartDone
|
|
||||||
description: >-
|
|
||||||
Streaming event for when a content part is completed.
|
|
||||||
"OpenAIResponseObjectStreamResponseCreated":
|
"OpenAIResponseObjectStreamResponseCreated":
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
@ -10951,7 +10749,6 @@ components:
|
||||||
type: string
|
type: string
|
||||||
enum:
|
enum:
|
||||||
- assistants
|
- assistants
|
||||||
- batch
|
|
||||||
title: OpenAIFilePurpose
|
title: OpenAIFilePurpose
|
||||||
description: >-
|
description: >-
|
||||||
Valid purpose values for OpenAI Files API.
|
Valid purpose values for OpenAI Files API.
|
||||||
|
@ -11020,7 +10817,6 @@ components:
|
||||||
type: string
|
type: string
|
||||||
enum:
|
enum:
|
||||||
- assistants
|
- assistants
|
||||||
- batch
|
|
||||||
description: The intended purpose of the file
|
description: The intended purpose of the file
|
||||||
additionalProperties: false
|
additionalProperties: false
|
||||||
required:
|
required:
|
||||||
|
@ -12363,96 +12159,6 @@ components:
|
||||||
required:
|
required:
|
||||||
- benchmark_config
|
- benchmark_config
|
||||||
title: RunEvalRequest
|
title: RunEvalRequest
|
||||||
RunModerationRequest:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
input:
|
|
||||||
oneOf:
|
|
||||||
- type: string
|
|
||||||
- type: array
|
|
||||||
items:
|
|
||||||
type: string
|
|
||||||
description: >-
|
|
||||||
Input (or inputs) to classify. Can be a single string, an array of strings,
|
|
||||||
or an array of multi-modal input objects similar to other models.
|
|
||||||
model:
|
|
||||||
type: string
|
|
||||||
description: >-
|
|
||||||
The content moderation model you would like to use.
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- input
|
|
||||||
- model
|
|
||||||
title: RunModerationRequest
|
|
||||||
ModerationObject:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
id:
|
|
||||||
type: string
|
|
||||||
description: >-
|
|
||||||
The unique identifier for the moderation request.
|
|
||||||
model:
|
|
||||||
type: string
|
|
||||||
description: >-
|
|
||||||
The model used to generate the moderation results.
|
|
||||||
results:
|
|
||||||
type: array
|
|
||||||
items:
|
|
||||||
$ref: '#/components/schemas/ModerationObjectResults'
|
|
||||||
description: A list of moderation objects
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- id
|
|
||||||
- model
|
|
||||||
- results
|
|
||||||
title: ModerationObject
|
|
||||||
description: A moderation object.
|
|
||||||
ModerationObjectResults:
|
|
||||||
type: object
|
|
||||||
properties:
|
|
||||||
flagged:
|
|
||||||
type: boolean
|
|
||||||
description: >-
|
|
||||||
Whether any of the below categories are flagged.
|
|
||||||
categories:
|
|
||||||
type: object
|
|
||||||
additionalProperties:
|
|
||||||
type: boolean
|
|
||||||
description: >-
|
|
||||||
A list of the categories, and whether they are flagged or not.
|
|
||||||
category_applied_input_types:
|
|
||||||
type: object
|
|
||||||
additionalProperties:
|
|
||||||
type: array
|
|
||||||
items:
|
|
||||||
type: string
|
|
||||||
description: >-
|
|
||||||
A list of the categories along with the input type(s) that the score applies
|
|
||||||
to.
|
|
||||||
category_scores:
|
|
||||||
type: object
|
|
||||||
additionalProperties:
|
|
||||||
type: number
|
|
||||||
description: >-
|
|
||||||
A list of the categories along with their scores as predicted by model.
|
|
||||||
user_message:
|
|
||||||
type: string
|
|
||||||
metadata:
|
|
||||||
type: object
|
|
||||||
additionalProperties:
|
|
||||||
oneOf:
|
|
||||||
- type: 'null'
|
|
||||||
- type: boolean
|
|
||||||
- type: number
|
|
||||||
- type: string
|
|
||||||
- type: array
|
|
||||||
- type: object
|
|
||||||
additionalProperties: false
|
|
||||||
required:
|
|
||||||
- flagged
|
|
||||||
- metadata
|
|
||||||
title: ModerationObjectResults
|
|
||||||
description: A moderation object.
|
|
||||||
RunShieldRequest:
|
RunShieldRequest:
|
||||||
type: object
|
type: object
|
||||||
properties:
|
properties:
|
||||||
|
|
|
@ -111,7 +111,7 @@ name = "llama-stack-api-weather"
version = "0.1.0"
description = "Weather API for Llama Stack"
readme = "README.md"
requires-python = ">=3.12"
requires-python = ">=3.10"
dependencies = ["llama-stack", "pydantic"]

[build-system]

@ -231,7 +231,7 @@ name = "llama-stack-provider-kaze"
version = "0.1.0"
description = "Kaze weather provider for Llama Stack"
readme = "README.md"
requires-python = ">=3.12"
requires-python = ">=3.10"
dependencies = ["llama-stack", "pydantic", "aiohttp"]

[build-system]
@ -2,9 +2,7 @@

Llama Stack (LLS) provides two different APIs for building AI applications with tool calling capabilities: the **Agents API** and the **OpenAI Responses API**. While both enable AI systems to use tools, and maintain full conversation history, they serve different use cases and have distinct characteristics.

```{note}
For simple and basic inferencing, you may want to use the [Chat Completions API](https://llama-stack.readthedocs.io/en/latest/providers/index.html#chat-completions) directly, before progressing to Agents or Responses API.
```
> **Note:** For simple and basic inferencing, you may want to use the [Chat Completions API](https://llama-stack.readthedocs.io/en/latest/providers/index.html#chat-completions) directly, before progressing to Agents or Responses API.

## Overview

@ -76,9 +76,7 @@ Features:
- Context retrieval with token limits

```{note}
By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.
```
> **Note:** By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, that are provided by tavily-search, wolfram-alpha and rag providers.

## Model Context Protocol (MCP)
@ -18,4 +18,3 @@ We are working on adding a few more APIs to complete the application lifecycle.
- **Batch Inference**: run inference on a dataset of inputs
- **Batch Agents**: run agents on a dataset of inputs
- **Synthetic Data Generation**: generate synthetic data for model development
- **Batches**: OpenAI-compatible batch management for inference
@ -131,7 +131,6 @@ html_static_path = ["../_static"]
def setup(app):
    app.add_css_file("css/my_theme.css")
    app.add_js_file("js/detect_theme.js")
    app.add_js_file("js/keyboard_shortcuts.js")


def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
    url = f"https://hub.docker.com/r/llamastack/{text}"
@ -2,38 +2,14 @@
```{include} ../../../CONTRIBUTING.md
```

## Adding a New Provider
See the [Adding a New API Provider](new_api_provider.md) which describes how to add new API providers to the Stack.

See:
- [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.
- [Vector Database Page](new_vector_database.md) which describes how to add a new vector databases with Llama Stack.
- [External Provider Page](../providers/external/index.md) which describes how to add external providers to the Stack.

```{toctree}
:maxdepth: 1
:hidden:

new_api_provider
new_vector_database
testing
```

## Testing

```{include} ../../../tests/README.md
```

## Advanced Topics

For developers who need deeper understanding of the testing system internals:

```{toctree}
:maxdepth: 1

testing/record-replay
```

### Benchmarking

```{include} ../../../docs/source/distributions/k8s-benchmark/README.md
```
@ -1,75 +0,0 @@
# Adding a New Vector Database

This guide will walk you through the process of adding a new vector database to Llama Stack.

> **_NOTE:_** Here's an example Pull Request of the [Milvus Vector Database Provider](https://github.com/meta-llama/llama-stack/pull/1467).

Vector Database providers are used to store and retrieve vector embeddings. Vector databases are not limited to vector
search but can support keyword and hybrid search. Additionally, vector database can also support operations like
filtering, sorting, and aggregating vectors.

## Steps to Add a New Vector Database Provider
1. **Choose the Database Type**: Determine if your vector database is a remote service, inline, or both.
   - Remote databases make requests to external services, while inline databases execute locally. Some providers support both.
2. **Implement the Provider**: Create a new provider class that inherits from `VectorDatabaseProvider` and implements the required methods.
   - Implement methods for vector storage, retrieval, search, and any additional features your database supports.
   - You will need to implement the following methods for `YourVectorIndex`:
     - `YourVectorIndex.create()`
     - `YourVectorIndex.initialize()`
     - `YourVectorIndex.add_chunks()`
     - `YourVectorIndex.delete_chunk()`
     - `YourVectorIndex.query_vector()`
     - `YourVectorIndex.query_keyword()`
     - `YourVectorIndex.query_hybrid()`
   - You will need to implement the following methods for `YourVectorIOAdapter`:
     - `YourVectorIOAdapter.initialize()`
     - `YourVectorIOAdapter.shutdown()`
     - `YourVectorIOAdapter.list_vector_dbs()`
     - `YourVectorIOAdapter.register_vector_db()`
     - `YourVectorIOAdapter.unregister_vector_db()`
     - `YourVectorIOAdapter.insert_chunks()`
     - `YourVectorIOAdapter.query_chunks()`
     - `YourVectorIOAdapter.delete_chunks()`
3. **Add to Registry**: Register your provider in the appropriate registry file.
   - Update {repopath}`llama_stack/providers/registry/vector_io.py` to include your new provider.
```python
from llama_stack.providers.registry.specs import InlineProviderSpec
from llama_stack.providers.registry.api import Api

InlineProviderSpec(
    api=Api.vector_io,
    provider_type="inline::milvus",
    pip_packages=["pymilvus>=2.4.10"],
    module="llama_stack.providers.inline.vector_io.milvus",
    config_class="llama_stack.providers.inline.vector_io.milvus.MilvusVectorIOConfig",
    api_dependencies=[Api.inference],
    optional_api_dependencies=[Api.files],
    description="",
),
```
4. **Add Tests**: Create unit tests and integration tests for your provider in the `tests/` directory.
   - Unit Tests
     - By following the structure of the class methods, you will be able to easily run unit and integration tests for your database.
       1. You have to configure the tests for your provide in `/tests/unit/providers/vector_io/conftest.py`.
       2. Update the `vector_provider` fixture to include your provider if they are an inline provider.
       3. Create a `your_vectorprovider_index` fixture that initializes your vector index.
       4. Create a `your_vectorprovider_adapter` fixture that initializes your vector adapter.
       5. Add your provider to the `vector_io_providers` fixture dictionary.
     - Please follow the naming convention of `your_vectorprovider_index` and `your_vectorprovider_adapter` as the tests require this to execute properly.
   - Integration Tests
     - Integration tests are located in {repopath}`tests/integration`. These tests use the python client-SDK APIs (from the `llama_stack_client` package) to test functionality.
     - The two set of integration tests are:
       - `tests/integration/vector_io/test_vector_io.py`: This file tests registration, insertion, and retrieval.
       - `tests/integration/vector_io/test_openai_vector_stores.py`: These tests are for OpenAI-compatible vector stores and test the OpenAI API compatibility.
     - You will need to update `skip_if_provider_doesnt_support_openai_vector_stores` to include your provider as well as `skip_if_provider_doesnt_support_openai_vector_stores_search` to test the appropriate search functionality.
   - Running the tests in the GitHub CI
     - You will need to update the `.github/workflows/integration-vector-io-tests.yml` file to include your provider.
     - If your provider is a remote provider, you will also have to add a container to spin up and run it in the action.
   - Updating the pyproject.yml
     - If you are adding tests for the `inline` provider you will have to update the `unit` group.
       - `uv add new_pip_package --group unit`
     - If you are adding tests for the `remote` provider you will have to update the `test` group, which is used in the GitHub CI for integration tests.
       - `uv add new_pip_package --group test`
5. **Update Documentation**: Please update the documentation for end users
   - Generate the provider documentation by running {repopath}`./scripts/provider_codegen.py`.
   - Update the autogenerated content in the registry/vector_io.py file with information about your provider. Please see other providers for examples.
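For illustration, a provider skeleton covering the methods listed above could start from the sketch below. Only the class and method names come from the guide; the parameters and return types are assumptions, and the bodies are placeholders rather than a working provider.

```python
# Skeleton sketch only: names follow the method lists above; signatures are
# assumptions and every body is a placeholder, not a real implementation.
class YourVectorIndex:
    @classmethod
    async def create(cls) -> "YourVectorIndex": ...
    async def initialize(self) -> None: ...
    async def add_chunks(self, chunks: list) -> None: ...
    async def delete_chunk(self, chunk_id: str) -> None: ...
    async def query_vector(self, embedding: list[float], k: int = 10): ...
    async def query_keyword(self, query: str, k: int = 10): ...
    async def query_hybrid(self, embedding: list[float], query: str, k: int = 10): ...


class YourVectorIOAdapter:
    async def initialize(self) -> None: ...
    async def shutdown(self) -> None: ...
    async def list_vector_dbs(self) -> list: ...
    async def register_vector_db(self, vector_db) -> None: ...
    async def unregister_vector_db(self, vector_db_id: str) -> None: ...
    async def insert_chunks(self, vector_db_id: str, chunks: list) -> None: ...
    async def query_chunks(self, vector_db_id: str, query): ...
    async def delete_chunks(self, vector_db_id: str, chunk_ids: list[str]) -> None: ...
```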
6 docs/source/contributing/testing.md (Normal file)

@ -0,0 +1,6 @@
# Testing Llama Stack

Tests are of three different kinds:
- Unit tests
- Provider focused integration tests
- Client SDK tests
@ -1,234 +0,0 @@
# Record-Replay System

Understanding how Llama Stack captures and replays API interactions for testing.

## Overview

The record-replay system solves a fundamental challenge in AI testing: how do you test against expensive, non-deterministic APIs without breaking the bank or dealing with flaky tests?

The solution: intercept API calls, store real responses, and replay them later. This gives you real API behavior without the cost or variability.

## How It Works

### Request Hashing

Every API request gets converted to a deterministic hash for lookup:

```python
import hashlib
import json
from urllib.parse import urlparse


def normalize_request(method: str, url: str, headers: dict, body: dict) -> str:
    normalized = {
        "method": method.upper(),
        "endpoint": urlparse(url).path,  # Just the path, not full URL
        "body": body,  # Request parameters
    }
    return hashlib.sha256(json.dumps(normalized, sort_keys=True).encode()).hexdigest()
```

**Key insight:** The hashing is intentionally precise. Different whitespace, float precision, or parameter order produces different hashes. This prevents subtle bugs from false cache hits.

```python
# These produce DIFFERENT hashes:
{"content": "Hello world"}
{"content": "Hello world\n"}
{"temperature": 0.7}
{"temperature": 0.7000001}
```

### Client Interception

The system patches OpenAI and Ollama client methods to intercept calls before they leave your application. This happens transparently - your test code doesn't change.
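The patching code itself is not reproduced in this document. As a rough sketch of the idea (the `storage` helper and its `hash_request`/`load`/`save` methods are hypothetical stand-ins, not the actual Llama Stack internals), wrapping a client method could look like:

```python
import functools


def patch_method(client_cls, method_name, storage, mode):
    # Replace client_cls.method_name with a wrapper that records or replays.
    original = getattr(client_cls, method_name)

    @functools.wraps(original)
    async def wrapper(self, *args, **kwargs):
        key = storage.hash_request(method_name, kwargs)   # assumed helper
        if mode == "replay":
            return storage.load(key)                       # serve the recording
        response = await original(self, *args, **kwargs)   # real API call
        if mode == "record":
            storage.save(key, response)
        return response

    setattr(client_cls, method_name, wrapper)
```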
### Storage Architecture

Recordings use a two-tier storage system optimized for both speed and debuggability:

```
recordings/
├── index.sqlite           # Fast lookup by request hash
└── responses/
    ├── abc123def456.json  # Individual response files
    └── def789ghi012.json
```

**SQLite index** enables O(log n) hash lookups and metadata queries without loading response bodies.

**JSON files** store complete request/response pairs in human-readable format for debugging.
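As a rough sketch of how the two tiers could fit together on a lookup (the `request_hash` column is an assumption; only the `recordings` table, the `endpoint` column, and the hash-named JSON files appear elsewhere in this document):

```python
import json
import sqlite3
from pathlib import Path


def lookup_recording(storage_dir: str, request_hash: str):
    # Check the SQLite index first; the exact schema here is illustrative.
    index = sqlite3.connect(str(Path(storage_dir) / "index.sqlite"))
    try:
        row = index.execute(
            "SELECT endpoint FROM recordings WHERE request_hash = ?",
            (request_hash,),
        ).fetchone()
    finally:
        index.close()
    if row is None:
        return None  # REPLAY mode treats a missing recording as an error

    # Response bodies live in per-request JSON files named by the hash.
    response_file = Path(storage_dir) / "responses" / f"{request_hash}.json"
    return json.loads(response_file.read_text())
```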
## Recording Modes

### LIVE Mode

Direct API calls with no recording or replay:

```python
with inference_recording(mode=InferenceMode.LIVE):
    response = await client.chat.completions.create(...)
```

Use for initial development and debugging against real APIs.

### RECORD Mode

Captures API interactions while passing through real responses:

```python
with inference_recording(mode=InferenceMode.RECORD, storage_dir="./recordings"):
    response = await client.chat.completions.create(...)
    # Real API call made, response captured AND returned
```

The recording process:
1. Request intercepted and hashed
2. Real API call executed
3. Response captured and serialized
4. Recording stored to disk
5. Original response returned to caller

### REPLAY Mode

Returns stored responses instead of making API calls:

```python
with inference_recording(mode=InferenceMode.REPLAY, storage_dir="./recordings"):
    response = await client.chat.completions.create(...)
    # No API call made, cached response returned instantly
```

The replay process:
1. Request intercepted and hashed
2. Hash looked up in SQLite index
3. Response loaded from JSON file
4. Response deserialized and returned
5. Error if no recording found

## Streaming Support

Streaming APIs present a unique challenge: how do you capture an async generator?

### The Problem

```python
# How do you record this?
async for chunk in client.chat.completions.create(stream=True):
    process(chunk)
```

### The Solution

The system captures all chunks immediately before yielding any:

```python
async def handle_streaming_record(response):
    # Capture complete stream first
    chunks = []
    async for chunk in response:
        chunks.append(chunk)

    # Store complete recording
    storage.store_recording(
        request_hash, request_data, {"body": chunks, "is_streaming": True}
    )

    # Return generator that replays captured chunks
    async def replay_stream():
        for chunk in chunks:
            yield chunk

    return replay_stream()
```

This ensures:
- **Complete capture** - The entire stream is saved atomically
- **Interface preservation** - The returned object behaves like the original API
- **Deterministic replay** - Same chunks in the same order every time

## Serialization

API responses contain complex Pydantic objects that need careful serialization:

```python
def _serialize_response(response):
    if hasattr(response, "model_dump"):
        # Preserve type information for proper deserialization
        return {
            "__type__": f"{response.__class__.__module__}.{response.__class__.__qualname__}",
            "__data__": response.model_dump(mode="json"),
        }
    return response
```

This preserves type safety - when replayed, you get the same Pydantic objects with all their validation and methods.
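The matching deserialization step is not shown here. A minimal sketch, assuming `__type__` is an importable `module.QualName` path to a Pydantic v2 model (not necessarily the actual implementation):

```python
import importlib


def _deserialize_response(data):
    # Rebuild the typed object from the {"__type__", "__data__"} pair above.
    if isinstance(data, dict) and "__type__" in data:
        module_name, _, class_name = data["__type__"].rpartition(".")
        cls = getattr(importlib.import_module(module_name), class_name)
        return cls.model_validate(data["__data__"])  # Pydantic v2 validation
    return data
```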
## Environment Integration

### Environment Variables

Control recording behavior globally:

```bash
export LLAMA_STACK_TEST_INFERENCE_MODE=replay
export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings
pytest tests/integration/
```

### Pytest Integration

The system integrates automatically based on environment variables, requiring no changes to test code.

## Debugging Recordings

### Inspecting Storage

```bash
# See what's recorded
sqlite3 recordings/index.sqlite "SELECT endpoint, model, timestamp FROM recordings LIMIT 10;"

# View specific response
cat recordings/responses/abc123def456.json | jq '.response.body'

# Find recordings by endpoint
sqlite3 recordings/index.sqlite "SELECT * FROM recordings WHERE endpoint='/v1/chat/completions';"
```

### Common Issues

**Hash mismatches:** Request parameters changed slightly between record and replay
```bash
# Compare request details
cat recordings/responses/abc123.json | jq '.request'
```

**Serialization errors:** Response types changed between versions
```bash
# Re-record with updated types
rm recordings/responses/failing_hash.json
LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_failing.py
```

**Missing recordings:** New test or changed parameters
```bash
# Record the missing interaction
LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_new.py
```

## Design Decisions

### Why Not Mocks?

Traditional mocking breaks down with AI APIs because:
- Response structures are complex and evolve frequently
- Streaming behavior is hard to mock correctly
- Edge cases in real APIs get missed
- Mocks become brittle maintenance burdens

### Why Precise Hashing?

Loose hashing (normalizing whitespace, rounding floats) seems convenient but hides bugs. If a test changes slightly, you want to know about it rather than accidentally getting the wrong cached response.

### Why JSON + SQLite?

- **JSON** - Human readable, diff-friendly, easy to inspect and modify
- **SQLite** - Fast indexed lookups without loading response bodies
- **Hybrid** - Best of both worlds for different use cases

This system provides reliable, fast testing against real AI APIs while maintaining the ability to debug issues when they arise.
@@ -53,31 +53,24 @@ The main points to consider are:

 ```
 llama stack build -h
-usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--distro DISTRIBUTION] [--list-distros] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only]
-                         [--run] [--providers PROVIDERS]
+usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only] [--run]

 Build a Llama stack container

 options:
   -h, --help            show this help message and exit
-  --config CONFIG       Path to a config file to use for the build. You can find example configs in llama_stack.cores/**/build.yaml. If this argument is not provided, you will be prompted to
-                        enter information interactively (default: None)
-  --template TEMPLATE   (deprecated) Name of the example template config to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default:
-                        None)
-  --distro DISTRIBUTION, --distribution DISTRIBUTION
-                        Name of the distribution to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default: None)
-  --list-distros, --list-distributions
-                        Show the available distributions for building a Llama Stack distribution (default: False)
+  --config CONFIG       Path to a config file to use for the build. You can find example configs in llama_stack.cores/**/build.yaml. If this argument is not provided, you will
+                        be prompted to enter information interactively (default: None)
+  --template TEMPLATE   Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates (default: None)
+  --list-templates      Show the available templates for building a Llama Stack distribution (default: False)
   --image-type {container,venv}
                         Image Type to use for the build. If not specified, will use the image type from the template config. (default: None)
   --image-name IMAGE_NAME
-                        [for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if found. (default:
-                        None)
+                        [for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if
+                        found. (default: None)
   --print-deps-only     Print the dependencies for the stack only, without building the stack (default: False)
   --run                 Run the stack after building using the same image type, name, and other applicable arguments (default: False)
-  --providers PROVIDERS
-                        Build a config for a list of providers and only those providers. This list is formatted like: api1=provider1,api2=provider2. Where there can be multiple providers per
-                        API. (default: None)
 ```

 After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.
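For reference, a typical build-and-run sequence with the newer CLI shown above looks like the following (the `starter` distribution and port are examples taken from elsewhere in these docs; the older release uses `--template` instead of `--distro`):

```bash
# Build the starter distribution into the active virtual environment, then launch it.
llama stack build --distro starter --image-type venv
llama stack run starter --port 8321
```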
@@ -1,156 +0,0 @@
# Llama Stack Benchmark Suite on Kubernetes

## Motivation

Performance benchmarking is critical for understanding the overhead and characteristics of the Llama Stack abstraction layer compared to direct inference engines like vLLM.

### Why This Benchmark Suite Exists

**Performance Validation**: The Llama Stack provides a unified API layer across multiple inference providers, but this abstraction introduces potential overhead. This benchmark suite quantifies the performance impact by comparing:
- Llama Stack inference (with vLLM backend)
- Direct vLLM inference calls
- Both under identical Kubernetes deployment conditions

**Production Readiness Assessment**: Real-world deployments require understanding performance characteristics under load. This suite simulates concurrent user scenarios with configurable parameters (duration, concurrency, request patterns) to validate production readiness.

**Regression Detection (TODO)**: As the Llama Stack evolves, this benchmark provides automated regression detection for performance changes. CI/CD pipelines can leverage these benchmarks to catch performance degradations before production deployments.

**Resource Planning**: By measuring throughput, latency percentiles, and resource utilization patterns, teams can make informed decisions about:
- Kubernetes resource allocation (CPU, memory, GPU)
- Auto-scaling configurations
- Cost optimization strategies

### Key Metrics Captured

The benchmark suite measures critical performance indicators:
- **Throughput**: Requests per second under sustained load
- **Latency Distribution**: P50, P95, P99 response times
- **Time to First Token (TTFT)**: Critical for streaming applications
- **Error Rates**: Request failures and timeout analysis

This data enables data-driven architectural decisions and performance optimization efforts.
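As a quick illustration of how latency percentiles like these are derived from raw per-request timings, here is a minimal sketch that mirrors the index-and-clamp approach used in `benchmark.py` further down; it is not the benchmark itself.

```python
# Compute a latency percentile from a list of per-request response times (seconds).
def percentile(samples: list[float], p: float) -> float:
    ordered = sorted(samples)
    idx = max(0, min(int(len(ordered) * p / 100) - 1, len(ordered) - 1))
    return ordered[idx]


response_times = [0.42, 0.51, 0.48, 1.10, 0.47, 0.95, 0.50, 0.49]
for p in (50, 95, 99):
    print(f"P{p}: {percentile(response_times, p):.3f}s")
```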
## Setup

**1. Deploy base k8s infrastructure:**
```bash
cd ../k8s
./apply.sh
```

**2. Deploy benchmark components:**
```bash
cd ../k8s-benchmark
./apply.sh
```

**3. Verify deployment:**
```bash
kubectl get pods
# Should see: llama-stack-benchmark-server, vllm-server, etc.
```

## Quick Start

### Basic Benchmarks

**Benchmark Llama Stack (default):**
```bash
cd docs/source/distributions/k8s-benchmark/
./run-benchmark.sh
```

**Benchmark vLLM direct:**
```bash
./run-benchmark.sh --target vllm
```

### Custom Configuration

**Extended benchmark with high concurrency:**
```bash
./run-benchmark.sh --target vllm --duration 120 --concurrent 20
```

**Short test run:**
```bash
./run-benchmark.sh --target stack --duration 30 --concurrent 5
```

## Command Reference

### run-benchmark.sh Options

```bash
./run-benchmark.sh [options]

Options:
  -t, --target <stack|vllm>     Target to benchmark (default: stack)
  -d, --duration <seconds>      Duration in seconds (default: 60)
  -c, --concurrent <users>      Number of concurrent users (default: 10)
  -h, --help                    Show help message

Examples:
  ./run-benchmark.sh --target vllm        # Benchmark vLLM direct
  ./run-benchmark.sh --target stack       # Benchmark Llama Stack
  ./run-benchmark.sh -t vllm -d 120 -c 20 # vLLM with 120s, 20 users
```

## Local Testing

### Running Benchmark Locally

For local development without Kubernetes:

**1. Start OpenAI mock server:**
```bash
uv run python openai-mock-server.py --port 8080
```

**2. Run benchmark against mock server:**
```bash
uv run python benchmark.py \
  --base-url http://localhost:8080/v1 \
  --model mock-inference \
  --duration 30 \
  --concurrent 5
```

**3. Test against local vLLM server:**
```bash
# If you have vLLM running locally on port 8000
uv run python benchmark.py \
  --base-url http://localhost:8000/v1 \
  --model meta-llama/Llama-3.2-3B-Instruct \
  --duration 30 \
  --concurrent 5
```

**4. Profile the running server:**
```bash
./profile_running_server.sh
```

### OpenAI Mock Server

The `openai-mock-server.py` provides:
- **OpenAI-compatible API** for testing without real models
- **Configurable streaming delay** via `STREAM_DELAY_SECONDS` env var
- **Consistent responses** for reproducible benchmarks
- **Lightweight testing** without GPU requirements

**Mock server usage:**
```bash
uv run python openai-mock-server.py --port 8080
```

The mock server is also deployed in k8s as `openai-mock-service:8080` and can be used by changing the Llama Stack configuration to use the `mock-vllm-inference` provider.
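Before pointing the benchmark at the mock server, a quick sanity check of its OpenAI-style endpoints can be useful (the model name below is arbitrary; the mock echoes whatever model is requested):

```bash
# List the advertised models, then request a single non-streaming chat completion.
curl -s http://localhost:8080/v1/models
curl -s http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "mock-inference", "messages": [{"role": "user", "content": "Hi"}]}'
```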
## Files in this Directory

- `benchmark.py` - Core benchmark script with async streaming support
- `run-benchmark.sh` - Main script with target selection and configuration
- `openai-mock-server.py` - Mock OpenAI API server for local testing
- `README.md` - This documentation file
@@ -1,36 +0,0 @@
#!/usr/bin/env bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Deploys the benchmark-specific components on top of the base k8s deployment (../k8s/apply.sh).

export STREAM_DELAY_SECONDS=0.005

export POSTGRES_USER=llamastack
export POSTGRES_DB=llamastack
export POSTGRES_PASSWORD=llamastack

export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

export MOCK_INFERENCE_MODEL=mock-inference

export MOCK_INFERENCE_URL=openai-mock-service:8080

export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL

set -euo pipefail
set -x

# Deploy benchmark-specific components
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
  --dry-run=client -o yaml > stack-configmap.yaml

kubectl apply --validate=false -f stack-configmap.yaml

# Deploy our custom llama stack server (overriding the base one)
envsubst < stack-k8s.yaml.template | kubectl apply --validate=false -f -
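After the script finishes, one way to confirm the overridden server came up is to query the deployment it creates (names as defined in `stack-k8s.yaml.template` below):

```bash
# Watch the benchmark server pod and tail its logs.
kubectl get pods -l app.kubernetes.io/name=llama-stack-benchmark
kubectl logs -f deployment/llama-stack-benchmark-server
```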
@ -1,267 +0,0 @@
|
||||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
|
||||||
# the root directory of this source tree.
|
|
||||||
|
|
||||||
"""
|
|
||||||
Simple benchmark script for Llama Stack with OpenAI API compatibility.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import asyncio
|
|
||||||
import os
|
|
||||||
import random
|
|
||||||
import statistics
|
|
||||||
import time
|
|
||||||
from typing import Tuple
|
|
||||||
import aiohttp
|
|
||||||
|
|
||||||
|
|
||||||
class BenchmarkStats:
|
|
||||||
def __init__(self):
|
|
||||||
self.response_times = []
|
|
||||||
self.ttft_times = []
|
|
||||||
self.chunks_received = []
|
|
||||||
self.errors = []
|
|
||||||
self.success_count = 0
|
|
||||||
self.total_requests = 0
|
|
||||||
self.concurrent_users = 0
|
|
||||||
self.start_time = None
|
|
||||||
self.end_time = None
|
|
||||||
self._lock = asyncio.Lock()
|
|
||||||
|
|
||||||
async def add_result(self, response_time: float, chunks: int, ttft: float = None, error: str = None):
|
|
||||||
async with self._lock:
|
|
||||||
self.total_requests += 1
|
|
||||||
if error:
|
|
||||||
self.errors.append(error)
|
|
||||||
else:
|
|
||||||
self.success_count += 1
|
|
||||||
self.response_times.append(response_time)
|
|
||||||
self.chunks_received.append(chunks)
|
|
||||||
if ttft is not None:
|
|
||||||
self.ttft_times.append(ttft)
|
|
||||||
|
|
||||||
def print_summary(self):
|
|
||||||
if not self.response_times:
|
|
||||||
print("No successful requests to report")
|
|
||||||
if self.errors:
|
|
||||||
print(f"Total errors: {len(self.errors)}")
|
|
||||||
print("First 5 errors:")
|
|
||||||
for error in self.errors[:5]:
|
|
||||||
print(f" {error}")
|
|
||||||
return
|
|
||||||
|
|
||||||
total_time = self.end_time - self.start_time
|
|
||||||
success_rate = (self.success_count / self.total_requests) * 100
|
|
||||||
|
|
||||||
print(f"\n{'='*60}")
|
|
||||||
print(f"BENCHMARK RESULTS")
|
|
||||||
print(f"{'='*60}")
|
|
||||||
print(f"Total time: {total_time:.2f}s")
|
|
||||||
print(f"Concurrent users: {self.concurrent_users}")
|
|
||||||
print(f"Total requests: {self.total_requests}")
|
|
||||||
print(f"Successful requests: {self.success_count}")
|
|
||||||
print(f"Failed requests: {len(self.errors)}")
|
|
||||||
print(f"Success rate: {success_rate:.1f}%")
|
|
||||||
print(f"Requests per second: {self.success_count / total_time:.2f}")
|
|
||||||
|
|
||||||
print(f"\nResponse Time Statistics:")
|
|
||||||
print(f" Mean: {statistics.mean(self.response_times):.3f}s")
|
|
||||||
print(f" Median: {statistics.median(self.response_times):.3f}s")
|
|
||||||
print(f" Min: {min(self.response_times):.3f}s")
|
|
||||||
print(f" Max: {max(self.response_times):.3f}s")
|
|
||||||
|
|
||||||
if len(self.response_times) > 1:
|
|
||||||
print(f" Std Dev: {statistics.stdev(self.response_times):.3f}s")
|
|
||||||
|
|
||||||
percentiles = [50, 90, 95, 99]
|
|
||||||
sorted_times = sorted(self.response_times)
|
|
||||||
print(f"\nPercentiles:")
|
|
||||||
for p in percentiles:
|
|
||||||
idx = int(len(sorted_times) * p / 100) - 1
|
|
||||||
idx = max(0, min(idx, len(sorted_times) - 1))
|
|
||||||
print(f" P{p}: {sorted_times[idx]:.3f}s")
|
|
||||||
|
|
||||||
if self.ttft_times:
|
|
||||||
print(f"\nTime to First Token (TTFT) Statistics:")
|
|
||||||
print(f" Mean: {statistics.mean(self.ttft_times):.3f}s")
|
|
||||||
print(f" Median: {statistics.median(self.ttft_times):.3f}s")
|
|
||||||
print(f" Min: {min(self.ttft_times):.3f}s")
|
|
||||||
print(f" Max: {max(self.ttft_times):.3f}s")
|
|
||||||
|
|
||||||
if len(self.ttft_times) > 1:
|
|
||||||
print(f" Std Dev: {statistics.stdev(self.ttft_times):.3f}s")
|
|
||||||
|
|
||||||
sorted_ttft = sorted(self.ttft_times)
|
|
||||||
print(f"\nTTFT Percentiles:")
|
|
||||||
for p in percentiles:
|
|
||||||
idx = int(len(sorted_ttft) * p / 100) - 1
|
|
||||||
idx = max(0, min(idx, len(sorted_ttft) - 1))
|
|
||||||
print(f" P{p}: {sorted_ttft[idx]:.3f}s")
|
|
||||||
|
|
||||||
if self.chunks_received:
|
|
||||||
print(f"\nStreaming Statistics:")
|
|
||||||
print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
|
|
||||||
print(f" Total chunks received: {sum(self.chunks_received)}")
|
|
||||||
|
|
||||||
if self.errors:
|
|
||||||
print(f"\nErrors (showing first 5):")
|
|
||||||
for error in self.errors[:5]:
|
|
||||||
print(f" {error}")
|
|
||||||
|
|
||||||
|
|
||||||
class LlamaStackBenchmark:
|
|
||||||
def __init__(self, base_url: str, model_id: str):
|
|
||||||
self.base_url = base_url.rstrip('/')
|
|
||||||
self.model_id = model_id
|
|
||||||
self.headers = {"Content-Type": "application/json"}
|
|
||||||
self.test_messages = [
|
|
||||||
[{"role": "user", "content": "Hi"}],
|
|
||||||
[{"role": "user", "content": "What is the capital of France?"}],
|
|
||||||
[{"role": "user", "content": "Explain quantum physics in simple terms."}],
|
|
||||||
[{"role": "user", "content": "Write a short story about a robot learning to paint."}],
|
|
||||||
[
|
|
||||||
{"role": "user", "content": "What is machine learning?"},
|
|
||||||
{"role": "assistant", "content": "Machine learning is a subset of AI..."},
|
|
||||||
{"role": "user", "content": "Can you give me a practical example?"}
|
|
||||||
]
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
|
|
||||||
"""Make a single async streaming chat completion request."""
|
|
||||||
messages = random.choice(self.test_messages)
|
|
||||||
payload = {
|
|
||||||
"model": self.model_id,
|
|
||||||
"messages": messages,
|
|
||||||
"stream": True,
|
|
||||||
"max_tokens": 100
|
|
||||||
}
|
|
||||||
|
|
||||||
start_time = time.time()
|
|
||||||
chunks_received = 0
|
|
||||||
ttft = None
|
|
||||||
error = None
|
|
||||||
|
|
||||||
session = aiohttp.ClientSession()
|
|
||||||
|
|
||||||
try:
|
|
||||||
async with session.post(
|
|
||||||
f"{self.base_url}/chat/completions",
|
|
||||||
headers=self.headers,
|
|
||||||
json=payload,
|
|
||||||
timeout=aiohttp.ClientTimeout(total=30)
|
|
||||||
) as response:
|
|
||||||
if response.status == 200:
|
|
||||||
async for line in response.content:
|
|
||||||
if line:
|
|
||||||
line_str = line.decode('utf-8').strip()
|
|
||||||
if line_str.startswith('data: '):
|
|
||||||
chunks_received += 1
|
|
||||||
if ttft is None:
|
|
||||||
ttft = time.time() - start_time
|
|
||||||
if line_str == 'data: [DONE]':
|
|
||||||
break
|
|
||||||
|
|
||||||
if chunks_received == 0:
|
|
||||||
error = "No streaming chunks received"
|
|
||||||
else:
|
|
||||||
text = await response.text()
|
|
||||||
error = f"HTTP {response.status}: {text[:100]}"
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
error = f"Request error: {str(e)}"
|
|
||||||
finally:
|
|
||||||
await session.close()
|
|
||||||
|
|
||||||
response_time = time.time() - start_time
|
|
||||||
return response_time, chunks_received, ttft, error
|
|
||||||
|
|
||||||
|
|
||||||
async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
|
|
||||||
"""Run benchmark using async requests for specified duration."""
|
|
||||||
stats = BenchmarkStats()
|
|
||||||
stats.concurrent_users = concurrent_users
|
|
||||||
stats.start_time = time.time()
|
|
||||||
|
|
||||||
print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
|
|
||||||
print(f"Target URL: {self.base_url}/chat/completions")
|
|
||||||
print(f"Model: {self.model_id}")
|
|
||||||
|
|
||||||
connector = aiohttp.TCPConnector(limit=concurrent_users)
|
|
||||||
async with aiohttp.ClientSession(connector=connector) as session:
|
|
||||||
|
|
||||||
async def worker(worker_id: int):
|
|
||||||
"""Worker that sends requests sequentially until canceled."""
|
|
||||||
request_count = 0
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
response_time, chunks, ttft, error = await self.make_async_streaming_request()
|
|
||||||
await stats.add_result(response_time, chunks, ttft, error)
|
|
||||||
request_count += 1
|
|
||||||
|
|
||||||
except asyncio.CancelledError:
|
|
||||||
break
|
|
||||||
except Exception as e:
|
|
||||||
await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")
|
|
||||||
|
|
||||||
# Progress reporting task
|
|
||||||
async def progress_reporter():
|
|
||||||
last_report_time = time.time()
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
await asyncio.sleep(1) # Report every second
|
|
||||||
if time.time() >= last_report_time + 10: # Report every 10 seconds
|
|
||||||
elapsed = time.time() - stats.start_time
|
|
||||||
print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
|
|
||||||
last_report_time = time.time()
|
|
||||||
except asyncio.CancelledError:
|
|
||||||
break
|
|
||||||
|
|
||||||
# Spawn concurrent workers
|
|
||||||
tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
|
|
||||||
progress_task = asyncio.create_task(progress_reporter())
|
|
||||||
tasks.append(progress_task)
|
|
||||||
|
|
||||||
# Wait for duration then cancel all tasks
|
|
||||||
await asyncio.sleep(duration)
|
|
||||||
|
|
||||||
for task in tasks:
|
|
||||||
task.cancel()
|
|
||||||
|
|
||||||
# Wait for all tasks to complete
|
|
||||||
await asyncio.gather(*tasks, return_exceptions=True)
|
|
||||||
|
|
||||||
stats.end_time = time.time()
|
|
||||||
return stats
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
|
|
||||||
parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
|
|
||||||
help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
|
|
||||||
parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
|
|
||||||
help="Model ID to use for requests")
|
|
||||||
parser.add_argument("--duration", type=int, default=60,
|
|
||||||
help="Duration in seconds to run benchmark (default: 60)")
|
|
||||||
parser.add_argument("--concurrent", type=int, default=10,
|
|
||||||
help="Number of concurrent users (default: 10)")
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
benchmark = LlamaStackBenchmark(args.base_url, args.model)
|
|
||||||
|
|
||||||
try:
|
|
||||||
stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent))
|
|
||||||
stats.print_summary()
|
|
||||||
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
print("\nBenchmark interrupted by user")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Benchmark failed: {e}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
|
@ -1,190 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
|
||||||
# the root directory of this source tree.
|
|
||||||
|
|
||||||
"""
|
|
||||||
OpenAI-compatible mock server that returns:
|
|
||||||
- Hardcoded /models response for consistent validation
|
|
||||||
- Valid OpenAI-formatted chat completion responses with dynamic content
|
|
||||||
"""
|
|
||||||
|
|
||||||
from flask import Flask, request, jsonify, Response
|
|
||||||
import time
|
|
||||||
import random
|
|
||||||
import uuid
|
|
||||||
import json
|
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
|
|
||||||
app = Flask(__name__)
|
|
||||||
|
|
||||||
# Models from environment variables
|
|
||||||
def get_models():
|
|
||||||
models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
|
|
||||||
model_ids = [m.strip() for m in models_str.split(",") if m.strip()]
|
|
||||||
|
|
||||||
return {
|
|
||||||
"object": "list",
|
|
||||||
"data": [
|
|
||||||
{
|
|
||||||
"id": model_id,
|
|
||||||
"object": "model",
|
|
||||||
"created": 1234567890,
|
|
||||||
"owned_by": "vllm"
|
|
||||||
}
|
|
||||||
for model_id in model_ids
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
def generate_random_text(length=50):
|
|
||||||
"""Generate random but coherent text for responses."""
|
|
||||||
words = [
|
|
||||||
"Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
|
|
||||||
"with", "your", "questions", "and", "tasks", "today", "Let", "me","know", "what",
|
|
||||||
"you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
|
|
||||||
"with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more"
|
|
||||||
]
|
|
||||||
return " ".join(random.choices(words, k=length))
|
|
||||||
|
|
||||||
@app.route('/v1/models', methods=['GET'])
|
|
||||||
def list_models():
|
|
||||||
models = get_models()
|
|
||||||
print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
|
|
||||||
return jsonify(models)
|
|
||||||
|
|
||||||
@app.route('/v1/chat/completions', methods=['POST'])
|
|
||||||
def chat_completions():
|
|
||||||
"""Return OpenAI-formatted chat completion responses."""
|
|
||||||
data = request.get_json()
|
|
||||||
default_model = get_models()['data'][0]['id']
|
|
||||||
model = data.get('model', default_model)
|
|
||||||
messages = data.get('messages', [])
|
|
||||||
stream = data.get('stream', False)
|
|
||||||
|
|
||||||
print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")
|
|
||||||
|
|
||||||
if stream:
|
|
||||||
return handle_streaming_completion(model, messages)
|
|
||||||
else:
|
|
||||||
return handle_non_streaming_completion(model, messages)
|
|
||||||
|
|
||||||
def handle_non_streaming_completion(model, messages):
|
|
||||||
response_text = generate_random_text(random.randint(20, 80))
|
|
||||||
|
|
||||||
# Calculate realistic token counts
|
|
||||||
prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages)
|
|
||||||
completion_tokens = len(response_text.split())
|
|
||||||
|
|
||||||
response = {
|
|
||||||
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
|
|
||||||
"object": "chat.completion",
|
|
||||||
"created": int(time.time()),
|
|
||||||
"model": model,
|
|
||||||
"choices": [
|
|
||||||
{
|
|
||||||
"index": 0,
|
|
||||||
"message": {
|
|
||||||
"role": "assistant",
|
|
||||||
"content": response_text
|
|
||||||
},
|
|
||||||
"finish_reason": "stop"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"usage": {
|
|
||||||
"prompt_tokens": prompt_tokens,
|
|
||||||
"completion_tokens": completion_tokens,
|
|
||||||
"total_tokens": prompt_tokens + completion_tokens
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return jsonify(response)
|
|
||||||
|
|
||||||
def handle_streaming_completion(model, messages):
|
|
||||||
def generate_stream():
|
|
||||||
# Generate response text
|
|
||||||
full_response = generate_random_text(random.randint(30, 100))
|
|
||||||
words = full_response.split()
|
|
||||||
|
|
||||||
# Send initial chunk
|
|
||||||
initial_chunk = {
|
|
||||||
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
|
|
||||||
"object": "chat.completion.chunk",
|
|
||||||
"created": int(time.time()),
|
|
||||||
"model": model,
|
|
||||||
"choices": [
|
|
||||||
{
|
|
||||||
"index": 0,
|
|
||||||
"delta": {"role": "assistant", "content": ""}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
yield f"data: {json.dumps(initial_chunk)}\n\n"
|
|
||||||
|
|
||||||
# Send word by word
|
|
||||||
for i, word in enumerate(words):
|
|
||||||
chunk = {
|
|
||||||
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
|
|
||||||
"object": "chat.completion.chunk",
|
|
||||||
"created": int(time.time()),
|
|
||||||
"model": model,
|
|
||||||
"choices": [
|
|
||||||
{
|
|
||||||
"index": 0,
|
|
||||||
"delta": {"content": f"{word} " if i < len(words) - 1 else word}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
yield f"data: {json.dumps(chunk)}\n\n"
|
|
||||||
# Configurable delay to simulate realistic streaming
|
|
||||||
stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005"))
|
|
||||||
time.sleep(stream_delay)
|
|
||||||
|
|
||||||
# Send final chunk
|
|
||||||
final_chunk = {
|
|
||||||
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
|
|
||||||
"object": "chat.completion.chunk",
|
|
||||||
"created": int(time.time()),
|
|
||||||
"model": model,
|
|
||||||
"choices": [
|
|
||||||
{
|
|
||||||
"index": 0,
|
|
||||||
"delta": {"content": ""},
|
|
||||||
"finish_reason": "stop"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
yield f"data: {json.dumps(final_chunk)}\n\n"
|
|
||||||
yield "data: [DONE]\n\n"
|
|
||||||
|
|
||||||
return Response(
|
|
||||||
generate_stream(),
|
|
||||||
mimetype='text/event-stream',
|
|
||||||
headers={
|
|
||||||
'Cache-Control': 'no-cache',
|
|
||||||
'Connection': 'keep-alive',
|
|
||||||
'Access-Control-Allow-Origin': '*',
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
@app.route('/health', methods=['GET'])
|
|
||||||
def health():
|
|
||||||
return jsonify({"status": "healthy", "type": "openai-mock"})
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
parser = argparse.ArgumentParser(description='OpenAI-compatible mock server')
|
|
||||||
parser.add_argument('--port', type=int, default=8081,
|
|
||||||
help='Port to run the server on (default: 8081)')
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
port = args.port
|
|
||||||
|
|
||||||
models = get_models()
|
|
||||||
print("Starting OpenAI-compatible mock server...")
|
|
||||||
print(f"- /models endpoint with: {[m['id'] for m in models['data']]}")
|
|
||||||
print("- OpenAI-formatted chat/completion responses with dynamic content")
|
|
||||||
print("- Streaming support with valid SSE format")
|
|
||||||
print(f"- Listening on: http://0.0.0.0:{port}")
|
|
||||||
app.run(host='0.0.0.0', port=port, debug=False)
|
|
|
@ -1,52 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
|
||||||
# the root directory of this source tree.
|
|
||||||
|
|
||||||
# Script to profile an already running Llama Stack server
|
|
||||||
# Usage: ./profile_running_server.sh [duration_seconds] [output_file]
|
|
||||||
|
|
||||||
DURATION=${1:-60} # Default 60 seconds
|
|
||||||
OUTPUT_FILE=${2:-"llama_stack_profile"} # Default output file
|
|
||||||
|
|
||||||
echo "Looking for running Llama Stack server..."
|
|
||||||
|
|
||||||
# Find the server PID
|
|
||||||
SERVER_PID=$(ps aux | grep "llama_stack.core.server.server" | grep -v grep | awk '{print $2}' | head -1)
|
|
||||||
|
|
||||||
|
|
||||||
if [ -z "$SERVER_PID" ]; then
|
|
||||||
echo "Error: No running Llama Stack server found"
|
|
||||||
echo "Please start your server first with:"
|
|
||||||
echo "LLAMA_STACK_LOGGING=\"all=ERROR\" MOCK_INFERENCE_URL=http://localhost:8080 SAFETY_MODEL=llama-guard3:1b uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Found Llama Stack server with PID: $SERVER_PID"
|
|
||||||
|
|
||||||
# Start py-spy profiling
|
|
||||||
echo "Starting py-spy profiling for ${DURATION} seconds..."
|
|
||||||
echo "Output will be saved to: ${OUTPUT_FILE}.svg"
|
|
||||||
echo ""
|
|
||||||
echo "You can now run your load test..."
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# Get the full path to py-spy
|
|
||||||
PYSPY_PATH=$(which py-spy)
|
|
||||||
|
|
||||||
# Check if running as root, if not, use sudo
|
|
||||||
if [ "$EUID" -ne 0 ]; then
|
|
||||||
echo "py-spy requires root permissions on macOS. Running with sudo..."
|
|
||||||
sudo "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
|
|
||||||
else
|
|
||||||
"$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "Profiling completed! Results saved to: ${OUTPUT_FILE}.svg"
|
|
||||||
echo ""
|
|
||||||
echo "To view the flame graph:"
|
|
||||||
echo "open ${OUTPUT_FILE}.svg"
|
|
|
@ -1,148 +0,0 @@
|
||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
|
||||||
# the root directory of this source tree.
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
# Default values
|
|
||||||
TARGET="stack"
|
|
||||||
DURATION=60
|
|
||||||
CONCURRENT=10
|
|
||||||
|
|
||||||
# Parse command line arguments
|
|
||||||
usage() {
|
|
||||||
echo "Usage: $0 [options]"
|
|
||||||
echo "Options:"
|
|
||||||
echo " -t, --target <stack|vllm> Target to benchmark (default: stack)"
|
|
||||||
echo " -d, --duration <seconds> Duration in seconds (default: 60)"
|
|
||||||
echo " -c, --concurrent <users> Number of concurrent users (default: 10)"
|
|
||||||
echo " -h, --help Show this help message"
|
|
||||||
echo ""
|
|
||||||
echo "Examples:"
|
|
||||||
echo " $0 --target vllm # Benchmark vLLM direct"
|
|
||||||
echo " $0 --target stack # Benchmark Llama Stack (default)"
|
|
||||||
echo " $0 -t vllm -d 120 -c 20 # vLLM with 120s duration, 20 users"
|
|
||||||
}
|
|
||||||
|
|
||||||
while [[ $# -gt 0 ]]; do
|
|
||||||
case $1 in
|
|
||||||
-t|--target)
|
|
||||||
TARGET="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
-d|--duration)
|
|
||||||
DURATION="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
-c|--concurrent)
|
|
||||||
CONCURRENT="$2"
|
|
||||||
shift 2
|
|
||||||
;;
|
|
||||||
-h|--help)
|
|
||||||
usage
|
|
||||||
exit 0
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "Unknown option: $1"
|
|
||||||
usage
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
done
|
|
||||||
|
|
||||||
# Validate target
|
|
||||||
if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
|
|
||||||
echo "Error: Target must be 'stack' or 'vllm'"
|
|
||||||
usage
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Set configuration based on target
|
|
||||||
if [[ "$TARGET" == "vllm" ]]; then
|
|
||||||
BASE_URL="http://vllm-server:8000/v1"
|
|
||||||
JOB_NAME="vllm-benchmark-job"
|
|
||||||
echo "Benchmarking vLLM direct..."
|
|
||||||
else
|
|
||||||
BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1"
|
|
||||||
JOB_NAME="stack-benchmark-job"
|
|
||||||
echo "Benchmarking Llama Stack..."
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Configuration:"
|
|
||||||
echo " Target: $TARGET"
|
|
||||||
echo " Base URL: $BASE_URL"
|
|
||||||
echo " Duration: ${DURATION}s"
|
|
||||||
echo " Concurrent users: $CONCURRENT"
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
# Create temporary job yaml
|
|
||||||
TEMP_YAML="/tmp/benchmark-job-temp-$(date +%s).yaml"
|
|
||||||
cat > "$TEMP_YAML" << EOF
|
|
||||||
apiVersion: batch/v1
|
|
||||||
kind: Job
|
|
||||||
metadata:
|
|
||||||
name: $JOB_NAME
|
|
||||||
namespace: default
|
|
||||||
spec:
|
|
||||||
template:
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: benchmark
|
|
||||||
image: python:3.11-slim
|
|
||||||
command: ["/bin/bash"]
|
|
||||||
args:
|
|
||||||
- "-c"
|
|
||||||
- |
|
|
||||||
pip install aiohttp &&
|
|
||||||
python3 /benchmark/benchmark.py \\
|
|
||||||
--base-url $BASE_URL \\
|
|
||||||
--model \${INFERENCE_MODEL} \\
|
|
||||||
--duration $DURATION \\
|
|
||||||
--concurrent $CONCURRENT
|
|
||||||
env:
|
|
||||||
- name: INFERENCE_MODEL
|
|
||||||
value: "meta-llama/Llama-3.2-3B-Instruct"
|
|
||||||
volumeMounts:
|
|
||||||
- name: benchmark-script
|
|
||||||
mountPath: /benchmark
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
memory: "256Mi"
|
|
||||||
cpu: "250m"
|
|
||||||
limits:
|
|
||||||
memory: "512Mi"
|
|
||||||
cpu: "500m"
|
|
||||||
volumes:
|
|
||||||
- name: benchmark-script
|
|
||||||
configMap:
|
|
||||||
name: benchmark-script
|
|
||||||
restartPolicy: Never
|
|
||||||
backoffLimit: 3
|
|
||||||
EOF
|
|
||||||
|
|
||||||
echo "Creating benchmark ConfigMap..."
|
|
||||||
kubectl create configmap benchmark-script \
|
|
||||||
--from-file=benchmark.py=benchmark.py \
|
|
||||||
--dry-run=client -o yaml | kubectl apply -f -
|
|
||||||
|
|
||||||
echo "Cleaning up any existing benchmark job..."
|
|
||||||
kubectl delete job $JOB_NAME 2>/dev/null || true
|
|
||||||
|
|
||||||
echo "Deploying benchmark Job..."
|
|
||||||
kubectl apply -f "$TEMP_YAML"
|
|
||||||
|
|
||||||
echo "Waiting for job to start..."
|
|
||||||
kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=60s
|
|
||||||
|
|
||||||
echo "Following benchmark logs..."
|
|
||||||
kubectl logs -f job/$JOB_NAME
|
|
||||||
|
|
||||||
echo "Job completed. Checking final status..."
|
|
||||||
kubectl get job $JOB_NAME
|
|
||||||
|
|
||||||
# Clean up temporary file
|
|
||||||
rm -f "$TEMP_YAML"
|
|
|
@ -1,133 +0,0 @@
|
||||||
apiVersion: v1
|
|
||||||
data:
|
|
||||||
stack_run_config.yaml: |
|
|
||||||
version: '2'
|
|
||||||
image_name: kubernetes-benchmark-demo
|
|
||||||
apis:
|
|
||||||
- agents
|
|
||||||
- inference
|
|
||||||
- safety
|
|
||||||
- telemetry
|
|
||||||
- tool_runtime
|
|
||||||
- vector_io
|
|
||||||
providers:
|
|
||||||
inference:
|
|
||||||
- provider_id: vllm-inference
|
|
||||||
provider_type: remote::vllm
|
|
||||||
config:
|
|
||||||
url: ${env.VLLM_URL:=http://localhost:8000/v1}
|
|
||||||
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
|
|
||||||
api_token: ${env.VLLM_API_TOKEN:=fake}
|
|
||||||
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
|
|
||||||
- provider_id: vllm-safety
|
|
||||||
provider_type: remote::vllm
|
|
||||||
config:
|
|
||||||
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
|
|
||||||
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
|
|
||||||
api_token: ${env.VLLM_API_TOKEN:=fake}
|
|
||||||
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
|
|
||||||
- provider_id: sentence-transformers
|
|
||||||
provider_type: inline::sentence-transformers
|
|
||||||
config: {}
|
|
||||||
vector_io:
|
|
||||||
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
|
|
||||||
provider_type: remote::chromadb
|
|
||||||
config:
|
|
||||||
url: ${env.CHROMADB_URL:=}
|
|
||||||
kvstore:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
safety:
|
|
||||||
- provider_id: llama-guard
|
|
||||||
provider_type: inline::llama-guard
|
|
||||||
config:
|
|
||||||
excluded_categories: []
|
|
||||||
agents:
|
|
||||||
- provider_id: meta-reference
|
|
||||||
provider_type: inline::meta-reference
|
|
||||||
config:
|
|
||||||
persistence_store:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
responses_store:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
telemetry:
|
|
||||||
- provider_id: meta-reference
|
|
||||||
provider_type: inline::meta-reference
|
|
||||||
config:
|
|
||||||
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
|
|
||||||
sinks: ${env.TELEMETRY_SINKS:=console}
|
|
||||||
tool_runtime:
|
|
||||||
- provider_id: brave-search
|
|
||||||
provider_type: remote::brave-search
|
|
||||||
config:
|
|
||||||
api_key: ${env.BRAVE_SEARCH_API_KEY:+}
|
|
||||||
max_results: 3
|
|
||||||
- provider_id: tavily-search
|
|
||||||
provider_type: remote::tavily-search
|
|
||||||
config:
|
|
||||||
api_key: ${env.TAVILY_SEARCH_API_KEY:+}
|
|
||||||
max_results: 3
|
|
||||||
- provider_id: rag-runtime
|
|
||||||
provider_type: inline::rag-runtime
|
|
||||||
config: {}
|
|
||||||
- provider_id: model-context-protocol
|
|
||||||
provider_type: remote::model-context-protocol
|
|
||||||
config: {}
|
|
||||||
metadata_store:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
table_name: llamastack_kvstore
|
|
||||||
inference_store:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
models:
|
|
||||||
- metadata:
|
|
||||||
embedding_dimension: 384
|
|
||||||
model_id: all-MiniLM-L6-v2
|
|
||||||
provider_id: sentence-transformers
|
|
||||||
model_type: embedding
|
|
||||||
- model_id: ${env.INFERENCE_MODEL}
|
|
||||||
provider_id: vllm-inference
|
|
||||||
model_type: llm
|
|
||||||
- model_id: ${env.SAFETY_MODEL}
|
|
||||||
provider_id: vllm-safety
|
|
||||||
model_type: llm
|
|
||||||
shields:
|
|
||||||
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
|
||||||
vector_dbs: []
|
|
||||||
datasets: []
|
|
||||||
scoring_fns: []
|
|
||||||
benchmarks: []
|
|
||||||
tool_groups:
|
|
||||||
- toolgroup_id: builtin::websearch
|
|
||||||
provider_id: tavily-search
|
|
||||||
- toolgroup_id: builtin::rag
|
|
||||||
provider_id: rag-runtime
|
|
||||||
server:
|
|
||||||
port: 8323
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
creationTimestamp: null
|
|
||||||
name: llama-stack-config
|
|
|
@ -1,83 +0,0 @@
|
||||||
apiVersion: v1
|
|
||||||
kind: PersistentVolumeClaim
|
|
||||||
metadata:
|
|
||||||
name: llama-benchmark-pvc
|
|
||||||
spec:
|
|
||||||
accessModes:
|
|
||||||
- ReadWriteOnce
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: 1Gi
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: llama-stack-benchmark-server
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
app.kubernetes.io/name: llama-stack-benchmark
|
|
||||||
app.kubernetes.io/component: server
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
app.kubernetes.io/name: llama-stack-benchmark
|
|
||||||
app.kubernetes.io/component: server
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: llama-stack-benchmark
|
|
||||||
image: llamastack/distribution-starter:latest
|
|
||||||
imagePullPolicy: Always # since we have specified latest instead of a version
|
|
||||||
env:
|
|
||||||
- name: ENABLE_CHROMADB
|
|
||||||
value: "true"
|
|
||||||
- name: CHROMADB_URL
|
|
||||||
value: http://chromadb.default.svc.cluster.local:6000
|
|
||||||
- name: POSTGRES_HOST
|
|
||||||
value: postgres-server.default.svc.cluster.local
|
|
||||||
- name: POSTGRES_PORT
|
|
||||||
value: "5432"
|
|
||||||
- name: INFERENCE_MODEL
|
|
||||||
value: "${INFERENCE_MODEL}"
|
|
||||||
- name: SAFETY_MODEL
|
|
||||||
value: "${SAFETY_MODEL}"
|
|
||||||
- name: TAVILY_SEARCH_API_KEY
|
|
||||||
value: "${TAVILY_SEARCH_API_KEY}"
|
|
||||||
- name: VLLM_URL
|
|
||||||
value: http://vllm-server.default.svc.cluster.local:8000/v1
|
|
||||||
- name: VLLM_MAX_TOKENS
|
|
||||||
value: "3072"
|
|
||||||
- name: VLLM_SAFETY_URL
|
|
||||||
value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
|
|
||||||
- name: VLLM_TLS_VERIFY
|
|
||||||
value: "false"
|
|
||||||
command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
|
|
||||||
ports:
|
|
||||||
- containerPort: 8323
|
|
||||||
volumeMounts:
|
|
||||||
- name: llama-storage
|
|
||||||
mountPath: /root/.llama
|
|
||||||
- name: llama-config
|
|
||||||
mountPath: /etc/config
|
|
||||||
volumes:
|
|
||||||
- name: llama-storage
|
|
||||||
persistentVolumeClaim:
|
|
||||||
claimName: llama-benchmark-pvc
|
|
||||||
- name: llama-config
|
|
||||||
configMap:
|
|
||||||
name: llama-stack-config
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: llama-stack-benchmark-service
|
|
||||||
spec:
|
|
||||||
selector:
|
|
||||||
app.kubernetes.io/name: llama-stack-benchmark
|
|
||||||
app.kubernetes.io/component: server
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
port: 8323
|
|
||||||
targetPort: 8323
|
|
||||||
type: ClusterIP
|
|
|
@ -1,108 +0,0 @@
|
||||||
version: '2'
|
|
||||||
image_name: kubernetes-benchmark-demo
|
|
||||||
apis:
|
|
||||||
- agents
|
|
||||||
- inference
|
|
||||||
- telemetry
|
|
||||||
- tool_runtime
|
|
||||||
- vector_io
|
|
||||||
providers:
|
|
||||||
inference:
|
|
||||||
- provider_id: vllm-inference
|
|
||||||
provider_type: remote::vllm
|
|
||||||
config:
|
|
||||||
url: ${env.VLLM_URL:=http://localhost:8000/v1}
|
|
||||||
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
|
|
||||||
api_token: ${env.VLLM_API_TOKEN:=fake}
|
|
||||||
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
|
|
||||||
- provider_id: sentence-transformers
|
|
||||||
provider_type: inline::sentence-transformers
|
|
||||||
config: {}
|
|
||||||
vector_io:
|
|
||||||
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
|
|
||||||
provider_type: remote::chromadb
|
|
||||||
config:
|
|
||||||
url: ${env.CHROMADB_URL:=}
|
|
||||||
kvstore:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
agents:
|
|
||||||
- provider_id: meta-reference
|
|
||||||
provider_type: inline::meta-reference
|
|
||||||
config:
|
|
||||||
persistence_store:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
responses_store:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
telemetry:
|
|
||||||
- provider_id: meta-reference
|
|
||||||
provider_type: inline::meta-reference
|
|
||||||
config:
|
|
||||||
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
|
|
||||||
sinks: ${env.TELEMETRY_SINKS:=console}
|
|
||||||
tool_runtime:
|
|
||||||
- provider_id: brave-search
|
|
||||||
provider_type: remote::brave-search
|
|
||||||
config:
|
|
||||||
api_key: ${env.BRAVE_SEARCH_API_KEY:+}
|
|
||||||
max_results: 3
|
|
||||||
- provider_id: tavily-search
|
|
||||||
provider_type: remote::tavily-search
|
|
||||||
config:
|
|
||||||
api_key: ${env.TAVILY_SEARCH_API_KEY:+}
|
|
||||||
max_results: 3
|
|
||||||
- provider_id: rag-runtime
|
|
||||||
provider_type: inline::rag-runtime
|
|
||||||
config: {}
|
|
||||||
- provider_id: model-context-protocol
|
|
||||||
provider_type: remote::model-context-protocol
|
|
||||||
config: {}
|
|
||||||
metadata_store:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
table_name: llamastack_kvstore
|
|
||||||
inference_store:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
models:
|
|
||||||
- metadata:
|
|
||||||
embedding_dimension: 384
|
|
||||||
model_id: all-MiniLM-L6-v2
|
|
||||||
provider_id: sentence-transformers
|
|
||||||
model_type: embedding
|
|
||||||
- model_id: ${env.INFERENCE_MODEL}
|
|
||||||
provider_id: vllm-inference
|
|
||||||
model_type: llm
|
|
||||||
vector_dbs: []
|
|
||||||
datasets: []
|
|
||||||
scoring_fns: []
|
|
||||||
benchmarks: []
|
|
||||||
tool_groups:
|
|
||||||
- toolgroup_id: builtin::websearch
|
|
||||||
provider_id: tavily-search
|
|
||||||
- toolgroup_id: builtin::rag
|
|
||||||
provider_id: rag-runtime
|
|
||||||
server:
|
|
||||||
port: 8323
|
|
|
@@ -40,19 +40,19 @@ spec:
           value: "3072"
         - name: VLLM_SAFETY_URL
           value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
-        - name: VLLM_TLS_VERIFY
-          value: "false"
         - name: POSTGRES_HOST
           value: postgres-server.default.svc.cluster.local
         - name: POSTGRES_PORT
           value: "5432"
+        - name: VLLM_TLS_VERIFY
+          value: "false"
         - name: INFERENCE_MODEL
           value: "${INFERENCE_MODEL}"
         - name: SAFETY_MODEL
           value: "${SAFETY_MODEL}"
         - name: TAVILY_SEARCH_API_KEY
           value: "${TAVILY_SEARCH_API_KEY}"
-        command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8321"]
+        command: ["python", "-m", "llama_stack.core.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]
         ports:
           - containerPort: 8321
         volumeMounts:
@@ -56,12 +56,12 @@ Breaking down the demo app, this section will show the core pieces that are used

 ### Setup Remote Inferencing
 Start a Llama Stack server on localhost. Here is an example of how you can do this using the firework.ai distribution:
 ```
-uv venv starter --python 3.12
-source starter/bin/activate  # On Windows: starter\Scripts\activate
+python -m venv stack-fireworks
+source stack-fireworks/bin/activate  # On Windows: stack-fireworks\Scripts\activate
 pip install --no-cache llama-stack==0.2.2
-llama stack build --distro starter --image-type venv
+llama stack build --distro fireworks --image-type venv
 export FIREWORKS_API_KEY=<SOME_KEY>
-llama stack run starter --port 5050
+llama stack run fireworks --port 5050
 ```

 Ensure the Llama Stack server version is the same as the Kotlin SDK Library for maximum compatibility.
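Optionally, before wiring up the Kotlin client, the server can be verified from the host with the Python client CLI (assumes `llama-stack-client` is installed; the endpoint matches the port used above):

```bash
# Point the CLI at the local server and list the models it serves.
llama-stack-client configure --endpoint http://localhost:5050
llama-stack-client models list
```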
@@ -157,7 +157,7 @@ docker run \
 If you've set up your local development environment, you can also build the image using your local virtual environment.

 ```bash
-INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
 llama stack build --distro nvidia --image-type venv
 llama stack run ./run.yaml \
   --port 8321 \
@@ -52,16 +52,11 @@ agent = Agent(
 prompt = "How do you do great work?"
 print("prompt>", prompt)

-use_stream = True
 response = agent.create_turn(
     messages=[{"role": "user", "content": prompt}],
     session_id=agent.create_session("rag_session"),
-    stream=use_stream,
+    stream=True,
 )

-# Only call `AgentEventLogger().log(response)` for streaming responses.
-if use_stream:
-    for log in AgentEventLogger().log(response):
-        log.print()
-else:
-    print(response)
+for log in AgentEventLogger().log(response):
+    log.print()
@@ -150,7 +150,13 @@ pip install llama-stack-client
 ```
 :::

+:::{tab-item} Install with `venv`
+```bash
+python -m venv stack-client
+source stack-client/bin/activate  # On Windows: stack-client\Scripts\activate
+pip install llama-stack-client
+```
+:::
 ::::

 Now let's use the `llama-stack-client` [CLI](../references/llama_stack_client_cli_reference.md) to check the
@@ -2,15 +2,6 @@

 ## Overview

-Agents API for creating and interacting with agentic systems.
-
-Main functionalities provided by this API:
-- Create agents with specific instructions and ability to use tools.
-- Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
-- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
-- Agents can be provided with various shields (see the Safety API for more details).
-- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.
-
 This section contains documentation for all available providers for the **agents** API.

 ## Providers
@@ -1,21 +0,0 @@
# Batches

## Overview

Protocol for batch processing API operations.

The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.

Note: This API is currently under active development and may undergo changes.

This section contains documentation for all available providers for the **batches** API.

## Providers

```{toctree}
:maxdepth: 1

inline_reference
```
@@ -1,23 +0,0 @@
# inline::reference

## Description

Reference implementation of batches API with KVStore persistence.

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Configuration for the key-value store backend. |
| `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
| `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. |

## Sample Configuration

```yaml
kvstore:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/batches.db

```
@@ -2,8 +2,6 @@

 ## Overview

-Llama Stack Evaluation API for running evaluations on model and agent candidates.
-
 This section contains documentation for all available providers for the **eval** API.

 ## Providers
@@ -226,7 +226,7 @@ uv init
 name = "llama-stack-provider-ollama"
 version = "0.1.0"
 description = "Ollama provider for Llama Stack"
-requires-python = ">=3.12"
+requires-python = ">=3.10"
 dependencies = ["llama-stack", "pydantic", "ollama", "aiohttp"]
 ```
@@ -8,7 +8,7 @@ Local filesystem-based file storage provider for managing files and documents lo

 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
-| `storage_dir` | `<class 'str'>` | No | | Directory to store uploaded files |
+| `storage_dir` | `<class 'str'>` | No | PydanticUndefined | Directory to store uploaded files |
 | `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata |
 | `ttl_secs` | `<class 'int'>` | No | 31536000 | |
@@ -2,12 +2,6 @@

 ## Overview

-Llama Stack Inference API for generating completions, chat completions, and embeddings.
-
-This API provides the raw interface to the underlying models. Two kinds of models are supported:
-- LLM models: these models generate "raw" and "chat" (conversational) completions.
-- Embedding models: these models generate embeddings to be used for semantic search.
-
 This section contains documentation for all available providers for the **inference** API.

 ## Providers

@@ -35,7 +29,6 @@ remote_runpod
 remote_sambanova
 remote_tgi
 remote_together
-remote_vertexai
 remote_vllm
 remote_watsonx
 ```
@@ -8,7 +8,7 @@ HuggingFace Inference Endpoints provider for dedicated model serving.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
-| `endpoint_name` | `<class 'str'>` | No | | The name of the Hugging Face Inference Endpoint in the format of '{namespace}/{endpoint_name}' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. |
+| `endpoint_name` | `<class 'str'>` | No | PydanticUndefined | The name of the Hugging Face Inference Endpoint in the format of '{namespace}/{endpoint_name}' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. |
| `api_token` | `pydantic.types.SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |

## Sample Configuration
@@ -8,7 +8,7 @@ HuggingFace Inference API serverless provider for on-demand model inference.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
-| `huggingface_repo` | `<class 'str'>` | No | | The model ID of the model on the Hugging Face Hub (e.g. 'meta-llama/Meta-Llama-3.1-70B-Instruct') |
+| `huggingface_repo` | `<class 'str'>` | No | PydanticUndefined | The model ID of the model on the Hugging Face Hub (e.g. 'meta-llama/Meta-Llama-3.1-70B-Instruct') |
| `api_token` | `pydantic.types.SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |

## Sample Configuration
@@ -8,7 +8,7 @@ Text Generation Inference (TGI) provider for HuggingFace model serving.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
-| `url` | `<class 'str'>` | No | | The URL for the TGI serving endpoint |
+| `url` | `<class 'str'>` | No | PydanticUndefined | The URL for the TGI serving endpoint |

## Sample Configuration
@@ -1,40 +0,0 @@
# remote::vertexai

## Description

Google Vertex AI inference provider enables you to use Google's Gemini models through Google Cloud's Vertex AI platform, providing several advantages:

• Enterprise-grade security: Uses Google Cloud's security controls and IAM
• Better integration: Seamless integration with other Google Cloud services
• Advanced features: Access to additional Vertex AI features like model tuning and monitoring
• Authentication: Uses Google Cloud Application Default Credentials (ADC) instead of API keys

Configuration:
- Set VERTEX_AI_PROJECT environment variable (required)
- Set VERTEX_AI_LOCATION environment variable (optional, defaults to us-central1)
- Use Google Cloud Application Default Credentials or service account key

Authentication Setup:
Option 1 (Recommended): gcloud auth application-default login
Option 2: Set GOOGLE_APPLICATION_CREDENTIALS to service account key path

Available Models:
- vertex_ai/gemini-2.0-flash
- vertex_ai/gemini-2.5-flash
- vertex_ai/gemini-2.5-pro

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `project` | `<class 'str'>` | No | | Google Cloud project ID for Vertex AI |
| `location` | `<class 'str'>` | No | us-central1 | Google Cloud location for Vertex AI |

## Sample Configuration

```yaml
project: ${env.VERTEX_AI_PROJECT:=}
location: ${env.VERTEX_AI_LOCATION:=us-central1}

```
@@ -27,7 +27,7 @@ HuggingFace-based post-training provider for fine-tuning models using the Huggin
| `dpo_beta` | `<class 'float'>` | No | 0.1 | |
| `use_reference_model` | `<class 'bool'>` | No | True | |
| `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair'` | No | sigmoid | |
-| `dpo_output_dir` | `<class 'str'>` | No | | |
+| `dpo_output_dir` | `<class 'str'>` | No | ./checkpoints/dpo | |

## Sample Configuration
@@ -35,7 +35,6 @@ HuggingFace-based post-training provider for fine-tuning models using the Huggin
checkpoint_format: huggingface
distributed_backend: null
device: cpu
-dpo_output_dir: ~/.llama/dummy/dpo_output

```
@@ -41,7 +41,7 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
-| `db_path` | `<class 'str'>` | No | | |
+| `db_path` | `<class 'str'>` | No | PydanticUndefined | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |

## Sample Configuration
@@ -12,18 +12,6 @@ That means you'll get fast and efficient vector retrieval.
- Lightweight and easy to use
- Fully integrated with Llama Stack
- GPU support
-- **Vector search** - FAISS supports pure vector similarity search using embeddings
-
-## Search Modes
-
-**Supported:**
-- **Vector Search** (`mode="vector"`): Performs vector similarity search using embeddings
-
-**Not Supported:**
-- **Keyword Search** (`mode="keyword"`): Not supported by FAISS
-- **Hybrid Search** (`mode="hybrid"`): Not supported by FAISS
-
-> **Note**: FAISS is designed as a pure vector similarity search library. See the [FAISS GitHub repository](https://github.com/facebookresearch/faiss) for more details about FAISS's core functionality.

## Usage
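For reference, a minimal usage sketch of vector-mode search against a FAISS-backed store, mirroring the `client.vector_stores.search` examples used for other vector_io providers in these docs. The `client` and `vector_store` objects are assumed to exist already; this is illustrative, not part of the diff.

```python
# Sketch only: vector similarity search against a FAISS-backed vector store.
# Assumes `client` is an initialized LlamaStackClient and `vector_store` was
# created earlier; keyword and hybrid modes are not supported by this provider.
search_response = client.vector_stores.search(
    vector_store_id=vector_store.id,
    query="What is machine learning?",
    search_mode="vector",  # the only mode FAISS supports
    max_num_results=5,
)
print(search_response)
```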
@@ -21,7 +21,5 @@ kvstore:

## Deprecation Notice

-```{warning}
-Please use the `inline::faiss` provider instead.
-```
+⚠️ **Warning**: Please use the `inline::faiss` provider instead.
@@ -10,7 +10,7 @@ Please refer to the remote provider documentation.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
-| `db_path` | `<class 'str'>` | No | | |
+| `db_path` | `<class 'str'>` | No | PydanticUndefined | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
@@ -50,7 +50,7 @@ See the [Qdrant documentation](https://qdrant.tech/documentation/) for more deta

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
-| `path` | `<class 'str'>` | No | | |
+| `path` | `<class 'str'>` | No | PydanticUndefined | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |

## Sample Configuration
@@ -205,7 +205,7 @@ See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) f

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
-| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file |
+| `db_path` | `<class 'str'>` | No | PydanticUndefined | Path to the SQLite database file |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |

## Sample Configuration
@@ -10,7 +10,7 @@ Please refer to the sqlite-vec provider documentation.

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
-| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file |
+| `db_path` | `<class 'str'>` | No | PydanticUndefined | Path to the SQLite database file |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |

## Sample Configuration
@@ -25,7 +25,5 @@ kvstore:

## Deprecation Notice

-```{warning}
-Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.
-```
+⚠️ **Warning**: Please use the `inline::sqlite-vec` provider (notice the hyphen instead of underscore) instead.
@@ -40,7 +40,7 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
-| `url` | `str \| None` | No | | |
+| `url` | `str \| None` | No | PydanticUndefined | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |

## Sample Configuration
@@ -11,7 +11,6 @@ That means you're not limited to storing vectors in memory or in a separate serv

- Easy to use
- Fully integrated with Llama Stack
-- Supports all search modes: vector, keyword, and hybrid search (both inline and remote configurations)

## Usage
@@ -102,92 +101,6 @@ vector_io:
- **`client_pem_path`**: Path to the **client certificate** file (required for mTLS).
- **`client_key_path`**: Path to the **client private key** file (required for mTLS).
-
-## Search Modes
-
-Milvus supports three different search modes for both inline and remote configurations:
-
-### Vector Search
-Vector search uses semantic similarity to find the most relevant chunks based on embedding vectors. This is the default search mode and works well for finding conceptually similar content.
-
-```python
-# Vector search example
-search_response = client.vector_stores.search(
-    vector_store_id=vector_store.id,
-    query="What is machine learning?",
-    search_mode="vector",
-    max_num_results=5,
-)
-```
-
-### Keyword Search
-Keyword search uses traditional text-based matching to find chunks containing specific terms or phrases. This is useful when you need exact term matches.
-
-```python
-# Keyword search example
-search_response = client.vector_stores.search(
-    vector_store_id=vector_store.id,
-    query="Python programming language",
-    search_mode="keyword",
-    max_num_results=5,
-)
-```
-
-### Hybrid Search
-Hybrid search combines both vector and keyword search methods to provide more comprehensive results. It leverages the strengths of both semantic similarity and exact term matching.
-
-#### Basic Hybrid Search
-```python
-# Basic hybrid search example (uses RRF ranker with default impact_factor=60.0)
-search_response = client.vector_stores.search(
-    vector_store_id=vector_store.id,
-    query="neural networks in Python",
-    search_mode="hybrid",
-    max_num_results=5,
-)
-```
-
-**Note**: The default `impact_factor` value of 60.0 was empirically determined to be optimal in the original RRF research paper: ["Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) (Cormack et al., 2009).
-
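For intuition, a small self-contained sketch of how an RRF-style fusion score can be computed. This is illustrative only and is not Milvus's implementation; `rrf_fuse` and its arguments are hypothetical names, with `impact_factor` playing the role of k in 1 / (k + rank).

```python
# Illustrative RRF sketch: fuse two ranked lists of document ids.
def rrf_fuse(
    vector_ranked: list[str],
    keyword_ranked: list[str],
    impact_factor: float = 60.0,
) -> list[str]:
    scores: dict[str, float] = {}
    for ranked in (vector_ranked, keyword_ranked):
        for rank, doc_id in enumerate(ranked, start=1):
            # Each list contributes 1 / (impact_factor + rank) for a document.
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (impact_factor + rank)
    return sorted(scores, key=scores.get, reverse=True)

# Example: a document ranked 2nd by vector search and 3rd by keyword search
# scores 1/62 + 1/63 ≈ 0.0320 with the default impact_factor of 60.0.
```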
-#### Hybrid Search with RRF (Reciprocal Rank Fusion) Ranker
-RRF combines rankings from vector and keyword search by using reciprocal ranks. The impact factor controls how much weight is given to higher-ranked results.
-
-```python
-# Hybrid search with custom RRF parameters
-search_response = client.vector_stores.search(
-    vector_store_id=vector_store.id,
-    query="neural networks in Python",
-    search_mode="hybrid",
-    max_num_results=5,
-    ranking_options={
-        "ranker": {
-            "type": "rrf",
-            "impact_factor": 100.0,  # Higher values give more weight to top-ranked results
-        }
-    },
-)
-```
-
-#### Hybrid Search with Weighted Ranker
-Weighted ranker linearly combines normalized scores from vector and keyword search. The alpha parameter controls the balance between the two search methods.
-
-```python
-# Hybrid search with weighted ranker
-search_response = client.vector_stores.search(
-    vector_store_id=vector_store.id,
-    query="neural networks in Python",
-    search_mode="hybrid",
-    max_num_results=5,
-    ranking_options={
-        "ranker": {
-            "type": "weighted",
-            "alpha": 0.7,  # 70% vector search, 30% keyword search
-        }
-    },
-)
-```
-
-For detailed documentation on RRF and Weighted rankers, please refer to the [Milvus Reranking Guide](https://milvus.io/docs/reranking.md).
-
## Documentation
See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.
@@ -198,16 +111,13 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
-| `uri` | `<class 'str'>` | No | | The URI of the Milvus server |
-| `token` | `str \| None` | No | | The token of the Milvus server |
+| `uri` | `<class 'str'>` | No | PydanticUndefined | The URI of the Milvus server |
+| `token` | `str \| None` | No | PydanticUndefined | The token of the Milvus server |
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
| `config` | `dict` | No | {} | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |

-```{note}
-This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.
-```
+> **Note**: This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.

## Sample Configuration
@@ -19,7 +19,7 @@ You have two ways to install Llama Stack:
cd ~/local
git clone git@github.com:meta-llama/llama-stack.git

-uv venv myenv --python 3.12
+python -m venv myenv
source myenv/bin/activate # On Windows: myenv\Scripts\activate

cd llama-stack
@@ -128,9 +128,7 @@ llama download --source huggingface --model-id Prompt-Guard-86M --ignore-pattern

**Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).

-```{tip}
-Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
-```
+> **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.

## List the downloaded models
@@ -19,7 +19,7 @@ You have two ways to install Llama Stack:
cd ~/local
git clone git@github.com:meta-llama/llama-stack.git

-uv venv myenv --python 3.12
+python -m venv myenv
source myenv/bin/activate # On Windows: myenv\Scripts\activate

cd llama-stack
@@ -152,9 +152,7 @@ llama download --source huggingface --model-id Prompt-Guard-86M --ignore-pattern

**Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).

-```{tip}
-Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
-```
+> **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.

## List the downloaded models
@@ -706,7 +706,6 @@ class Agents(Protocol):
        temperature: float | None = None,
        text: OpenAIResponseText | None = None,
        tools: list[OpenAIResponseInputTool] | None = None,
-        include: list[str] | None = None,
        max_infer_iters: int | None = 10,  # this is an extension to the OpenAI API
    ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
        """Create a new OpenAI response.

@@ -714,7 +713,6 @@ class Agents(Protocol):
        :param input: Input message(s) to create the response.
        :param model: The underlying LLM used for completions.
        :param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
-        :param include: (Optional) Additional fields to include in the response.
        :returns: An OpenAIResponseObject.
        """
        ...
@@ -170,23 +170,6 @@ class OpenAIResponseOutputMessageWebSearchToolCall(BaseModel):
    type: Literal["web_search_call"] = "web_search_call"


-class OpenAIResponseOutputMessageFileSearchToolCallResults(BaseModel):
-    """Search results returned by the file search operation.
-
-    :param attributes: (Optional) Key-value attributes associated with the file
-    :param file_id: Unique identifier of the file containing the result
-    :param filename: Name of the file containing the result
-    :param score: Relevance score for this search result (between 0 and 1)
-    :param text: Text content of the search result
-    """
-
-    attributes: dict[str, Any]
-    file_id: str
-    filename: str
-    score: float
-    text: str
-
-
@json_schema_type
class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
    """File search tool call output message for OpenAI responses.
@@ -202,7 +185,7 @@ class OpenAIResponseOutputMessageFileSearchToolCall(BaseModel):
    queries: list[str]
    status: str
    type: Literal["file_search_call"] = "file_search_call"
-    results: list[OpenAIResponseOutputMessageFileSearchToolCallResults] | None = None
+    results: list[dict[str, Any]] | None = None


@json_schema_type
@@ -623,62 +606,6 @@ class OpenAIResponseObjectStreamResponseMcpCallCompleted(BaseModel):
    type: Literal["response.mcp_call.completed"] = "response.mcp_call.completed"


-@json_schema_type
-class OpenAIResponseContentPartOutputText(BaseModel):
-    type: Literal["output_text"] = "output_text"
-    text: str
-    # TODO: add annotations, logprobs, etc.
-
-
-@json_schema_type
-class OpenAIResponseContentPartRefusal(BaseModel):
-    type: Literal["refusal"] = "refusal"
-    refusal: str
-
-
-OpenAIResponseContentPart = Annotated[
-    OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal,
-    Field(discriminator="type"),
-]
-register_schema(OpenAIResponseContentPart, name="OpenAIResponseContentPart")
-
-
-@json_schema_type
-class OpenAIResponseObjectStreamResponseContentPartAdded(BaseModel):
-    """Streaming event for when a new content part is added to a response item.
-
-    :param response_id: Unique identifier of the response containing this content
-    :param item_id: Unique identifier of the output item containing this content part
-    :param part: The content part that was added
-    :param sequence_number: Sequential number for ordering streaming events
-    :param type: Event type identifier, always "response.content_part.added"
-    """
-
-    response_id: str
-    item_id: str
-    part: OpenAIResponseContentPart
-    sequence_number: int
-    type: Literal["response.content_part.added"] = "response.content_part.added"
-
-
-@json_schema_type
-class OpenAIResponseObjectStreamResponseContentPartDone(BaseModel):
-    """Streaming event for when a content part is completed.
-
-    :param response_id: Unique identifier of the response containing this content
-    :param item_id: Unique identifier of the output item containing this content part
-    :param part: The completed content part
-    :param sequence_number: Sequential number for ordering streaming events
-    :param type: Event type identifier, always "response.content_part.done"
-    """
-
-    response_id: str
-    item_id: str
-    part: OpenAIResponseContentPart
-    sequence_number: int
-    type: Literal["response.content_part.done"] = "response.content_part.done"
-
-
OpenAIResponseObjectStream = Annotated[
    OpenAIResponseObjectStreamResponseCreated
    | OpenAIResponseObjectStreamResponseOutputItemAdded
@@ -698,8 +625,6 @@ OpenAIResponseObjectStream = Annotated[
    | OpenAIResponseObjectStreamResponseMcpCallInProgress
    | OpenAIResponseObjectStreamResponseMcpCallFailed
    | OpenAIResponseObjectStreamResponseMcpCallCompleted
-    | OpenAIResponseObjectStreamResponseContentPartAdded
-    | OpenAIResponseObjectStreamResponseContentPartDone
    | OpenAIResponseObjectStreamResponseCompleted,
    Field(discriminator="type"),
]
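As an illustration of how the discriminated stream union above is consumed, a small sketch that dispatches on the `type` field, including the content-part events that exist on the main branch. The `stream` argument and the `"response.completed"` literal are assumptions; this is not code from the diff.

```python
# Illustrative sketch: handling streamed response events by their "type" discriminator.
async def print_stream(stream):
    async for event in stream:
        if event.type == "response.content_part.added":
            print(f"[part added] item={event.item_id}")
        elif event.type == "response.content_part.done":
            # event.part is either an output_text or a refusal content part
            if event.part.type == "output_text":
                print(event.part.text)
            else:
                print(f"[refusal] {event.part.refusal}")
        elif event.type == "response.completed":  # assumed completion event type
            print("[done]")
```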
@@ -1,9 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .batches import Batches, BatchObject, ListBatchesResponse

__all__ = ["Batches", "BatchObject", "ListBatchesResponse"]
@@ -1,89 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Literal, Protocol, runtime_checkable

from pydantic import BaseModel, Field

from llama_stack.schema_utils import json_schema_type, webmethod

try:
    from openai.types import Batch as BatchObject
except ImportError as e:
    raise ImportError("OpenAI package is required for batches API. Please install it with: pip install openai") from e


@json_schema_type
class ListBatchesResponse(BaseModel):
    """Response containing a list of batch objects."""

    object: Literal["list"] = "list"
    data: list[BatchObject] = Field(..., description="List of batch objects")
    first_id: str | None = Field(default=None, description="ID of the first batch in the list")
    last_id: str | None = Field(default=None, description="ID of the last batch in the list")
    has_more: bool = Field(default=False, description="Whether there are more batches available")


@runtime_checkable
class Batches(Protocol):
    """Protocol for batch processing API operations.

    The Batches API enables efficient processing of multiple requests in a single operation,
    particularly useful for processing large datasets, batch evaluation workflows, and
    cost-effective inference at scale.

    Note: This API is currently under active development and may undergo changes.
    """

    @webmethod(route="/openai/v1/batches", method="POST")
    async def create_batch(
        self,
        input_file_id: str,
        endpoint: str,
        completion_window: Literal["24h"],
        metadata: dict[str, str] | None = None,
    ) -> BatchObject:
        """Create a new batch for processing multiple API requests.

        :param input_file_id: The ID of an uploaded file containing requests for the batch.
        :param endpoint: The endpoint to be used for all requests in the batch.
        :param completion_window: The time window within which the batch should be processed.
        :param metadata: Optional metadata for the batch.
        :returns: The created batch object.
        """
        ...

    @webmethod(route="/openai/v1/batches/{batch_id}", method="GET")
    async def retrieve_batch(self, batch_id: str) -> BatchObject:
        """Retrieve information about a specific batch.

        :param batch_id: The ID of the batch to retrieve.
        :returns: The batch object.
        """
        ...

    @webmethod(route="/openai/v1/batches/{batch_id}/cancel", method="POST")
    async def cancel_batch(self, batch_id: str) -> BatchObject:
        """Cancel a batch that is in progress.

        :param batch_id: The ID of the batch to cancel.
        :returns: The updated batch object.
        """
        ...

    @webmethod(route="/openai/v1/batches", method="GET")
    async def list_batches(
        self,
        after: str | None = None,
        limit: int = 20,
    ) -> ListBatchesResponse:
        """List all batches for the current user.

        :param after: A cursor for pagination; returns batches after this batch ID.
        :param limit: Number of batches to return (default 20, max 100).
        :returns: A list of batch objects.
        """
        ...
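For context, a minimal sketch of exercising the batches routes above through an OpenAI-compatible client. The base URL, file name, and metadata are assumptions; the client calls themselves are standard `openai` library methods.

```python
# Illustrative sketch only: submit a batch of pre-built requests and poll it.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# Upload a JSONL file of chat-completion requests, then batch it.
batch_input = client.files.create(file=open("requests.jsonl", "rb"), purpose="batch")
batch = client.batches.create(
    input_file_id=batch_input.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"job": "nightly-eval"},  # optional, free-form labels
)
print(client.batches.retrieve(batch.id).status)
```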
@@ -10,16 +10,6 @@
# 3. All classes should propogate the inherited __init__ function otherwise via 'super().__init__(message)'


-class ResourceNotFoundError(ValueError):
-    """generic exception for a missing Llama Stack resource"""
-
-    def __init__(self, resource_name: str, resource_type: str, client_list: str) -> None:
-        message = (
-            f"{resource_type} '{resource_name}' not found. Use '{client_list}' to list available {resource_type}s."
-        )
-        super().__init__(message)
-
-
class UnsupportedModelError(ValueError):
    """raised when model is not present in the list of supported models"""
@@ -28,32 +18,38 @@ class UnsupportedModelError(ValueError):
        super().__init__(message)


-class ModelNotFoundError(ResourceNotFoundError):
+class ModelNotFoundError(ValueError):
    """raised when Llama Stack cannot find a referenced model"""

    def __init__(self, model_name: str) -> None:
-        super().__init__(model_name, "Model", "client.models.list()")
+        message = f"Model '{model_name}' not found. Use client.models.list() to list available models."
+        super().__init__(message)


-class VectorStoreNotFoundError(ResourceNotFoundError):
+class VectorStoreNotFoundError(ValueError):
    """raised when Llama Stack cannot find a referenced vector store"""

    def __init__(self, vector_store_name: str) -> None:
-        super().__init__(vector_store_name, "Vector Store", "client.vector_dbs.list()")
+        message = f"Vector store '{vector_store_name}' not found. Use client.vector_dbs.list() to list available vector stores."
+        super().__init__(message)


-class DatasetNotFoundError(ResourceNotFoundError):
+class DatasetNotFoundError(ValueError):
    """raised when Llama Stack cannot find a referenced dataset"""

    def __init__(self, dataset_name: str) -> None:
-        super().__init__(dataset_name, "Dataset", "client.datasets.list()")
+        message = f"Dataset '{dataset_name}' not found. Use client.datasets.list() to list available datasets."
+        super().__init__(message)


-class ToolGroupNotFoundError(ResourceNotFoundError):
+class ToolGroupNotFoundError(ValueError):
    """raised when Llama Stack cannot find a referenced tool group"""

    def __init__(self, toolgroup_name: str) -> None:
-        super().__init__(toolgroup_name, "Tool Group", "client.toolgroups.list()")
+        message = (
+            f"Tool group '{toolgroup_name}' not found. Use client.toolgroups.list() to list available tool groups."
+        )
+        super().__init__(message)


class SessionNotFoundError(ValueError):
@@ -62,20 +58,3 @@ class SessionNotFoundError(ValueError):
    def __init__(self, session_name: str) -> None:
        message = f"Session '{session_name}' not found or access denied."
        super().__init__(message)
-
-
-class ModelTypeError(TypeError):
-    """raised when a model is present but not the correct type"""
-
-    def __init__(self, model_name: str, model_type: str, expected_model_type: str) -> None:
-        message = (
-            f"Model '{model_name}' is of type '{model_type}' rather than the expected type '{expected_model_type}'"
-        )
-        super().__init__(message)
-
-
-class ConflictError(ValueError):
-    """raised when an operation cannot be performed due to a conflict with the current state"""
-
-    def __init__(self, message: str) -> None:
-        super().__init__(message)
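To show what the shared `ResourceNotFoundError` base (present on the removed side of this diff) buys callers, a small sketch: one except clause can cover every "missing resource" error. The import path and the lookup helper are assumptions.

```python
# Illustrative sketch assuming the main-branch error hierarchy.
from llama_stack.apis.common.errors import ModelNotFoundError, ResourceNotFoundError  # assumed path

def lookup(model_id: str) -> None:
    try:
        raise ModelNotFoundError(model_id)  # stand-in for a real lookup failure
    except ResourceNotFoundError as err:
        # Any missing model, dataset, vector store, or tool group lands here.
        print(f"recoverable: {err}")

lookup("llama-3-missing")
# -> recoverable: Model 'llama-3-missing' not found. Use 'client.models.list()' to list available Models.
```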
@@ -86,7 +86,6 @@ class Api(Enum, metaclass=DynamicApiMeta):
    :cvar inference: Text generation, chat completions, and embeddings
    :cvar safety: Content moderation and safety shields
    :cvar agents: Agent orchestration and execution
-    :cvar batches: Batch processing for asynchronous API requests
    :cvar vector_io: Vector database operations and queries
    :cvar datasetio: Dataset input/output operations
    :cvar scoring: Model output evaluation and scoring
@@ -109,7 +108,6 @@ class Api(Enum, metaclass=DynamicApiMeta):
    inference = "inference"
    safety = "safety"
    agents = "agents"
-    batches = "batches"
    vector_io = "vector_io"
    datasetio = "datasetio"
    scoring = "scoring"
@@ -22,7 +22,6 @@ class OpenAIFilePurpose(StrEnum):
    """

    ASSISTANTS = "assistants"
-    BATCH = "batch"
    # TODO: Add other purposes as needed
@@ -15,36 +15,6 @@ from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod


-@json_schema_type
-class ModerationObjectResults(BaseModel):
-    """A moderation object.
-    :param flagged: Whether any of the below categories are flagged.
-    :param categories: A list of the categories, and whether they are flagged or not.
-    :param category_applied_input_types: A list of the categories along with the input type(s) that the score applies to.
-    :param category_scores: A list of the categories along with their scores as predicted by model.
-    """
-
-    flagged: bool
-    categories: dict[str, bool] | None = None
-    category_applied_input_types: dict[str, list[str]] | None = None
-    category_scores: dict[str, float] | None = None
-    user_message: str | None = None
-    metadata: dict[str, Any] = Field(default_factory=dict)
-
-
-@json_schema_type
-class ModerationObject(BaseModel):
-    """A moderation object.
-    :param id: The unique identifier for the moderation request.
-    :param model: The model used to generate the moderation results.
-    :param results: A list of moderation objects
-    """
-
-    id: str
-    model: str
-    results: list[ModerationObjectResults]
-
-
@json_schema_type
class ViolationLevel(Enum):
    """Severity level of a safety violation.
@@ -112,13 +82,3 @@ class Safety(Protocol):
        :returns: A RunShieldResponse.
        """
        ...
-
-    @webmethod(route="/openai/v1/moderations", method="POST")
-    async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
-        """Classifies if text and/or image inputs are potentially harmful.
-        :param input: Input (or inputs) to classify.
-            Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models.
-        :param model: The content moderation model you would like to use.
-        :returns: A moderation object.
-        """
-        ...
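For context, a minimal sketch of calling the moderations route shown above through an OpenAI-compatible client. The base URL and the shield/model id are assumptions; the client call itself is the standard `openai` moderations method.

```python
# Illustrative sketch only: classify an input against the /openai/v1/moderations route.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

moderation = client.moderations.create(
    model="llama-guard",  # placeholder id for a registered moderation model
    input=["How do I pick a lock?"],
)
print(moderation.results[0].flagged)
```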
@@ -83,11 +83,3 @@ class Shields(Protocol):
        :returns: A Shield.
        """
        ...
-
-    @webmethod(route="/shields/{identifier:path}", method="DELETE")
-    async def unregister_shield(self, identifier: str) -> None:
-        """Unregister a shield.
-
-        :param identifier: The identifier of the shield to unregister.
-        """
-        ...
@@ -5,6 +5,7 @@
# the root directory of this source tree.

import importlib.resources
+import logging
import sys

from pydantic import BaseModel

@@ -16,10 +17,9 @@ from llama_stack.core.external import load_external_apis
from llama_stack.core.utils.exec import run_command
from llama_stack.core.utils.image_types import LlamaStackImageType
from llama_stack.distributions.template import DistributionTemplate
-from llama_stack.log import get_logger
from llama_stack.providers.datatypes import Api

-log = get_logger(name=__name__, category="core")
+log = logging.getLogger(__name__)

# These are the dependencies needed by the distribution server.
# `llama-stack` is automatically installed by the installation script.
@@ -91,7 +91,7 @@ def get_provider_dependencies(


def print_pip_install_help(config: BuildConfig):
-    normal_deps, special_deps, _ = get_provider_dependencies(config)
+    normal_deps, special_deps = get_provider_dependencies(config)

    cprint(
        f"Please install needed dependencies using the following commands:\n\nuv pip install {' '.join(normal_deps)}",
llama_stack/core/build_conda_env.sh (executable file, 207 lines)

@@ -0,0 +1,207 @@
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
PYPI_VERSION=${PYPI_VERSION:-}
# This timeout (in seconds) is necessary when installing PyTorch via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}

set -euo pipefail

# Define color codes
RED='\033[0;31m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color

SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
source "$SCRIPT_DIR/common.sh"

# Usage function
usage() {
  echo "Usage: $0 --env-name <conda_env_name> --build-file-path <build_file_path> --normal-deps <pip_dependencies> [--external-provider-deps <external_provider_deps>] [--optional-deps <special_pip_deps>]"
  echo "Example: $0 --env-name my-conda-env --build-file-path ./my-stack-build.yaml --normal-deps 'numpy pandas scipy' --external-provider-deps 'foo' --optional-deps 'bar'"
  exit 1
}

# Parse arguments
env_name=""
build_file_path=""
normal_deps=""
external_provider_deps=""
optional_deps=""

while [[ $# -gt 0 ]]; do
  key="$1"
  case "$key" in
  --env-name)
    if [[ -z "$2" || "$2" == --* ]]; then
      echo "Error: --env-name requires a string value" >&2
      usage
    fi
    env_name="$2"
    shift 2
    ;;
  --build-file-path)
    if [[ -z "$2" || "$2" == --* ]]; then
      echo "Error: --build-file-path requires a string value" >&2
      usage
    fi
    build_file_path="$2"
    shift 2
    ;;
  --normal-deps)
    if [[ -z "$2" || "$2" == --* ]]; then
      echo "Error: --normal-deps requires a string value" >&2
      usage
    fi
    normal_deps="$2"
    shift 2
    ;;
  --external-provider-deps)
    if [[ -z "$2" || "$2" == --* ]]; then
      echo "Error: --external-provider-deps requires a string value" >&2
      usage
    fi
    external_provider_deps="$2"
    shift 2
    ;;
  --optional-deps)
    if [[ -z "$2" || "$2" == --* ]]; then
      echo "Error: --optional-deps requires a string value" >&2
      usage
    fi
    optional_deps="$2"
    shift 2
    ;;
  *)
    echo "Unknown option: $1" >&2
    usage
    ;;
  esac
done

# Check required arguments
if [[ -z "$env_name" || -z "$build_file_path" || -z "$normal_deps" ]]; then
  echo "Error: --env-name, --build-file-path, and --normal-deps are required." >&2
  usage
fi

if [ -n "$LLAMA_STACK_DIR" ]; then
  echo "Using llama-stack-dir=$LLAMA_STACK_DIR"
fi
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
  echo "Using llama-stack-client-dir=$LLAMA_STACK_CLIENT_DIR"
fi

ensure_conda_env_python310() {
  # Use only global variables set by flag parser
  local python_version="3.12"

  if ! is_command_available conda; then
    printf "${RED}Error: conda command not found. Is Conda installed and in your PATH?${NC}" >&2
    exit 1
  fi

  if conda env list | grep -q "^${env_name} "; then
    printf "Conda environment '${env_name}' exists. Checking Python version...\n"
    current_version=$(conda run -n "${env_name}" python --version 2>&1 | cut -d' ' -f2 | cut -d'.' -f1,2)
    if [ "$current_version" = "$python_version" ]; then
      printf "Environment '${env_name}' already has Python ${python_version}. No action needed.\n"
    else
      printf "Updating environment '${env_name}' to Python ${python_version}...\n"
      conda install -n "${env_name}" python="${python_version}" -y
    fi
  else
    printf "Conda environment '${env_name}' does not exist. Creating with Python ${python_version}...\n"
    conda create -n "${env_name}" python="${python_version}" -y
  fi

  eval "$(conda shell.bash hook)"
  conda deactivate && conda activate "${env_name}"
  "$CONDA_PREFIX"/bin/pip install uv

  if [ -n "$TEST_PYPI_VERSION" ]; then
    uv pip install fastapi libcst
    uv pip install --extra-index-url https://test.pypi.org/simple/ \
      llama-stack=="$TEST_PYPI_VERSION" \
      "$normal_deps"
    if [ -n "$optional_deps" ]; then
      IFS='#' read -ra parts <<<"$optional_deps"
      for part in "${parts[@]}"; do
        echo "$part"
        uv pip install $part
      done
    fi
    if [ -n "$external_provider_deps" ]; then
      IFS='#' read -ra parts <<<"$external_provider_deps"
      for part in "${parts[@]}"; do
        echo "$part"
        uv pip install "$part"
      done
    fi
  else
    if [ -n "$LLAMA_STACK_DIR" ]; then
      if [ ! -d "$LLAMA_STACK_DIR" ]; then
        printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: $LLAMA_STACK_DIR${NC}\n" >&2
        exit 1
      fi
      printf "Installing from LLAMA_STACK_DIR: $LLAMA_STACK_DIR\n"
      uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"
    else
      PYPI_VERSION="${PYPI_VERSION:-}"
      if [ -n "$PYPI_VERSION" ]; then
        SPEC_VERSION="llama-stack==${PYPI_VERSION}"
      else
        SPEC_VERSION="llama-stack"
      fi
      uv pip install --no-cache-dir "$SPEC_VERSION"
    fi
    if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
      if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
        printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: $LLAMA_STACK_CLIENT_DIR${NC}\n" >&2
        exit 1
      fi
      printf "Installing from LLAMA_STACK_CLIENT_DIR: $LLAMA_STACK_CLIENT_DIR\n"
      uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
    fi
    printf "Installing pip dependencies\n"
    uv pip install $normal_deps
    if [ -n "$optional_deps" ]; then
      IFS='#' read -ra parts <<<"$optional_deps"
      for part in "${parts[@]}"; do
        echo "$part"
        uv pip install $part
      done
    fi
    if [ -n "$external_provider_deps" ]; then
      IFS='#' read -ra parts <<<"$external_provider_deps"
      for part in "${parts[@]}"; do
        echo "Getting provider spec for module: $part and installing dependencies"
        package_name=$(echo "$part" | sed 's/[<>=!].*//')
        python3 -c "
import importlib
import sys
try:
    module = importlib.import_module(f'$package_name.provider')
    spec = module.get_provider_spec()
    if hasattr(spec, 'pip_packages') and spec.pip_packages:
        print('\\n'.join(spec.pip_packages))
except Exception as e:
    print(f'Error getting provider spec for $package_name: {e}', file=sys.stderr)
" | uv pip install -r -
      done
    fi
  fi
  mv "$build_file_path" "$CONDA_PREFIX"/llamastack-build.yaml
  echo "Build spec configuration saved at $CONDA_PREFIX/llamastack-build.yaml"
}

ensure_conda_env_python310 "$env_name" "$build_file_path" "$normal_deps" "$optional_deps" "$external_provider_deps"
@@ -151,37 +151,23 @@ run() {
    fi
  else
    if [ -n "$LLAMA_STACK_DIR" ]; then
-      # only warn if DIR does not start with "git+"
-      if [ ! -d "$LLAMA_STACK_DIR" ] && [[ "$LLAMA_STACK_DIR" != git+* ]]; then
+      if [ ! -d "$LLAMA_STACK_DIR" ]; then
        printf "${RED}Warning: LLAMA_STACK_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_DIR" >&2
        exit 1
      fi
      printf "Installing from LLAMA_STACK_DIR: %s\n" "$LLAMA_STACK_DIR"
-      # editable only if LLAMA_STACK_DIR does not start with "git+"
-      if [[ "$LLAMA_STACK_DIR" != git+* ]]; then
-        EDITABLE="-e"
-      else
-        EDITABLE=""
-      fi
-      uv pip install --no-cache-dir $EDITABLE "$LLAMA_STACK_DIR"
+      uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"
    else
      uv pip install --no-cache-dir llama-stack
    fi

    if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
-      # only warn if DIR does not start with "git+"
-      if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ] && [[ "$LLAMA_STACK_CLIENT_DIR" != git+* ]]; then
+      if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
        printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_CLIENT_DIR" >&2
        exit 1
      fi
      printf "Installing from LLAMA_STACK_CLIENT_DIR: %s\n" "$LLAMA_STACK_CLIENT_DIR"
-      # editable only if LLAMA_STACK_CLIENT_DIR does not start with "git+"
-      if [[ "$LLAMA_STACK_CLIENT_DIR" != git+* ]]; then
-        EDITABLE="-e"
-      else
-        EDITABLE=""
-      fi
-      uv pip install --no-cache-dir $EDITABLE "$LLAMA_STACK_CLIENT_DIR"
+      uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
    fi

    printf "Installing pip dependencies\n"
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
+import logging
 import textwrap
 from typing import Any

@@ -20,10 +21,9 @@ from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
 from llama_stack.core.utils.config_dirs import EXTERNAL_PROVIDERS_DIR
 from llama_stack.core.utils.dynamic import instantiate_class_type
 from llama_stack.core.utils.prompt_for_config import prompt_for_config
-from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import Api, ProviderSpec

-logger = get_logger(name=__name__, category="core")
+logger = logging.getLogger(__name__)


 def configure_single_provider(registry: dict[str, ProviderSpec], provider: Provider) -> Provider:
@@ -7,7 +7,7 @@
 import asyncio
 import inspect
 import json
-import logging  # allow-direct-logging
+import logging
 import os
 import sys
 from concurrent.futures import ThreadPoolExecutor
@@ -48,7 +48,6 @@ from llama_stack.core.stack import (
 from llama_stack.core.utils.config import redact_sensitive_fields
 from llama_stack.core.utils.context import preserve_contexts_async_generator
 from llama_stack.core.utils.exec import in_notebook
-from llama_stack.log import get_logger
 from llama_stack.providers.utils.telemetry.tracing import (
     CURRENT_TRACE_CONTEXT,
     end_trace,
@@ -56,7 +55,7 @@ from llama_stack.providers.utils.telemetry.tracing import (
     start_trace,
 )

-logger = get_logger(name=__name__, category="core")
+logger = logging.getLogger(__name__)

 T = TypeVar("T")

@@ -381,17 +380,8 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         json_content = json.dumps(convert_pydantic_to_json_value(result))

         filtered_body = {k: v for k, v in body.items() if not isinstance(v, LibraryClientUploadFile)}
-
-        status_code = httpx.codes.OK
-
-        if options.method.upper() == "DELETE" and result is None:
-            status_code = httpx.codes.NO_CONTENT
-
-        if status_code == httpx.codes.NO_CONTENT:
-            json_content = ""
-
         mock_response = httpx.Response(
-            status_code=status_code,
+            status_code=httpx.codes.OK,
             content=json_content.encode("utf-8"),
             headers={
                 "Content-Type": "application/json",
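
The branch removed above returns httpx.codes.NO_CONTENT with an empty body for DELETE calls that produce no result. A short, standalone illustration of that convention, assuming only that httpx is installed (this file already imports it):

import httpx

# A 204 No Content response carries no payload, which is why the removed code
# blanks json_content before constructing the mock response for DELETE calls.
no_content = httpx.Response(status_code=httpx.codes.NO_CONTENT, content=b"")
assert no_content.status_code == 204
assert no_content.text == ""
print("204 responses carry an empty body")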
@@ -6,15 +6,15 @@

 import contextvars
 import json
+import logging
 from contextlib import AbstractContextManager
 from typing import Any

 from llama_stack.core.datatypes import User
-from llama_stack.log import get_logger

 from .utils.dynamic import instantiate_class_type

-log = get_logger(name=__name__, category="core")
+log = logging.getLogger(__name__)

 # Context variable for request provider data and auth attributes
 PROVIDER_DATA_VAR = contextvars.ContextVar("provider_data", default=None)
@@ -8,7 +8,6 @@ import inspect
 from typing import Any

 from llama_stack.apis.agents import Agents
-from llama_stack.apis.batches import Batches
 from llama_stack.apis.benchmarks import Benchmarks
 from llama_stack.apis.datasetio import DatasetIO
 from llama_stack.apis.datasets import Datasets
@@ -76,7 +75,6 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
         Api.agents: Agents,
         Api.inference: Inference,
         Api.inspect: Inspect,
-        Api.batches: Batches,
         Api.vector_io: VectorIO,
         Api.vector_dbs: VectorDBs,
         Api.models: Models,
@@ -7,7 +7,6 @@
 import asyncio
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
-from datetime import UTC, datetime
 from typing import Annotated, Any

 from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatCompletionToolChoiceOptionParam
@@ -18,7 +17,7 @@ from llama_stack.apis.common.content_types import (
     InterleavedContent,
     InterleavedContentItem,
 )
-from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError
+from llama_stack.apis.common.errors import ModelNotFoundError
 from llama_stack.apis.inference import (
     BatchChatCompletionResponse,
     BatchCompletionResponse,
@@ -26,21 +25,14 @@ from llama_stack.apis.inference import (
     ChatCompletionResponseEventType,
     ChatCompletionResponseStreamChunk,
     CompletionMessage,
-    CompletionResponse,
-    CompletionResponseStreamChunk,
     EmbeddingsResponse,
     EmbeddingTaskType,
     Inference,
     ListOpenAIChatCompletionResponse,
     LogProbConfig,
     Message,
-    OpenAIAssistantMessageParam,
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
-    OpenAIChatCompletionToolCall,
-    OpenAIChatCompletionToolCallFunction,
-    OpenAIChoice,
-    OpenAIChoiceLogprobs,
     OpenAICompletion,
     OpenAICompletionWithInputMessages,
     OpenAIEmbeddingsResponse,
@@ -63,9 +55,10 @@ from llama_stack.models.llama.llama3.chat_format import ChatFormat
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer
 from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
 from llama_stack.providers.utils.inference.inference_store import InferenceStore
+from llama_stack.providers.utils.inference.stream_utils import stream_and_store_openai_completion
 from llama_stack.providers.utils.telemetry.tracing import get_current_span

-logger = get_logger(name=__name__, category="inference")
+logger = get_logger(name=__name__, category="core")


 class InferenceRouter(Inference):
@@ -126,7 +119,6 @@ class InferenceRouter(Inference):
         if span is None:
             logger.warning("No span found for token usage metrics")
             return []
-
         metrics = [
             ("prompt_tokens", prompt_tokens),
             ("completion_tokens", completion_tokens),
@@ -140,7 +132,7 @@ class InferenceRouter(Inference):
                 span_id=span.span_id,
                 metric=metric_name,
                 value=value,
-                timestamp=datetime.now(UTC),
+                timestamp=time.time(),
                 unit="tokens",
                 attributes={
                     "model_id": model.model_id,
@@ -177,15 +169,6 @@ class InferenceRouter(Inference):
         encoded = self.formatter.encode_content(messages)
         return len(encoded.tokens) if encoded and encoded.tokens else 0

-    async def _get_model(self, model_id: str, expected_model_type: str) -> Model:
-        """takes a model id and gets model after ensuring that it is accessible and of the correct type"""
-        model = await self.routing_table.get_model(model_id)
-        if model is None:
-            raise ModelNotFoundError(model_id)
-        if model.model_type != expected_model_type:
-            raise ModelTypeError(model_id, model.model_type, expected_model_type)
-        return model
-
     async def chat_completion(
         self,
         model_id: str,
@@ -204,7 +187,11 @@ class InferenceRouter(Inference):
         )
         if sampling_params is None:
             sampling_params = SamplingParams()
-        model = await self._get_model(model_id, ModelType.llm)
+        model = await self.routing_table.get_model(model_id)
+        if model is None:
+            raise ModelNotFoundError(model_id)
+        if model.model_type == ModelType.embedding:
+            raise ValueError(f"Model '{model_id}' is an embedding model and does not support chat completions")
         if tool_config:
             if tool_choice and tool_choice != tool_config.tool_choice:
                 raise ValueError("tool_choice and tool_config.tool_choice must match")
@@ -247,26 +234,49 @@ class InferenceRouter(Inference):
         prompt_tokens = await self._count_tokens(messages, tool_config.tool_prompt_format)

         if stream:
-            response_stream = await provider.chat_completion(**params)
-            return self.stream_tokens_and_compute_metrics(
-                response=response_stream,
-                prompt_tokens=prompt_tokens,
-                model=model,
-                tool_prompt_format=tool_config.tool_prompt_format,
-            )
-
-        response = await provider.chat_completion(**params)
-        metrics = await self.count_tokens_and_compute_metrics(
-            response=response,
-            prompt_tokens=prompt_tokens,
-            model=model,
-            tool_prompt_format=tool_config.tool_prompt_format,
-        )
-        # these metrics will show up in the client response.
-        response.metrics = (
-            metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
-        )
-        return response
+
+            async def stream_generator():
+                completion_text = ""
+                async for chunk in await provider.chat_completion(**params):
+                    if chunk.event.event_type == ChatCompletionResponseEventType.progress:
+                        if chunk.event.delta.type == "text":
+                            completion_text += chunk.event.delta.text
+                    if chunk.event.event_type == ChatCompletionResponseEventType.complete:
+                        completion_tokens = await self._count_tokens(
+                            [
+                                CompletionMessage(
+                                    content=completion_text,
+                                    stop_reason=StopReason.end_of_turn,
+                                )
+                            ],
+                            tool_config.tool_prompt_format,
+                        )
+                        total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
+                        metrics = await self._compute_and_log_token_usage(
+                            prompt_tokens or 0,
+                            completion_tokens or 0,
+                            total_tokens,
+                            model,
+                        )
+                        chunk.metrics = metrics if chunk.metrics is None else chunk.metrics + metrics
+                    yield chunk
+
+            return stream_generator()
+        else:
+            response = await provider.chat_completion(**params)
+            completion_tokens = await self._count_tokens(
+                [response.completion_message],
+                tool_config.tool_prompt_format,
+            )
+            total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
+            metrics = await self._compute_and_log_token_usage(
+                prompt_tokens or 0,
+                completion_tokens or 0,
+                total_tokens,
+                model,
+            )
+            response.metrics = metrics if response.metrics is None else response.metrics + metrics
+            return response

     async def batch_chat_completion(
         self,
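
Both sides of the hunk above follow the same pattern: wrap the provider's async stream in a local generator so the router can accumulate the generated text and attach token metrics to the chunks before re-yielding them. A stripped-down, self-contained sketch of that wrapping pattern follows (plain strings stand in for chunk objects; none of these names are part of the llama-stack API):

import asyncio
from collections.abc import AsyncIterator


async def fake_provider_stream() -> AsyncIterator[str]:
    # Stand-in for a provider's streaming chat_completion call.
    for piece in ["Hello", ", ", "world"]:
        yield piece


async def stream_with_metrics(stream: AsyncIterator[str]) -> AsyncIterator[str]:
    # Accumulate text while passing chunks through, then emit a trailing summary,
    # mirroring how the router appends metrics once the stream completes.
    completion_text = ""
    async for chunk in stream:
        completion_text += chunk
        yield chunk
    yield f"[metrics: {len(completion_text)} characters streamed]"


async def main() -> None:
    async for item in stream_with_metrics(fake_provider_stream()):
        print(item)


asyncio.run(main())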
@@ -306,7 +316,11 @@ class InferenceRouter(Inference):
         logger.debug(
             f"InferenceRouter.completion: {model_id=}, {stream=}, {content=}, {sampling_params=}, {response_format=}",
         )
-        model = await self._get_model(model_id, ModelType.llm)
+        model = await self.routing_table.get_model(model_id)
+        if model is None:
+            raise ModelNotFoundError(model_id)
+        if model.model_type == ModelType.embedding:
+            raise ValueError(f"Model '{model_id}' is an embedding model and does not support chat completions")
         provider = await self.routing_table.get_provider_impl(model_id)
         params = dict(
             model_id=model_id,
@@ -318,20 +332,39 @@ class InferenceRouter(Inference):
         )

         prompt_tokens = await self._count_tokens(content)
-        response = await provider.completion(**params)
         if stream:
-            return self.stream_tokens_and_compute_metrics(
-                response=response,
-                prompt_tokens=prompt_tokens,
-                model=model,
-            )
-
-        metrics = await self.count_tokens_and_compute_metrics(
-            response=response, prompt_tokens=prompt_tokens, model=model
-        )
-        response.metrics = metrics if response.metrics is None else response.metrics + metrics
-
-        return response
+
+            async def stream_generator():
+                completion_text = ""
+                async for chunk in await provider.completion(**params):
+                    if hasattr(chunk, "delta"):
+                        completion_text += chunk.delta
+                    if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry:
+                        completion_tokens = await self._count_tokens(completion_text)
+                        total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
+                        metrics = await self._compute_and_log_token_usage(
+                            prompt_tokens or 0,
+                            completion_tokens or 0,
+                            total_tokens,
+                            model,
+                        )
+                        chunk.metrics = metrics if chunk.metrics is None else chunk.metrics + metrics
+                    yield chunk
+
+            return stream_generator()
+        else:
+            response = await provider.completion(**params)
+            completion_tokens = await self._count_tokens(response.content)
+            total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
+            metrics = await self._compute_and_log_token_usage(
+                prompt_tokens or 0,
+                completion_tokens or 0,
+                total_tokens,
+                model,
+            )
+            response.metrics = metrics if response.metrics is None else response.metrics + metrics
+            return response

     async def batch_completion(
         self,
@@ -356,7 +389,11 @@ class InferenceRouter(Inference):
         task_type: EmbeddingTaskType | None = None,
     ) -> EmbeddingsResponse:
         logger.debug(f"InferenceRouter.embeddings: {model_id}")
-        await self._get_model(model_id, ModelType.embedding)
+        model = await self.routing_table.get_model(model_id)
+        if model is None:
+            raise ModelNotFoundError(model_id)
+        if model.model_type == ModelType.llm:
+            raise ValueError(f"Model '{model_id}' is an LLM model and does not support embeddings")
         provider = await self.routing_table.get_provider_impl(model_id)
         return await provider.embeddings(
             model_id=model_id,
@@ -392,7 +429,12 @@ class InferenceRouter(Inference):
         logger.debug(
             f"InferenceRouter.openai_completion: {model=}, {stream=}, {prompt=}",
         )
-        model_obj = await self._get_model(model, ModelType.llm)
+        model_obj = await self.routing_table.get_model(model)
+        if model_obj is None:
+            raise ModelNotFoundError(model)
+        if model_obj.model_type == ModelType.embedding:
+            raise ValueError(f"Model '{model}' is an embedding model and does not support completions")

         params = dict(
             model=model_obj.identifier,
             prompt=prompt,
@@ -415,29 +457,9 @@ class InferenceRouter(Inference):
             prompt_logprobs=prompt_logprobs,
             suffix=suffix,
         )

         provider = await self.routing_table.get_provider_impl(model_obj.identifier)
-        if stream:
-            return await provider.openai_completion(**params)
-        # TODO: Metrics do NOT work with openai_completion stream=True due to the fact
-        # that we do not return an AsyncIterator, our tests expect a stream of chunks we cannot intercept currently.
-        # response_stream = await provider.openai_completion(**params)
-
-        response = await provider.openai_completion(**params)
-        if self.telemetry:
-            metrics = self._construct_metrics(
-                prompt_tokens=response.usage.prompt_tokens,
-                completion_tokens=response.usage.completion_tokens,
-                total_tokens=response.usage.total_tokens,
-                model=model_obj,
-            )
-            for metric in metrics:
-                await self.telemetry.log_event(metric)
-
-            # these metrics will show up in the client response.
-            response.metrics = (
-                metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
-            )
-        return response
+        return await provider.openai_completion(**params)

     async def openai_chat_completion(
         self,
@@ -468,7 +490,11 @@ class InferenceRouter(Inference):
         logger.debug(
             f"InferenceRouter.openai_chat_completion: {model=}, {stream=}, {messages=}",
         )
-        model_obj = await self._get_model(model, ModelType.llm)
+        model_obj = await self.routing_table.get_model(model)
+        if model_obj is None:
+            raise ModelNotFoundError(model)
+        if model_obj.model_type == ModelType.embedding:
+            raise ValueError(f"Model '{model}' is an embedding model and does not support chat completions")

         # Use the OpenAI client for a bit of extra input validation without
         # exposing the OpenAI client itself as part of our API surface
@@ -511,38 +537,18 @@ class InferenceRouter(Inference):
             top_p=top_p,
             user=user,
         )

         provider = await self.routing_table.get_provider_impl(model_obj.identifier)
         if stream:
             response_stream = await provider.openai_chat_completion(**params)
-
-            # For streaming, the provider returns AsyncIterator[OpenAIChatCompletionChunk]
-            # We need to add metrics to each chunk and store the final completion
-            return self.stream_tokens_and_compute_metrics_openai_chat(
-                response=response_stream,
-                model=model_obj,
-                messages=messages,
-            )
-
-        response = await self._nonstream_openai_chat_completion(provider, params)
-
-        # Store the response with the ID that will be returned to the client
-        if self.store:
-            await self.store.store_chat_completion(response, messages)
-
-        if self.telemetry:
-            metrics = self._construct_metrics(
-                prompt_tokens=response.usage.prompt_tokens,
-                completion_tokens=response.usage.completion_tokens,
-                total_tokens=response.usage.total_tokens,
-                model=model_obj,
-            )
-            for metric in metrics:
-                await self.telemetry.log_event(metric)
-            # these metrics will show up in the client response.
-            response.metrics = (
-                metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
-            )
-        return response
+            if self.store:
+                return stream_and_store_openai_completion(response_stream, model, self.store, messages)
+            return response_stream
+        else:
+            response = await self._nonstream_openai_chat_completion(provider, params)
+            if self.store:
+                await self.store.store_chat_completion(response, messages)
+            return response

     async def openai_embeddings(
         self,
@@ -555,7 +561,12 @@ class InferenceRouter(Inference):
         logger.debug(
             f"InferenceRouter.openai_embeddings: {model=}, input_type={type(input)}, {encoding_format=}, {dimensions=}",
         )
-        model_obj = await self._get_model(model, ModelType.embedding)
+        model_obj = await self.routing_table.get_model(model)
+        if model_obj is None:
+            raise ModelNotFoundError(model)
+        if model_obj.model_type != ModelType.embedding:
+            raise ValueError(f"Model '{model}' is not an embedding model")

         params = dict(
             model=model_obj.identifier,
             input=input,
@@ -614,245 +625,3 @@ class InferenceRouter(Inference):
                     status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}"
                 )
         return health_statuses
-
-    async def stream_tokens_and_compute_metrics(
-        self,
-        response,
-        prompt_tokens,
-        model,
-        tool_prompt_format: ToolPromptFormat | None = None,
-    ) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None] | AsyncGenerator[CompletionResponseStreamChunk, None]:
-        completion_text = ""
-        async for chunk in response:
-            complete = False
-            if hasattr(chunk, "event"):  # only ChatCompletions have .event
-                if chunk.event.event_type == ChatCompletionResponseEventType.progress:
-                    if chunk.event.delta.type == "text":
-                        completion_text += chunk.event.delta.text
-                if chunk.event.event_type == ChatCompletionResponseEventType.complete:
-                    complete = True
-                    completion_tokens = await self._count_tokens(
-                        [
-                            CompletionMessage(
-                                content=completion_text,
-                                stop_reason=StopReason.end_of_turn,
-                            )
-                        ],
-                        tool_prompt_format=tool_prompt_format,
-                    )
-            else:
-                if hasattr(chunk, "delta"):
-                    completion_text += chunk.delta
-                if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry:
-                    complete = True
-                    completion_tokens = await self._count_tokens(completion_text)
-            # if we are done receiving tokens
-            if complete:
-                total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
-
-                # Create a separate span for streaming completion metrics
-                if self.telemetry:
-                    # Log metrics in the new span context
-                    completion_metrics = self._construct_metrics(
-                        prompt_tokens=prompt_tokens,
-                        completion_tokens=completion_tokens,
-                        total_tokens=total_tokens,
-                        model=model,
-                    )
-                    for metric in completion_metrics:
-                        if metric.metric in [
-                            "completion_tokens",
-                            "total_tokens",
-                        ]:  # Only log completion and total tokens
-                            await self.telemetry.log_event(metric)
-
-                    # Return metrics in response
-                    async_metrics = [
-                        MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics
-                    ]
-                    chunk.metrics = async_metrics if chunk.metrics is None else chunk.metrics + async_metrics
-                else:
-                    # Fallback if no telemetry
-                    completion_metrics = self._construct_metrics(
-                        prompt_tokens or 0,
-                        completion_tokens or 0,
-                        total_tokens,
-                        model,
-                    )
-                    async_metrics = [
-                        MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics
-                    ]
-                    chunk.metrics = async_metrics if chunk.metrics is None else chunk.metrics + async_metrics
-            yield chunk
-
-    async def count_tokens_and_compute_metrics(
-        self,
-        response: ChatCompletionResponse | CompletionResponse,
-        prompt_tokens,
-        model,
-        tool_prompt_format: ToolPromptFormat | None = None,
-    ):
-        if isinstance(response, ChatCompletionResponse):
-            content = [response.completion_message]
-        else:
-            content = response.content
-        completion_tokens = await self._count_tokens(messages=content, tool_prompt_format=tool_prompt_format)
-        total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
-
-        # Create a separate span for completion metrics
-        if self.telemetry:
-            # Log metrics in the new span context
-            completion_metrics = self._construct_metrics(
-                prompt_tokens=prompt_tokens,
-                completion_tokens=completion_tokens,
-                total_tokens=total_tokens,
-                model=model,
-            )
-            for metric in completion_metrics:
-                if metric.metric in ["completion_tokens", "total_tokens"]:  # Only log completion and total tokens
-                    await self.telemetry.log_event(metric)
-
-            # Return metrics in response
-            return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics]
-
-        # Fallback if no telemetry
-        metrics = self._construct_metrics(
-            prompt_tokens or 0,
-            completion_tokens or 0,
-            total_tokens,
-            model,
-        )
-        return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]
-
-    async def stream_tokens_and_compute_metrics_openai_chat(
-        self,
-        response: AsyncIterator[OpenAIChatCompletionChunk],
-        model: Model,
-        messages: list[OpenAIMessageParam] | None = None,
-    ) -> AsyncIterator[OpenAIChatCompletionChunk]:
-        """Stream OpenAI chat completion chunks, compute metrics, and store the final completion."""
-        id = None
-        created = None
-        choices_data: dict[int, dict[str, Any]] = {}
-
-        try:
-            async for chunk in response:
-                # Skip None chunks
-                if chunk is None:
-                    continue
-
-                # Capture ID and created timestamp from first chunk
-                if id is None and chunk.id:
-                    id = chunk.id
-                if created is None and chunk.created:
-                    created = chunk.created
-
-                # Accumulate choice data for final assembly
-                if chunk.choices:
-                    for choice_delta in chunk.choices:
-                        idx = choice_delta.index
-                        if idx not in choices_data:
-                            choices_data[idx] = {
-                                "content_parts": [],
-                                "tool_calls_builder": {},
-                                "finish_reason": None,
-                                "logprobs_content_parts": [],
-                            }
-                        current_choice_data = choices_data[idx]
-
-                        if choice_delta.delta:
-                            delta = choice_delta.delta
-                            if delta.content:
-                                current_choice_data["content_parts"].append(delta.content)
-                            if delta.tool_calls:
-                                for tool_call_delta in delta.tool_calls:
-                                    tc_idx = tool_call_delta.index
-                                    if tc_idx not in current_choice_data["tool_calls_builder"]:
-                                        current_choice_data["tool_calls_builder"][tc_idx] = {
-                                            "id": None,
-                                            "type": "function",
-                                            "function_name_parts": [],
-                                            "function_arguments_parts": [],
-                                        }
-                                    builder = current_choice_data["tool_calls_builder"][tc_idx]
-                                    if tool_call_delta.id:
-                                        builder["id"] = tool_call_delta.id
-                                    if tool_call_delta.type:
-                                        builder["type"] = tool_call_delta.type
-                                    if tool_call_delta.function:
-                                        if tool_call_delta.function.name:
-                                            builder["function_name_parts"].append(tool_call_delta.function.name)
-                                        if tool_call_delta.function.arguments:
-                                            builder["function_arguments_parts"].append(
-                                                tool_call_delta.function.arguments
-                                            )
-                        if choice_delta.finish_reason:
-                            current_choice_data["finish_reason"] = choice_delta.finish_reason
-                        if choice_delta.logprobs and choice_delta.logprobs.content:
-                            current_choice_data["logprobs_content_parts"].extend(choice_delta.logprobs.content)
-
-                # Compute metrics on final chunk
-                if chunk.choices and chunk.choices[0].finish_reason:
-                    completion_text = ""
-                    for choice_data in choices_data.values():
-                        completion_text += "".join(choice_data["content_parts"])
-
-                    # Add metrics to the chunk
-                    if self.telemetry and chunk.usage:
-                        metrics = self._construct_metrics(
-                            prompt_tokens=chunk.usage.prompt_tokens,
-                            completion_tokens=chunk.usage.completion_tokens,
-                            total_tokens=chunk.usage.total_tokens,
-                            model=model,
-                        )
-                        for metric in metrics:
-                            await self.telemetry.log_event(metric)
-
-                yield chunk
-        finally:
-            # Store the final assembled completion
-            if id and self.store and messages:
-                assembled_choices: list[OpenAIChoice] = []
-                for choice_idx, choice_data in choices_data.items():
-                    content_str = "".join(choice_data["content_parts"])
-                    assembled_tool_calls: list[OpenAIChatCompletionToolCall] = []
-                    if choice_data["tool_calls_builder"]:
-                        for tc_build_data in choice_data["tool_calls_builder"].values():
-                            if tc_build_data["id"]:
-                                func_name = "".join(tc_build_data["function_name_parts"])
-                                func_args = "".join(tc_build_data["function_arguments_parts"])
-                                assembled_tool_calls.append(
-                                    OpenAIChatCompletionToolCall(
-                                        id=tc_build_data["id"],
-                                        type=tc_build_data["type"],
-                                        function=OpenAIChatCompletionToolCallFunction(
-                                            name=func_name, arguments=func_args
-                                        ),
-                                    )
-                                )
-                    message = OpenAIAssistantMessageParam(
-                        role="assistant",
-                        content=content_str if content_str else None,
-                        tool_calls=assembled_tool_calls if assembled_tool_calls else None,
-                    )
-                    logprobs_content = choice_data["logprobs_content_parts"]
-                    final_logprobs = OpenAIChoiceLogprobs(content=logprobs_content) if logprobs_content else None
-
-                    assembled_choices.append(
-                        OpenAIChoice(
-                            finish_reason=choice_data["finish_reason"],
-                            index=choice_idx,
-                            message=message,
-                            logprobs=final_logprobs,
-                        )
-                    )
-
-                final_response = OpenAIChatCompletion(
-                    id=id,
-                    choices=assembled_choices,
-                    created=created or int(time.time()),
-                    model=model.identifier,
-                    object="chat.completion",
-                )
-                logger.debug(f"InferenceRouter.completion_response: {final_response}")
-                await self.store.store_chat_completion(final_response, messages)
@@ -6,9 +6,10 @@

 from typing import Any

-from llama_stack.apis.inference import Message
+from llama_stack.apis.inference import (
+    Message,
+)
 from llama_stack.apis.safety import RunShieldResponse, Safety
-from llama_stack.apis.safety.safety import ModerationObject
 from llama_stack.apis.shields import Shield
 from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import RoutingTable
@@ -42,10 +43,6 @@ class SafetyRouter(Safety):
         logger.debug(f"SafetyRouter.register_shield: {shield_id}")
         return await self.routing_table.register_shield(shield_id, provider_shield_id, provider_id, params)

-    async def unregister_shield(self, identifier: str) -> None:
-        logger.debug(f"SafetyRouter.unregister_shield: {identifier}")
-        return await self.routing_table.unregister_shield(identifier)
-
     async def run_shield(
         self,
         shield_id: str,
@@ -59,27 +56,3 @@ class SafetyRouter(Safety):
             messages=messages,
             params=params,
         )
-
-    async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
-        async def get_shield_id(self, model: str) -> str:
-            """Get Shield id from model (provider_resource_id) of shield."""
-            list_shields_response = await self.routing_table.list_shields()
-
-            matches = [s.identifier for s in list_shields_response.data if model == s.provider_resource_id]
-
-            if not matches:
-                raise ValueError(f"No shield associated with provider_resource id {model}")
-            if len(matches) > 1:
-                raise ValueError(f"Multiple shields associated with provider_resource id {model}")
-            return matches[0]
-
-        shield_id = await get_shield_id(self, model)
-        logger.debug(f"SafetyRouter.run_moderation: {shield_id}")
-        provider = await self.routing_table.get_provider_impl(shield_id)
-
-        response = await provider.run_moderation(
-            input=input,
-            model=model,
-        )
-
-        return response
@@ -60,8 +60,6 @@ async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None:
         return await p.unregister_vector_db(obj.identifier)
     elif api == Api.inference:
         return await p.unregister_model(obj.identifier)
-    elif api == Api.safety:
-        return await p.unregister_shield(obj.identifier)
    elif api == Api.datasetio:
         return await p.unregister_dataset(obj.identifier)
     elif api == Api.tool_runtime:
@@ -63,8 +63,6 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):

     async def get_provider_impl(self, model_id: str) -> Any:
         model = await lookup_model(self, model_id)
-        if model.provider_id not in self.impls_by_provider_id:
-            raise ValueError(f"Provider {model.provider_id} not found in the routing table")
         return self.impls_by_provider_id[model.provider_id]

     async def register_model(
@@ -55,7 +55,3 @@ class ShieldsRoutingTable(CommonRoutingTableImpl, Shields):
         )
         await self.register_object(shield)
         return shield
-
-    async def unregister_shield(self, identifier: str) -> None:
-        existing_shield = await self.get_shield(identifier)
-        await self.unregister_object(existing_shield)
@@ -124,7 +124,10 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
         return toolgroup

     async def unregister_toolgroup(self, toolgroup_id: str) -> None:
-        await self.unregister_object(await self.get_tool_group(toolgroup_id))
+        tool_group = await self.get_tool_group(toolgroup_id)
+        if tool_group is None:
+            raise ToolGroupNotFoundError(toolgroup_id)
+        await self.unregister_object(tool_group)

     async def shutdown(self) -> None:
         pass
@@ -8,7 +8,7 @@ from typing import Any

 from pydantic import TypeAdapter

-from llama_stack.apis.common.errors import ModelNotFoundError, ModelTypeError, VectorStoreNotFoundError
+from llama_stack.apis.common.errors import ModelNotFoundError, VectorStoreNotFoundError
 from llama_stack.apis.models import ModelType
 from llama_stack.apis.resource import ResourceType
 from llama_stack.apis.vector_dbs import ListVectorDBsResponse, VectorDB, VectorDBs
@@ -66,7 +66,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
         if model is None:
             raise ModelNotFoundError(embedding_model)
         if model.model_type != ModelType.embedding:
-            raise ModelTypeError(embedding_model, model.model_type, ModelType.embedding)
+            raise ValueError(f"Model {embedding_model} is not an embedding model")
         if "embedding_dimension" not in model.metadata:
             raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
         vector_db_data = {
@@ -9,7 +9,7 @@ import asyncio
 import functools
 import inspect
 import json
-import logging  # allow-direct-logging
+import logging
 import os
 import ssl
 import sys
@@ -21,18 +21,16 @@ from importlib.metadata import version as parse_version
 from pathlib import Path
 from typing import Annotated, Any, get_origin

-import httpx
 import rich.pretty
 import yaml
 from aiohttp import hdrs
-from fastapi import Body, FastAPI, HTTPException, Request, Response
+from fastapi import Body, FastAPI, HTTPException, Request
 from fastapi import Path as FastapiPath
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse, StreamingResponse
 from openai import BadRequestError
 from pydantic import BaseModel, ValidationError

-from llama_stack.apis.common.errors import ConflictError, ResourceNotFoundError
 from llama_stack.apis.common.responses import PaginatedResponse
 from llama_stack.cli.utils import add_config_distro_args, get_config_from_args
 from llama_stack.core.access_control.access_control import AccessDeniedError
@@ -117,7 +115,7 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationError

     if isinstance(exc, RequestValidationError):
         return HTTPException(
-            status_code=httpx.codes.BAD_REQUEST,
+            status_code=400,
             detail={
                 "errors": [
                     {
@@ -129,25 +127,21 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationError
                 ]
             },
         )
-    elif isinstance(exc, ConflictError):
-        return HTTPException(status_code=409, detail=str(exc))
-    elif isinstance(exc, ResourceNotFoundError):
-        return HTTPException(status_code=404, detail=str(exc))
     elif isinstance(exc, ValueError):
-        return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=f"Invalid value: {str(exc)}")
+        return HTTPException(status_code=400, detail=f"Invalid value: {str(exc)}")
     elif isinstance(exc, BadRequestError):
-        return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=str(exc))
+        return HTTPException(status_code=400, detail=str(exc))
     elif isinstance(exc, PermissionError | AccessDeniedError):
-        return HTTPException(status_code=httpx.codes.FORBIDDEN, detail=f"Permission denied: {str(exc)}")
+        return HTTPException(status_code=403, detail=f"Permission denied: {str(exc)}")
     elif isinstance(exc, asyncio.TimeoutError | TimeoutError):
-        return HTTPException(status_code=httpx.codes.GATEWAY_TIMEOUT, detail=f"Operation timed out: {str(exc)}")
+        return HTTPException(status_code=504, detail=f"Operation timed out: {str(exc)}")
     elif isinstance(exc, NotImplementedError):
-        return HTTPException(status_code=httpx.codes.NOT_IMPLEMENTED, detail=f"Not implemented: {str(exc)}")
+        return HTTPException(status_code=501, detail=f"Not implemented: {str(exc)}")
     elif isinstance(exc, AuthenticationRequiredError):
-        return HTTPException(status_code=httpx.codes.UNAUTHORIZED, detail=f"Authentication required: {str(exc)}")
+        return HTTPException(status_code=401, detail=f"Authentication required: {str(exc)}")
     else:
         return HTTPException(
-            status_code=httpx.codes.INTERNAL_SERVER_ERROR,
+            status_code=500,
             detail="Internal server error: An unexpected error occurred.",
         )
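
The status-code changes above are behavior-preserving: httpx.codes is an IntEnum, so each named constant removed here compares equal to the bare integer that replaces it. A quick check, assuming only that httpx is installed:

import httpx

# Each named constant used on the removed lines equals the numeric literal on the added lines.
assert httpx.codes.BAD_REQUEST == 400
assert httpx.codes.FORBIDDEN == 403
assert httpx.codes.GATEWAY_TIMEOUT == 504
assert httpx.codes.NOT_IMPLEMENTED == 501
assert httpx.codes.UNAUTHORIZED == 401
assert httpx.codes.INTERNAL_SERVER_ERROR == 500
print("all status-code spellings are equivalent")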
@@ -186,6 +180,7 @@ async def sse_generator(event_gen_coroutine):
         event_gen = await event_gen_coroutine
         async for item in event_gen:
             yield create_sse_event(item)
+            await asyncio.sleep(0.01)
     except asyncio.CancelledError:
         logger.info("Generator cancelled")
         if event_gen:
@@ -241,10 +236,6 @@ def create_dynamic_typed_route(func: Any, method: str, route: str) -> Callable:
                 result = await maybe_await(value)
                 if isinstance(result, PaginatedResponse) and result.url is None:
                     result.url = route
-
-                if method.upper() == "DELETE" and result is None:
-                    return Response(status_code=httpx.codes.NO_CONTENT)
-
                 return result
             except Exception as e:
                 if logger.isEnabledFor(logging.DEBUG):
@@ -361,7 +352,7 @@ class ClientVersionMiddleware:
                 await send(
                     {
                         "type": "http.response.start",
-                        "status": httpx.codes.UPGRADE_REQUIRED,
+                        "status": 426,
                         "headers": [[b"content-type", b"application/json"]],
                     }
                 )
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-import importlib
+import logging
 import os
 import signal
 import subprocess
@@ -12,9 +12,9 @@ import sys

 from termcolor import cprint

-from llama_stack.log import get_logger
+log = logging.getLogger(__name__)

-log = get_logger(name=__name__, category="core")
+import importlib


 def formulate_run_args(image_type: str, image_name: str) -> list:
Some files were not shown because too many files have changed in this diff.