Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-08-21 09:23:13 +00:00

Compare commits (2 commits): 62ed7b3531, 7d4e082bdb

780 changed files with 21795 additions and 82662 deletions

.github/TRIAGERS.md (vendored, 2 changes)

@@ -1,2 +1,2 @@
 # This file documents Triage members in the Llama Stack community
-@franciscojavierarceo
+@bbrowning @franciscojavierarceo @leseb

.github/actions/run-and-record-tests/action.yml (vendored, 88 changes)

@@ -1,88 +0,0 @@
-name: 'Run and Record Tests'
-description: 'Run integration tests and handle recording/artifact upload'
-
-inputs:
-  test-subdirs:
-    description: 'Comma-separated list of test subdirectories to run'
-    required: true
-  test-pattern:
-    description: 'Regex pattern to pass to pytest -k'
-    required: false
-    default: ''
-  stack-config:
-    description: 'Stack configuration to use'
-    required: true
-  provider:
-    description: 'Provider to use for tests'
-    required: true
-  inference-mode:
-    description: 'Inference mode (record or replay)'
-    required: true
-  run-vision-tests:
-    description: 'Whether to run vision tests'
-    required: false
-    default: 'false'
-
-runs:
-  using: 'composite'
-  steps:
-    - name: Check Storage and Memory Available Before Tests
-      if: ${{ always() }}
-      shell: bash
-      run: |
-        free -h
-        df -h
-
-    - name: Run Integration Tests
-      shell: bash
-      run: |
-        uv run --no-sync ./scripts/integration-tests.sh \
-          --stack-config '${{ inputs.stack-config }}' \
-          --provider '${{ inputs.provider }}' \
-          --test-subdirs '${{ inputs.test-subdirs }}' \
-          --test-pattern '${{ inputs.test-pattern }}' \
-          --inference-mode '${{ inputs.inference-mode }}' \
-          ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \
-          | tee pytest-${{ inputs.inference-mode }}.log
-
-
-    - name: Commit and push recordings
-      if: ${{ inputs.inference-mode == 'record' }}
-      shell: bash
-      run: |
-        echo "Checking for recording changes"
-        git status --porcelain tests/integration/recordings/
-
-        if [[ -n $(git status --porcelain tests/integration/recordings/) ]]; then
-          echo "New recordings detected, committing and pushing"
-          git add tests/integration/recordings/
-
-          if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
-            git commit -m "Recordings update from CI (vision)"
-          else
-            git commit -m "Recordings update from CI"
-          fi
-
-          git fetch origin ${{ github.ref_name }}
-          git rebase origin/${{ github.ref_name }}
-          echo "Rebased successfully"
-          git push origin HEAD:${{ github.ref_name }}
-          echo "Pushed successfully"
-        else
-          echo "No recording changes"
-        fi
-
-    - name: Write inference logs to file
-      if: ${{ always() }}
-      shell: bash
-      run: |
-        sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log || true
-
-    - name: Upload logs
-      if: ${{ always() }}
-      uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-      with:
-        name: logs-${{ github.run_id }}-${{ github.run_attempt || '' }}-${{ strategy.job-index }}
-        path: |
-          *.log
-        retention-days: 1

.github/actions/setup-ollama/action.yml (vendored, 14 changes)

@@ -1,23 +1,11 @@
 name: Setup Ollama
 description: Start Ollama
-inputs:
-  run-vision-tests:
-    description: 'Run vision tests: "true" or "false"'
-    required: false
-    default: 'false'
 runs:
   using: "composite"
   steps:
     - name: Start Ollama
       shell: bash
       run: |
-        if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
-          image="ollama-with-vision-model"
-        else
-          image="ollama-with-models"
-        fi
-
-        echo "Starting Ollama with image: $image"
-        docker run -d --name ollama -p 11434:11434 docker.io/llamastack/$image
+        docker run -d --name ollama -p 11434:11434 docker.io/leseb/ollama-with-models
         echo "Verifying Ollama status..."
         timeout 30 bash -c 'while ! curl -s -L http://127.0.0.1:11434; do sleep 1 && echo "."; done'

.github/actions/setup-runner/action.yml (vendored, 11 changes)

@@ -16,21 +16,19 @@ runs:
       uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
       with:
         python-version: ${{ inputs.python-version }}
+        activate-environment: true
         version: 0.7.6

     - name: Install dependencies
       shell: bash
       run: |
-        echo "Updating project dependencies via uv sync"
         uv sync --all-groups
-        echo "Installing ad-hoc dependencies"
-        uv pip install faiss-cpu
+        uv pip install ollama faiss-cpu

         # Install llama-stack-client-python based on the client-version input
         if [ "${{ inputs.client-version }}" = "latest" ]; then
           echo "Installing latest llama-stack-client-python from main branch"
-          uv pip install git+https://github.com/llamastack/llama-stack-client-python.git@main
+          uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
         elif [ "${{ inputs.client-version }}" = "published" ]; then
           echo "Installing published llama-stack-client-python from PyPI"
           uv pip install llama-stack-client
@@ -39,5 +37,4 @@ runs:
           exit 1
         fi

-        echo "Installed llama packages"
-        uv pip list | grep llama
+        uv pip install -e .

.github/actions/setup-test-environment/action.yml

@@ -1,66 +0,0 @@
-name: 'Setup Test Environment'
-description: 'Common setup steps for integration tests including dependencies, providers, and build'
-
-inputs:
-  python-version:
-    description: 'Python version to use'
-    required: true
-  client-version:
-    description: 'Client version (latest or published)'
-    required: true
-  provider:
-    description: 'Provider to setup (ollama or vllm)'
-    required: true
-    default: 'ollama'
-  run-vision-tests:
-    description: 'Whether to setup provider for vision tests'
-    required: false
-    default: 'false'
-  inference-mode:
-    description: 'Inference mode (record or replay)'
-    required: true
-
-runs:
-  using: 'composite'
-  steps:
-    - name: Install dependencies
-      uses: ./.github/actions/setup-runner
-      with:
-        python-version: ${{ inputs.python-version }}
-        client-version: ${{ inputs.client-version }}
-
-    - name: Setup ollama
-      if: ${{ inputs.provider == 'ollama' && inputs.inference-mode == 'record' }}
-      uses: ./.github/actions/setup-ollama
-      with:
-        run-vision-tests: ${{ inputs.run-vision-tests }}
-
-    - name: Setup vllm
-      if: ${{ inputs.provider == 'vllm' && inputs.inference-mode == 'record' }}
-      uses: ./.github/actions/setup-vllm
-
-    - name: Build Llama Stack
-      shell: bash
-      run: |
-        # Install llama-stack-client-python based on the client-version input
-        if [ "${{ inputs.client-version }}" = "latest" ]; then
-          echo "Installing latest llama-stack-client-python from main branch"
-          export LLAMA_STACK_CLIENT_DIR=git+https://github.com/llamastack/llama-stack-client-python.git@main
-        elif [ "${{ inputs.client-version }}" = "published" ]; then
-          echo "Installing published llama-stack-client-python from PyPI"
-          unset LLAMA_STACK_CLIENT_DIR
-        else
-          echo "Invalid client-version: ${{ inputs.client-version }}"
-          exit 1
-        fi
-
-        echo "Building Llama Stack"
-
-        LLAMA_STACK_DIR=. \
-          uv run --no-sync llama stack build --template ci-tests --image-type venv
-
-    - name: Configure git for commits
-      shell: bash
-      run: |
-        git config --local user.email "github-actions[bot]@users.noreply.github.com"
-        git config --local user.name "github-actions[bot]"

.github/dependabot.yml (vendored, 12 changes)

@@ -9,7 +9,6 @@ updates:
       day: "saturday"
     commit-message:
       prefix: chore(github-deps)
-
  - package-ecosystem: "uv"
    directory: "/"
    schedule:
@@ -20,14 +19,3 @@ updates:
      - python
    commit-message:
      prefix: chore(python-deps)
-
-  - package-ecosystem: npm
-    directory: "/llama_stack/ui"
-    schedule:
-      interval: "weekly"
-      day: "saturday"
-    labels:
-      - type/dependencies
-      - javascript
-    commit-message:
-      prefix: chore(ui-deps)

.github/workflows/README.md (vendored, 7 changes)

@@ -1,23 +1,22 @@
 # Llama Stack CI

-Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a table detailing what CI the project includes and the purpose.
+Llama Stack uses GitHub Actions for Continous Integration (CI). Below is a table detailing what CI the project includes and the purpose.

 | Name | File | Purpose |
 | ---- | ---- | ------- |
 | Update Changelog | [changelog.yml](changelog.yml) | Creates PR for updating the CHANGELOG.md |
+| Coverage Badge | [coverage-badge.yml](coverage-badge.yml) | Creates PR for updating the code coverage badge |
 | Installer CI | [install-script-ci.yml](install-script-ci.yml) | Test the installation script |
 | Integration Auth Tests | [integration-auth-tests.yml](integration-auth-tests.yml) | Run the integration test suite with Kubernetes authentication |
 | SqlStore Integration Tests | [integration-sql-store-tests.yml](integration-sql-store-tests.yml) | Run the integration test suite with SqlStore |
-| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suite from tests/integration in replay mode |
+| Integration Tests | [integration-tests.yml](integration-tests.yml) | Run the integration test suite with Ollama |
 | Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
 | Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
 | Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |
 | Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
-| Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |
 | Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec |
 | Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action |
 | Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module |
 | Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms |
-| UI Tests | [ui-unit-tests.yml](ui-unit-tests.yml) | Run the UI test suite |
 | Unit Tests | [unit-tests.yml](unit-tests.yml) | Run the unit test suite |
 | Update ReadTheDocs | [update-readthedocs.yml](update-readthedocs.yml) | Update the Llama Stack ReadTheDocs site |

.github/workflows/changelog.yml (vendored, 2 changes)

@@ -17,7 +17,7 @@ jobs:
       pull-requests: write # for peter-evans/create-pull-request to create a PR
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          ref: main
          fetch-depth: 0

.github/workflows/coverage-badge.yml (vendored, new file, 62 changes)

@@ -0,0 +1,62 @@
+name: Coverage Badge
+
+run-name: Creates PR for updating the code coverage badge
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - 'llama_stack/**'
+      - 'tests/unit/**'
+      - 'uv.lock'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - '.github/workflows/unit-tests.yml'
+      - '.github/workflows/coverage-badge.yml' # This workflow
+  workflow_dispatch:
+
+jobs:
+  unit-tests:
+    permissions:
+      contents: write # for peter-evans/create-pull-request to create branch
+      pull-requests: write # for peter-evans/create-pull-request to create a PR
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
+
+      - name: Run unit tests
+        run: |
+          ./scripts/unit-tests.sh
+
+      - name: Coverage Badge
+        uses: tj-actions/coverage-badge-py@1788babcb24544eb5bbb6e0d374df5d1e54e670f # v2.0.4
+
+      - name: Verify Changed files
+        uses: tj-actions/verify-changed-files@a1c6acee9df209257a246f2cc6ae8cb6581c1edf # v20.0.4
+        id: verify-changed-files
+        with:
+          files: coverage.svg
+
+      - name: Commit files
+        if: steps.verify-changed-files.outputs.files_changed == 'true'
+        run: |
+          git config --local user.email "github-actions[bot]@users.noreply.github.com"
+          git config --local user.name "github-actions[bot]"
+          git add coverage.svg
+          git commit -m "Updated coverage.svg"
+
+      - name: Create Pull Request
+        if: steps.verify-changed-files.outputs.files_changed == 'true'
+        uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          title: "ci: [Automatic] Coverage Badge Update"
+          body: |
+            This PR updates the coverage badge based on the latest coverage report.
+
+            Automatically generated by the [workflow coverage-badge.yaml](.github/workflows/coverage-badge.yaml)
+          delete-branch: true

.github/workflows/install-script-ci.yml (vendored, 7 changes)

@@ -16,22 +16,21 @@ jobs:
   lint:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # 5.0.0
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
      - name: Run ShellCheck on install.sh
        run: shellcheck scripts/install.sh
  smoke-test-on-dev:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

      - name: Build a single provider
        run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync \
-            llama stack build --template starter --image-type container --image-name test
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template starter --image-type container --image-name test

      - name: Run installer end-to-end
        run: |

.github/workflows/integration-auth-tests.yml (vendored, 3 changes)

@@ -10,7 +10,6 @@ on:
     paths:
       - 'distributions/**'
       - 'llama_stack/**'
-      - '!llama_stack/ui/**'
       - 'tests/integration/**'
       - 'uv.lock'
       - 'pyproject.toml'
@@ -31,7 +30,7 @@ jobs:

     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Install dependencies
         uses: ./.github/actions/setup-runner

@@ -44,7 +44,7 @@

     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Install dependencies
         uses: ./.github/actions/setup-runner

.github/workflows/integration-tests.yml (vendored, 146 changes)

@@ -1,23 +1,20 @@
-name: Integration Tests (Replay)
+name: Integration Tests

-run-name: Run the integration test suite from tests/integration in replay mode
+run-name: Run the integration test suite with Ollama

 on:
   push:
     branches: [ main ]
   pull_request:
     branches: [ main ]
-    types: [opened, synchronize, reopened]
     paths:
       - 'llama_stack/**'
-      - '!llama_stack/ui/**'
       - 'tests/**'
       - 'uv.lock'
       - 'pyproject.toml'
+      - 'requirements.txt'
       - '.github/workflows/integration-tests.yml' # This workflow
       - '.github/actions/setup-ollama/action.yml'
-      - '.github/actions/setup-test-environment/action.yml'
-      - '.github/actions/run-and-record-tests/action.yml'
   schedule:
     # If changing the cron schedule, update the provider in the test-matrix job
     - cron: '0 0 * * *' # (test latest client) Daily at 12 AM UTC
@@ -32,56 +29,131 @@ on:
        description: 'Test against a specific provider'
        type: string
        default: 'ollama'
-      test-subdirs:
-        description: 'Comma-separated list of test subdirectories to run'
-        type: string
-        default: ''
-      test-pattern:
-        description: 'Regex pattern to pass to pytest -k'
-        type: string
-        default: ''

 concurrency:
-  # Skip concurrency for pushes to main - each commit should be tested independently
-  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true

 jobs:
-  run-replay-mode-tests:
+  discover-tests:
+    runs-on: ubuntu-latest
+    outputs:
+      test-type: ${{ steps.generate-matrix.outputs.test-type }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Generate test matrix
+        id: generate-matrix
+        run: |
+          # Get test directories dynamically, excluding non-test directories
+          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
+                       grep -Ev "^(__pycache__|fixtures|test_cases)$" |
+                       sort | jq -R -s -c 'split("\n")[:-1]')
+          echo "test-type=$TEST_TYPES" >> $GITHUB_OUTPUT
+
+  test-matrix:
+    needs: discover-tests
     runs-on: ubuntu-latest
-    name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}

     strategy:
       fail-fast: false
       matrix:
+        test-type: ${{ fromJson(needs.discover-tests.outputs.test-type) }}
        client-type: [library, server]
        # Use vllm on weekly schedule, otherwise use test-provider input (defaults to ollama)
        provider: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-provider || 'ollama')) }}
-        # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
-        python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
-        client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
-        run-vision-tests: [true, false]
+        python-version: ["3.12", "3.13"]
+        client-version: ${{ (github.event.schedule == '0 0 * * 0' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
+        exclude: # TODO: look into why these tests are failing and fix them
+          - provider: vllm
+            test-type: safety
+          - provider: vllm
+            test-type: post_training
+          - provider: vllm
+            test-type: tool_runtime

     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

-      - name: Setup test environment
-        uses: ./.github/actions/setup-test-environment
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
         with:
           python-version: ${{ matrix.python-version }}
           client-version: ${{ matrix.client-version }}
-          provider: ${{ matrix.provider }}
-          run-vision-tests: ${{ matrix.run-vision-tests }}
-          inference-mode: 'replay'

-      - name: Run tests
-        uses: ./.github/actions/run-and-record-tests
+      - name: Setup ollama
+        if: ${{ matrix.provider == 'ollama' }}
+        uses: ./.github/actions/setup-ollama
+
+      - name: Setup vllm
+        if: ${{ matrix.provider == 'vllm' }}
+        uses: ./.github/actions/setup-vllm
+
+      - name: Build Llama Stack
+        run: |
+          uv run llama stack build --template ci-tests --image-type venv
+
+      - name: Check Storage and Memory Available Before Tests
+        if: ${{ always() }}
+        run: |
+          free -h
+          df -h
+
+      - name: Run Integration Tests
+        env:
+          LLAMA_STACK_CLIENT_TIMEOUT: "300" # Increased timeout for eval operations
+        # Use 'shell' to get pipefail behavior
+        # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#exit-codes-and-error-action-preference
+        # TODO: write a precommit hook to detect if a test contains a pipe but does not use 'shell: bash'
+        shell: bash
+        run: |
+          if [ "${{ matrix.client-type }}" == "library" ]; then
+            stack_config="ci-tests"
+          else
+            stack_config="server:ci-tests"
+          fi
+
+          EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag"
+          if [ "${{ matrix.provider }}" == "ollama" ]; then
+            export OLLAMA_URL="http://0.0.0.0:11434"
+            export TEXT_MODEL=ollama/llama3.2:3b-instruct-fp16
+            export SAFETY_MODEL="ollama/llama-guard3:1b"
+            EXTRA_PARAMS="--safety-shield=llama-guard"
+          else
+            export VLLM_URL="http://localhost:8000/v1"
+            export TEXT_MODEL=vllm/meta-llama/Llama-3.2-1B-Instruct
+            # TODO: remove the not(test_inference_store_tool_calls) once we can get the tool called consistently
+            EXTRA_PARAMS=
+            EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls"
+          fi
+
+
+          uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
+            -k "not( ${EXCLUDE_TESTS} )" \
+            --text-model=$TEXT_MODEL \
+            --embedding-model=sentence-transformers/all-MiniLM-L6-v2 \
+            --color=yes ${EXTRA_PARAMS} \
+            --capture=tee-sys | tee pytest-${{ matrix.test-type }}.log
+
+      - name: Check Storage and Memory Available After Tests
+        if: ${{ always() }}
+        run: |
+          free -h
+          df -h
+
+      - name: Write inference logs to file
+        if: ${{ always() }}
+        run: |
+          sudo docker logs ollama > ollama.log || true
+          sudo docker logs vllm > vllm.log || true
+
+      - name: Upload all logs to artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
         with:
-          test-subdirs: ${{ inputs.test-subdirs }}
-          test-pattern: ${{ inputs.test-pattern }}
-          stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
-          provider: ${{ matrix.provider }}
-          inference-mode: 'replay'
-          run-vision-tests: ${{ matrix.run-vision-tests }}
+          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.provider }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}-${{ matrix.client-version }}
+          path: |
+            *.log
+          retention-days: 1

.github/workflows/integration-vector-io-tests.yml

@@ -9,17 +9,14 @@ on:
     branches: [ main ]
     paths:
       - 'llama_stack/**'
-      - '!llama_stack/ui/**'
       - 'tests/integration/vector_io/**'
       - 'uv.lock'
       - 'pyproject.toml'
       - 'requirements.txt'
       - '.github/workflows/integration-vector-io-tests.yml' # This workflow
-  schedule:
-    - cron: '0 0 * * *' # (test on python 3.13) Daily at 12 AM UTC

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true

 jobs:
@@ -27,13 +24,13 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        vector-io-provider: ["inline::faiss", "inline::sqlite-vec", "inline::milvus", "remote::chromadb", "remote::pgvector", "remote::weaviate", "remote::qdrant"]
-        python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
+        vector-io-provider: ["inline::faiss", "inline::sqlite-vec", "inline::milvus", "remote::chromadb", "remote::pgvector"]
+        python-version: ["3.12", "3.13"]
       fail-fast: false # we want to run all tests regardless of failure

     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -51,14 +48,6 @@ jobs:
            -e ANONYMIZED_TELEMETRY=FALSE \
            chromadb/chroma:latest

-      - name: Setup Weaviate
-        if: matrix.vector-io-provider == 'remote::weaviate'
-        run: |
-          docker run --rm -d --pull always \
-            --name weaviate \
-            -p 8080:8080 -p 50051:50051 \
-            cr.weaviate.io/semitechnologies/weaviate:1.32.0
-
       - name: Start PGVector DB
         if: matrix.vector-io-provider == 'remote::pgvector'
         run: |
@@ -89,29 +78,6 @@ jobs:
           PGPASSWORD=llamastack psql -h localhost -U llamastack -d llamastack \
             -c "CREATE EXTENSION IF NOT EXISTS vector;"

-      - name: Setup Qdrant
-        if: matrix.vector-io-provider == 'remote::qdrant'
-        run: |
-          docker run --rm -d --pull always \
-            --name qdrant \
-            -p 6333:6333 \
-            qdrant/qdrant
-
-      - name: Wait for Qdrant to be ready
-        if: matrix.vector-io-provider == 'remote::qdrant'
-        run: |
-          echo "Waiting for Qdrant to be ready..."
-          for i in {1..30}; do
-            if curl -s http://localhost:6333/collections | grep -q '"status":"ok"'; then
-              echo "Qdrant is ready!"
-              exit 0
-            fi
-            sleep 2
-          done
-          echo "Qdrant failed to start"
-          docker logs qdrant
-          exit 1
-
       - name: Wait for ChromaDB to be ready
         if: matrix.vector-io-provider == 'remote::chromadb'
         run: |
@@ -127,24 +93,9 @@ jobs:
           docker logs chromadb
           exit 1

-      - name: Wait for Weaviate to be ready
-        if: matrix.vector-io-provider == 'remote::weaviate'
-        run: |
-          echo "Waiting for Weaviate to be ready..."
-          for i in {1..30}; do
-            if curl -s http://localhost:8080 | grep -q "https://weaviate.io/developers/weaviate/current/"; then
-              echo "Weaviate is ready!"
-              exit 0
-            fi
-            sleep 2
-          done
-          echo "Weaviate failed to start"
-          docker logs weaviate
-          exit 1
-
       - name: Build Llama Stack
         run: |
-          uv run --no-sync llama stack build --template ci-tests --image-type venv
+          uv run llama stack build --template ci-tests --image-type venv

       - name: Check Storage and Memory Available Before Tests
         if: ${{ always() }}
@@ -162,15 +113,10 @@ jobs:
           PGVECTOR_DB: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'llamastack' || '' }}
           PGVECTOR_USER: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'llamastack' || '' }}
           PGVECTOR_PASSWORD: ${{ matrix.vector-io-provider == 'remote::pgvector' && 'llamastack' || '' }}
-          ENABLE_QDRANT: ${{ matrix.vector-io-provider == 'remote::qdrant' && 'true' || '' }}
-          QDRANT_URL: ${{ matrix.vector-io-provider == 'remote::qdrant' && 'http://localhost:6333' || '' }}
-          ENABLE_WEAVIATE: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'true' || '' }}
-          WEAVIATE_CLUSTER_URL: ${{ matrix.vector-io-provider == 'remote::weaviate' && 'localhost:8080' || '' }}
         run: |
-          uv run --no-sync \
-            pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
+          uv run pytest -sv --stack-config="inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
            tests/integration/vector_io \
-            --embedding-model inline::sentence-transformers/all-MiniLM-L6-v2
+            --embedding-model sentence-transformers/all-MiniLM-L6-v2

       - name: Check Storage and Memory Available After Tests
         if: ${{ always() }}
@@ -188,11 +134,6 @@ jobs:
         run: |
           docker logs chromadb > chromadb.log

-      - name: Write Qdrant logs to file
-        if: ${{ always() && matrix.vector-io-provider == 'remote::qdrant' }}
-        run: |
-          docker logs qdrant > qdrant.log
-
       - name: Upload all logs to artifacts
         if: ${{ always() }}
         uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2

.github/workflows/pre-commit.yml (vendored, 17 changes)

@@ -20,7 +20,7 @@ jobs:

     steps:
       - name: Checkout code
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           # For dependabot PRs, we need to checkout with a token that can push changes
           token: ${{ github.actor == 'dependabot[bot]' && secrets.GITHUB_TOKEN || github.token }}
@@ -36,21 +36,6 @@ jobs:
             **/requirements*.txt
             .pre-commit-config.yaml

-      # npm ci may fail -
-      # npm error `npm ci` can only install packages when your package.json and package-lock.json or npm-shrinkwrap.json are in sync. Please update your lock file with `npm install` before continuing.
-      # npm error Invalid: lock file's llama-stack-client@0.2.17 does not satisfy llama-stack-client@0.2.18
-
-      # - name: Set up Node.js
-      #   uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
-      #   with:
-      #     node-version: '20'
-      #     cache: 'npm'
-      #     cache-dependency-path: 'llama_stack/ui/'
-
-      # - name: Install npm dependencies
-      #   run: npm ci
-      #   working-directory: llama_stack/ui
-
       - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
         continue-on-error: true
         env:

.github/workflows/providers-build.yml (vendored, 52 changes)

@@ -9,20 +9,20 @@ on:
     paths:
       - 'llama_stack/cli/stack/build.py'
       - 'llama_stack/cli/stack/_build.py'
-      - 'llama_stack/core/build.*'
-      - 'llama_stack/core/*.sh'
+      - 'llama_stack/distribution/build.*'
+      - 'llama_stack/distribution/*.sh'
       - '.github/workflows/providers-build.yml'
-      - 'llama_stack/distributions/**'
+      - 'llama_stack/templates/**'
       - 'pyproject.toml'

   pull_request:
     paths:
       - 'llama_stack/cli/stack/build.py'
       - 'llama_stack/cli/stack/_build.py'
-      - 'llama_stack/core/build.*'
-      - 'llama_stack/core/*.sh'
+      - 'llama_stack/distribution/build.*'
+      - 'llama_stack/distribution/*.sh'
       - '.github/workflows/providers-build.yml'
-      - 'llama_stack/distributions/**'
+      - 'llama_stack/templates/**'
       - 'pyproject.toml'

 concurrency:
@@ -33,42 +33,42 @@ jobs:
   generate-matrix:
     runs-on: ubuntu-latest
     outputs:
-      distros: ${{ steps.set-matrix.outputs.distros }}
+      templates: ${{ steps.set-matrix.outputs.templates }}
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

-      - name: Generate Distribution List
+      - name: Generate Template List
         id: set-matrix
         run: |
-          distros=$(ls llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
-          echo "distros=$distros" >> "$GITHUB_OUTPUT"
+          templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
+          echo "templates=$templates" >> "$GITHUB_OUTPUT"

   build:
     needs: generate-matrix
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        distro: ${{ fromJson(needs.generate-matrix.outputs.distros) }}
+        template: ${{ fromJson(needs.generate-matrix.outputs.templates) }}
         image-type: [venv, container]
       fail-fast: false # We want to run all jobs even if some fail

     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Install dependencies
         uses: ./.github/actions/setup-runner

       - name: Print build dependencies
         run: |
-          uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only
+          uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only

       - name: Run Llama Stack Build
         run: |
           # USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
           # LLAMA_STACK_DIR is set to the current directory so we are building from the source
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test

       - name: Print dependencies in the image
         if: matrix.image-type == 'venv'
@@ -79,7 +79,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -92,23 +92,23 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Install dependencies
         uses: ./.github/actions/setup-runner

       - name: Build a single provider
         run: |
-          yq -i '.image_type = "container"' llama_stack/distributions/ci-tests/build.yaml
-          yq -i '.image_name = "test"' llama_stack/distributions/ci-tests/build.yaml
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml
+          yq -i '.image_type = "container"' llama_stack/templates/ci-tests/build.yaml
+          yq -i '.image_name = "test"' llama_stack/templates/ci-tests/build.yaml
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/ci-tests/build.yaml

       - name: Inspect the container image entrypoint
         run: |
           IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
           entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
           echo "Entrypoint: $entrypoint"
-          if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
+          if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
             echo "Entrypoint is not correct"
             exit 1
           fi
@@ -117,32 +117,32 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Install dependencies
         uses: ./.github/actions/setup-runner

-      - name: Pin distribution to UBI9 base
+      - name: Pin template to UBI9 base
         run: |
           yq -i '
           .image_type = "container" |
           .image_name = "ubi9-test" |
           .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
-          ' llama_stack/distributions/ci-tests/build.yaml
+          ' llama_stack/templates/ci-tests/build.yaml

       - name: Build dev container (UBI9)
         env:
           USE_COPY_NOT_MOUNT: "true"
           LLAMA_STACK_DIR: "."
         run: |
-          uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml
+          uv run llama stack build --config llama_stack/templates/ci-tests/build.yaml

       - name: Inspect UBI9 image
         run: |
           IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
           entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
           echo "Entrypoint: $entrypoint"
-          if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
+          if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
             echo "Entrypoint is not correct"
             exit 1
           fi

.github/workflows/python-build-test.yml (vendored, 6 changes)

@@ -9,8 +9,6 @@ on:
   pull_request:
     branches:
       - main
-    paths-ignore:
-      - 'llama_stack/ui/**'

 jobs:
   build:
@@ -21,10 +19,10 @@ jobs:

     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Install uv
-        uses: astral-sh/setup-uv@d9e0f98d3fc6adb07d1e3d37f3043649ddad06a1 # v6.5.0
+        uses: astral-sh/setup-uv@e92bafb6253dcd438e0484186d7669ea7a8ca1cc # v6.4.3
         with:
           python-version: ${{ matrix.python-version }}
           activate-environment: true

.github/workflows/record-integration-tests.yml (vendored, 70 changes)

@@ -1,70 +0,0 @@
-# This workflow should be run manually when needing to re-record tests. This happens when you have
-# - added a new test
-# - or changed an existing test such that a new inference call is made
-# You should make a PR and then run this workflow on that PR branch. The workflow will re-record the
-# tests and commit the recordings to the PR branch.
-name: Integration Tests (Record)
-
-run-name: Run the integration test suite from tests/integration
-
-on:
-  workflow_dispatch:
-    inputs:
-      test-subdirs:
-        description: 'Comma-separated list of test subdirectories to run'
-        type: string
-        default: ''
-      test-provider:
-        description: 'Test against a specific provider'
-        type: string
-        default: 'ollama'
-      run-vision-tests:
-        description: 'Whether to run vision tests'
-        type: boolean
-        default: false
-      test-pattern:
-        description: 'Regex pattern to pass to pytest -k'
-        type: string
-        default: ''
-
-jobs:
-  record-tests:
-    runs-on: ubuntu-latest
-
-    permissions:
-      contents: write
-
-    steps:
-      - name: Echo workflow inputs
-        run: |
-          echo "::group::Workflow Inputs"
-          echo "test-subdirs: ${{ inputs.test-subdirs }}"
-          echo "test-provider: ${{ inputs.test-provider }}"
-          echo "run-vision-tests: ${{ inputs.run-vision-tests }}"
-          echo "test-pattern: ${{ inputs.test-pattern }}"
-          echo "branch: ${{ github.ref_name }}"
-          echo "::endgroup::"
-
-      - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
-        with:
-          fetch-depth: 0
-
-      - name: Setup test environment
-        uses: ./.github/actions/setup-test-environment
-        with:
-          python-version: "3.12" # Use single Python version for recording
-          client-version: "latest"
-          provider: ${{ inputs.test-provider || 'ollama' }}
-          run-vision-tests: ${{ inputs.run-vision-tests }}
-          inference-mode: 'record'
-
-      - name: Run and record tests
-        uses: ./.github/actions/run-and-record-tests
-        with:
-          test-pattern: ${{ inputs.test-pattern }}
-          test-subdirs: ${{ inputs.test-subdirs }}
-          stack-config: 'server:ci-tests' # recording must be done with server since more tests are run
-          provider: ${{ inputs.test-provider || 'ollama' }}
-          inference-mode: 'record'
-          run-vision-tests: ${{ inputs.run-vision-tests }}

.github/workflows/semantic-pr.yml (vendored, 4 changes)

@@ -11,7 +11,7 @@ on:
       - synchronize

 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
+  group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true

 permissions:
@@ -22,6 +22,6 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check PR Title's semantic conformance
-        uses: amannn/action-semantic-pull-request@7f33ba792281b034f64e96f4c0b5496782dd3b37 # v6.1.0
+        uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/test-external-provider-module.yml

@@ -12,13 +12,12 @@ on:
       - 'tests/integration/**'
       - 'uv.lock'
       - 'pyproject.toml'
+      - 'requirements.txt'
       - 'tests/external/*'
       - '.github/workflows/test-external-provider-module.yml' # This workflow

 jobs:
   test-external-providers-from-module:
-    # This workflow is disabled. See https://github.com/meta-llama/llama-stack/pull/2975#issuecomment-3138702984 for details
-    if: false
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -27,7 +26,7 @@ jobs:
       # container and point 'uv pip install' to the correct path...
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -48,7 +47,7 @@ jobs:

       - name: Build distro from config file
         run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/ramalama-stack/build.yaml
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. llama stack build --config tests/external/ramalama-stack/build.yaml

       - name: Start Llama Stack server in background
         if: ${{ matrix.image-type }} == 'venv'
7 .github/workflows/test-external.yml (vendored)
@@ -9,7 +9,6 @@ on:
     branches: [ main ]
     paths:
       - 'llama_stack/**'
-      - '!llama_stack/ui/**'
       - 'tests/integration/**'
       - 'uv.lock'
       - 'pyproject.toml'
@@ -27,7 +26,7 @@ jobs:
     # container and point 'uv pip install' to the correct path...
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
@@ -44,11 +43,11 @@ jobs:

       - name: Print distro dependencies
         run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml --print-deps-only
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. llama stack build --config tests/external/build.yaml --print-deps-only

       - name: Build distro from config file
         run: |
-          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml
+          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. llama stack build --config tests/external/build.yaml

       - name: Start Llama Stack server in background
         if: ${{ matrix.image-type }} == 'venv'
55 .github/workflows/ui-unit-tests.yml (vendored)
@@ -1,55 +0,0 @@
name: UI Tests

run-name: Run the UI test suite

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    paths:
      - 'llama_stack/ui/**'
      - '.github/workflows/ui-unit-tests.yml' # This workflow
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  ui-tests:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        node-version: [22]

    steps:
      - name: Checkout repository
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

      - name: Setup Node.js
        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
        with:
          node-version: ${{ matrix.node-version }}
          cache: 'npm'
          cache-dependency-path: 'llama_stack/ui/package-lock.json'

      - name: Install dependencies
        working-directory: llama_stack/ui
        run: npm ci

      - name: Run linting
        working-directory: llama_stack/ui
        run: npm run lint

      - name: Run format check
        working-directory: llama_stack/ui
        run: npm run format:check

      - name: Run unit tests
        working-directory: llama_stack/ui
        env:
          CI: true

        run: npm test -- --coverage --watchAll=false --passWithNoTests
3 .github/workflows/unit-tests.yml (vendored)
@@ -9,7 +9,6 @@ on:
     branches: [ main ]
     paths:
       - 'llama_stack/**'
-      - '!llama_stack/ui/**'
       - 'tests/unit/**'
       - 'uv.lock'
       - 'pyproject.toml'
@@ -32,7 +31,7 @@ jobs:
           - "3.13"
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
2 .github/workflows/update-readthedocs.yml (vendored)
@@ -37,7 +37,7 @@ jobs:
       TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
     steps:
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Install dependencies
         uses: ./.github/actions/setup-runner
.pre-commit-config.yaml
@@ -2,7 +2,6 @@ exclude: 'build/'

 default_language_version:
     python: python3.12
-    node: "22"

 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
@@ -146,50 +145,6 @@ repos:
         pass_filenames: false
         require_serial: true
         files: ^.github/workflows/.*$
-      # ui-prettier and ui-eslint are disabled until we can avoid `npm ci`, which is slow and may fail -
-      # npm error `npm ci` can only install packages when your package.json and package-lock.json or npm-shrinkwrap.json are in sync. Please update your lock file with `npm install` before continuing.
-      # npm error Invalid: lock file's llama-stack-client@0.2.17 does not satisfy llama-stack-client@0.2.18
-      # and until we have infra for installing prettier and next via npm -
-      # Lint UI code with ESLint.....................................................Failed
-      # - hook id: ui-eslint
-      # - exit code: 127
-      # > ui@0.1.0 lint
-      # > next lint --fix --quiet
-      # sh: line 1: next: command not found
-      #
-      # - id: ui-prettier
-      #   name: Format UI code with Prettier
-      #   entry: bash -c 'cd llama_stack/ui && npm ci && npm run format'
-      #   language: system
-      #   files: ^llama_stack/ui/.*\.(ts|tsx)$
-      #   pass_filenames: false
-      #   require_serial: true
-      # - id: ui-eslint
-      #   name: Lint UI code with ESLint
-      #   entry: bash -c 'cd llama_stack/ui && npm run lint -- --fix --quiet'
-      #   language: system
-      #   files: ^llama_stack/ui/.*\.(ts|tsx)$
-      #   pass_filenames: false
-      #   require_serial: true
-
-      - id: check-log-usage
-        name: Ensure 'llama_stack.log' usage for logging
-        entry: bash
-        language: system
-        types: [python]
-        pass_filenames: true
-        args:
-          - -c
-          - |
-            matches=$(grep -EnH '^[^#]*\b(import\s+logging|from\s+logging\b)' "$@" | grep -v -e '#\s*allow-direct-logging' || true)
-            if [ -n "$matches" ]; then
-              # GitHub Actions annotation format
-              while IFS=: read -r file line_num rest; do
-                echo "::error file=$file,line=$line_num::Do not use 'import logging' or 'from logging import' in $file. Use the custom log instead: from llama_stack.log import get_logger; logger = get_logger(). If direct logging is truly needed, add: # allow-direct-logging"
-              done <<< "$matches"
-              exit 1
-            fi
-            exit 0

 ci:
     autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
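The `check-log-usage` hook above only flags direct `logging` imports; the fix it prescribes is the project logger from `llama_stack.log`. A minimal sketch of the compliant pattern, mirroring the hook's error message (the bare `get_logger()` call is taken from that message; the real helper may accept extra arguments such as a name or category):

```python
# Sketch of the logging pattern the check-log-usage hook asks for.
# `get_logger()` is written exactly as in the hook's error message;
# the actual helper in llama_stack.log may accept additional arguments.
from llama_stack.log import get_logger

logger = get_logger()


def load_run_config(path: str) -> None:
    # Preferred: route messages through the project logger.
    logger.info(f"loading run config from {path}")

    # Flagged by the hook unless annotated with `# allow-direct-logging`:
    #   import logging
    #   logging.getLogger(__name__).info("...")
```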
@@ -451,7 +451,7 @@ GenAI application developers need more than just an LLM - they need to integrate

 Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs by both AI developers and from partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety.

-With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and custom tool calling. You can use telemetry to inspect those traces, and convert telemetry into evals datasets. And with Llama Stack’s plugin architecture and prepackage distributions, you choose to run your agent anywhere - in the cloud with our partners, deploy your own environment using virtualenv or Docker, operate locally with Ollama, or even run on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.
+With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and custom tool calling. You can use telemetry to inspect those traces, and convert telemetry into evals datasets. And with Llama Stack’s plugin architecture and prepackage distributions, you choose to run your agent anywhere - in the cloud with our partners, deploy your own environment using virtualenv, conda, or Docker, operate locally with Ollama, or even run on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.

 ## Release
 After iterating on the APIs for the last 3 months, today we’re launching a stable release (V1) of the Llama Stack APIs and the corresponding llama-stack server and client packages(v0.1.0). We now have automated tests for providers. These tests make sure that all provider implementations are verified. Developers can now easily and reliably select distributions or providers based on their specific requirements.
174 CONTRIBUTING.md
@@ -1,82 +1,13 @@
-# Contributing to Llama Stack
+# Contributing to Llama-Stack
 We want to make contributing to this project as easy and transparent as
 possible.

-## Set up your development environment
-
-We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
-You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
-
-You can install the dependencies by running:
-
-```bash
-cd llama-stack
-uv sync --group dev
-uv pip install -e .
-source .venv/bin/activate
-```
-
-```{note}
-You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`).
-Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
-For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
-```
-
-Note that you can create a dotenv file `.env` that includes necessary environment variables:
-```
-LLAMA_STACK_BASE_URL=http://localhost:8321
-LLAMA_STACK_CLIENT_LOG=debug
-LLAMA_STACK_PORT=8321
-LLAMA_STACK_CONFIG=<provider-name>
-TAVILY_SEARCH_API_KEY=
-BRAVE_SEARCH_API_KEY=
-```
-
-And then use this dotenv file when running client SDK tests via the following:
-```bash
-uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
-```
-
-### Pre-commit Hooks
-
-We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
-
-```bash
-uv run pre-commit install
-```
-
-After that, pre-commit hooks will run automatically before each commit.
-
-Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
-
-```bash
-uv run pre-commit run --all-files
-```
-
-```{caution}
-Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
-```
-
 ## Discussions -> Issues -> Pull Requests

 We actively welcome your pull requests. However, please read the following. This is heavily inspired by [Ghostty](https://github.com/ghostty-org/ghostty/blob/main/CONTRIBUTING.md).

 If in doubt, please open a [discussion](https://github.com/meta-llama/llama-stack/discussions); we can always convert that to an issue later.

-### Issues
-We use GitHub issues to track public bugs. Please ensure your description is
-clear and has sufficient instructions to be able to reproduce the issue.
-
-Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
-disclosure of security bugs. In those cases, please go through the process
-outlined on that page and do not file a public issue.
-
-### Contributor License Agreement ("CLA")
-In order to accept your pull request, we need you to submit a CLA. You only need
-to do this once to work on any of Meta's open source projects.
-
-Complete your CLA here: <https://code.facebook.com/cla>
-
 **I'd like to contribute!**

 If you are new to the project, start by looking at the issues tagged with "good first issue". If you're interested
@@ -120,15 +51,93 @@ Please avoid picking up too many issues at once. This helps you stay focused and

 Please keep pull requests (PRs) small and focused. If you have a large set of changes, consider splitting them into logically grouped, smaller PRs to facilitate review and testing.

-```{tip}
-As a general guideline:
-- Experienced contributors should try to keep no more than 5 open PRs at a time.
-- New contributors are encouraged to have only one open PR at a time until they’re familiar with the codebase and process.
+> [!TIP]
+> As a general guideline:
+> - Experienced contributors should try to keep no more than 5 open PRs at a time.
+> - New contributors are encouraged to have only one open PR at a time until they’re familiar with the codebase and process.

+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Meta's open source projects.
+
+Complete your CLA here: <https://code.facebook.com/cla>
+
+## Issues
+We use GitHub issues to track public bugs. Please ensure your description is
+clear and has sufficient instructions to be able to reproduce the issue.
+
+Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
+disclosure of security bugs. In those cases, please go through the process
+outlined on that page and do not file a public issue.
+
+
+## Set up your development environment
+
+We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
+You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
+
+You can install the dependencies by running:
+
+```bash
+cd llama-stack
+uv sync --group dev
+uv pip install -e .
+source .venv/bin/activate
 ```

-## Repository guidelines
+> [!NOTE]
+> You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`)
+> Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
+> For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).

-### Coding Style
+Note that you can create a dotenv file `.env` that includes necessary environment variables:
+```
+LLAMA_STACK_BASE_URL=http://localhost:8321
+LLAMA_STACK_CLIENT_LOG=debug
+LLAMA_STACK_PORT=8321
+LLAMA_STACK_CONFIG=<provider-name>
+TAVILY_SEARCH_API_KEY=
+BRAVE_SEARCH_API_KEY=
+```
+
+And then use this dotenv file when running client SDK tests via the following:
+```bash
+uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
+```
+
+## Pre-commit Hooks
+
+We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
+
+```bash
+uv run pre-commit install
+```
+
+After that, pre-commit hooks will run automatically before each commit.
+
+Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
+
+```bash
+uv run pre-commit run --all-files
+```
+
+> [!CAUTION]
+> Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
+
+## Running tests
+
+You can find the Llama Stack testing documentation [here](https://github.com/meta-llama/llama-stack/blob/main/tests/README.md).
+
+## Adding a new dependency to the project
+
+To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run:
+
+```bash
+uv add foo
+uv sync
+```
+
+## Coding Style

 * Comments should provide meaningful insights into the code. Avoid filler comments that simply
   describe the next step, as they create unnecessary clutter, same goes for docstrings.
@@ -148,11 +157,6 @@ As a general guideline:
   that describes the configuration. These descriptions will be used to generate the provider
   documentation.
 * When possible, use keyword arguments only when calling functions.
-* Llama Stack utilizes [custom Exception classes](llama_stack/apis/common/errors.py) for certain Resources that should be used where applicable.
-
-### License
-By contributing to Llama, you agree that your contributions will be licensed
-under the LICENSE file in the root directory of this source tree.

 ## Common Tasks

@@ -160,7 +164,7 @@ Some tips about common tasks you work on while contributing to Llama Stack:

 ### Using `llama stack build`

-Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
+Building a stack image (conda / docker) will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.

 Example:
 ```bash
@@ -168,7 +172,7 @@ cd work/
 git clone https://github.com/meta-llama/llama-stack.git
 git clone https://github.com/meta-llama/llama-stack-client-python.git
 cd llama-stack
-LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...>
+LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --template <...>
 ```

 ### Updating distribution configurations
@@ -206,3 +210,7 @@ uv run ./docs/openapi_generator/run_openapi_generator.sh
 ```

 The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.
+
+## License
+By contributing to Llama, you agree that your contributions will be licensed
+under the LICENSE file in the root directory of this source tree.
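The coding-style bullet removed in the hunk above points contributors at the custom Exception classes in `llama_stack/apis/common/errors.py`. As a rough, hypothetical sketch of the intent only (the class and function names below are illustrative and are not taken from that file):

```python
# Hypothetical illustration of the "use custom Exception classes" guideline.
# ModelNotFoundError and resolve_model are made-up names for this sketch;
# the real classes live in llama_stack/apis/common/errors.py.
class ModelNotFoundError(ValueError):
    """Raised when a requested model identifier is not registered."""

    def __init__(self, model_id: str) -> None:
        super().__init__(f"Model '{model_id}' is not registered with the stack.")


def resolve_model(model_id: str, registry: dict[str, dict]) -> dict:
    if model_id not in registry:
        # Preferred over raising a bare ValueError: callers can catch the
        # resource-specific error type.
        raise ModelNotFoundError(model_id)
    return registry[model_id]
```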
MANIFEST.in
@@ -1,9 +1,9 @@
 include pyproject.toml
 include llama_stack/models/llama/llama3/tokenizer.model
 include llama_stack/models/llama/llama4/tokenizer.model
-include llama_stack/core/*.sh
+include llama_stack/distribution/*.sh
 include llama_stack/cli/scripts/*.sh
-include llama_stack/distributions/*/*.yaml
+include llama_stack/templates/*/*.yaml
 include llama_stack/providers/tests/test_cases/inference/*.json
 include llama_stack/models/llama/*/*.md
 include llama_stack/tests/integration/*.jpg
22 README.md
@@ -6,10 +6,10 @@
 [](https://discord.gg/llama-stack)
 [](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
 [](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)
+

 [**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)


 ### ✨🎉 Llama 4 Support 🎉✨
 We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.

@@ -112,7 +112,7 @@ Here is a list of the various API providers and available distributions that can
 Please checkout for [full list](https://llama-stack.readthedocs.io/en/latest/providers/index.html)

 | API Provider Builder | Environments | Agents | Inference | VectorIO | Safety | Telemetry | Post Training | Eval | DatasetIO |
-|:--------------------:|:------------:|:------:|:---------:|:--------:|:------:|:---------:|:-------------:|:----:|:--------:|
+|:-------------------:|:------------:|:------:|:---------:|:--------:|:------:|:---------:|:-------------:|:----:|:--------:|
 | Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | SambaNova | Hosted | | ✅ | | ✅ | | | | |
 | Cerebras | Hosted | | ✅ | | | | | | |
@@ -124,10 +124,6 @@ Please checkout for [full list](https://llama-stack.readthedocs.io/en/latest/providers/index.html)
 | TGI | Hosted/Single Node | | ✅ | | | | | | |
 | NVIDIA NIM | Hosted/Single Node | | ✅ | | ✅ | | | | |
 | ChromaDB | Hosted/Single Node | | | ✅ | | | | | |
-| Milvus | Hosted/Single Node | | | ✅ | | | | | |
-| Qdrant | Hosted/Single Node | | | ✅ | | | | | |
-| Weaviate | Hosted/Single Node | | | ✅ | | | | | |
-| SQLite-vec | Single Node | | | ✅ | | | | | |
 | PG Vector | Single Node | | | ✅ | | | | | |
 | PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | | | |
 | vLLM | Single Node | | ✅ | | | | | | |
@@ -180,17 +176,3 @@ Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest
 Check out our client SDKs for connecting to a Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [typescript](https://github.com/meta-llama/llama-stack-client-typescript), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.

 You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.


-## 🌟 GitHub Star History
+## Star History
-
-[](https://www.star-history.com/#meta-llama/llama-stack&Date)
-
-## ✨ Contributors
-
-Thanks to all of our amazing contributors!
-
-<a href="https://github.com/meta-llama/llama-stack/graphs/contributors">
-  <img src="https://contrib.rocks/image?repo=meta-llama/llama-stack" />
-</a>
14 docs/_static/js/keyboard_shortcuts.js (vendored)
@@ -1,14 +0,0 @@
document.addEventListener('keydown', function(event) {
  // command+K or ctrl+K
  if ((event.metaKey || event.ctrlKey) && event.key === 'k') {
    event.preventDefault();
    document.querySelector('.search-input, .search-field, input[name="q"]').focus();
  }

  // forward slash
  if (event.key === '/' &&
      !event.target.matches('input, textarea, select')) {
    event.preventDefault();
    document.querySelector('.search-input, .search-field, input[name="q"]').focus();
  }
});
2552 docs/_static/llama-stack-spec.html (vendored; file diff suppressed because it is too large)
1612 docs/_static/llama-stack-spec.yaml (vendored; file diff suppressed because it is too large)
@@ -123,7 +123,7 @@
 " del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
 "\n",
 "# this command installs all the dependencies needed for the llama stack server with the together inference provider\n",
-"!uv run --with llama-stack llama stack build --distro together --image-type venv \n",
+"!uv run --with llama-stack llama stack build --template together --image-type venv \n",
 "\n",
 "def run_llama_stack_server_background():\n",
 " log_file = open(\"llama_stack_server.log\", \"w\")\n",
@@ -165,7 +165,7 @@
 "# use this helper if needed to kill the server \n",
 "def kill_llama_stack_server():\n",
 " # Kill any existing llama stack server processes\n",
-" os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
+" os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n"
 ]
 },
 {
@@ -233,7 +233,7 @@
 " del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
 "\n",
 "# this command installs all the dependencies needed for the llama stack server \n",
-"!uv run --with llama-stack llama stack build --distro meta-reference-gpu --image-type venv \n",
+"!uv run --with llama-stack llama stack build --template meta-reference-gpu --image-type venv \n",
 "\n",
 "def run_llama_stack_server_background():\n",
 " log_file = open(\"llama_stack_server.log\", \"w\")\n",
@@ -275,7 +275,7 @@
 "# use this helper if needed to kill the server \n",
 "def kill_llama_stack_server():\n",
 " # Kill any existing llama stack server processes\n",
-" os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
+" os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n"
 ]
 },
 {
@@ -223,7 +223,7 @@
 " del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
 "\n",
 "# this command installs all the dependencies needed for the llama stack server \n",
-"!uv run --with llama-stack llama stack build --distro llama_api --image-type venv \n",
+"!uv run --with llama-stack llama stack build --template llama_api --image-type venv \n",
 "\n",
 "def run_llama_stack_server_background():\n",
 " log_file = open(\"llama_stack_server.log\", \"w\")\n",
@@ -265,7 +265,7 @@
 "# use this helper if needed to kill the server \n",
 "def kill_llama_stack_server():\n",
 " # Kill any existing llama stack server processes\n",
-" os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
+" os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n"
 ]
 },
 {
@@ -37,7 +37,7 @@
 "\n",
 "To learn more about torchtune: https://github.com/pytorch/torchtune\n",
 "\n",
-"We will use [experimental-post-training](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/distributions/experimental-post-training) as the distribution template\n",
+"We will use [experimental-post-training](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/templates/experimental-post-training) as the distribution template\n",
 "\n",
 "#### 0.0. Prerequisite: Have an OpenAI API key\n",
 "In this showcase, we will use [braintrust](https://www.braintrust.dev/) as scoring provider for eval and it uses OpenAI model as judge model for scoring. So, you need to get an API key from [OpenAI developer platform](https://platform.openai.com/docs/overview).\n",
@@ -2864,7 +2864,7 @@
 }
 ],
 "source": [
-"!llama stack build --distro experimental-post-training --image-type venv --image-name __system__"
+"!llama stack build --template experimental-post-training --image-type venv --image-name __system__"
 ]
 },
 {
|
||||||
"INFO:datasets:Duckdb version 1.1.3 available.\n",
|
"INFO:datasets:Duckdb version 1.1.3 available.\n",
|
||||||
"INFO:datasets:TensorFlow version 2.18.0 available.\n",
|
"INFO:datasets:TensorFlow version 2.18.0 available.\n",
|
||||||
"INFO:datasets:JAX version 0.4.33 available.\n",
|
"INFO:datasets:JAX version 0.4.33 available.\n",
|
||||||
"INFO:llama_stack.core.stack:Scoring_fns: basic::equality served by basic\n",
|
"INFO:llama_stack.distribution.stack:Scoring_fns: basic::equality served by basic\n",
|
||||||
"INFO:llama_stack.core.stack:Scoring_fns: basic::subset_of served by basic\n",
|
"INFO:llama_stack.distribution.stack:Scoring_fns: basic::subset_of served by basic\n",
|
||||||
"INFO:llama_stack.core.stack:Scoring_fns: basic::regex_parser_multiple_choice_answer served by basic\n",
|
"INFO:llama_stack.distribution.stack:Scoring_fns: basic::regex_parser_multiple_choice_answer served by basic\n",
|
||||||
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::factuality served by braintrust\n",
|
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::factuality served by braintrust\n",
|
||||||
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::answer-correctness served by braintrust\n",
|
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::answer-correctness served by braintrust\n",
|
||||||
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::answer-relevancy served by braintrust\n",
|
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::answer-relevancy served by braintrust\n",
|
||||||
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::answer-similarity served by braintrust\n",
|
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::answer-similarity served by braintrust\n",
|
||||||
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::faithfulness served by braintrust\n",
|
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::faithfulness served by braintrust\n",
|
||||||
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::context-entity-recall served by braintrust\n",
|
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::context-entity-recall served by braintrust\n",
|
||||||
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::context-precision served by braintrust\n",
|
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::context-precision served by braintrust\n",
|
||||||
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::context-recall served by braintrust\n",
|
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::context-recall served by braintrust\n",
|
||||||
"INFO:llama_stack.core.stack:Scoring_fns: braintrust::context-relevancy served by braintrust\n",
|
"INFO:llama_stack.distribution.stack:Scoring_fns: braintrust::context-relevancy served by braintrust\n",
|
||||||
"INFO:llama_stack.core.stack:\n"
|
"INFO:llama_stack.distribution.stack:\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -3448,7 +3448,7 @@
|
||||||
"\n",
|
"\n",
|
||||||
"os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n",
|
"os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
|
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
|
||||||
"client = LlamaStackAsLibraryClient(\"experimental-post-training\")\n",
|
"client = LlamaStackAsLibraryClient(\"experimental-post-training\")\n",
|
||||||
"_ = client.initialize()"
|
"_ = client.initialize()"
|
||||||
]
|
]
|
||||||
|
|
|
@@ -38,7 +38,7 @@
 "source": [
 "# NBVAL_SKIP\n",
 "!pip install -U llama-stack\n",
-"!UV_SYSTEM_PYTHON=1 llama stack build --distro fireworks --image-type venv"
+"!UV_SYSTEM_PYTHON=1 llama stack build --template fireworks --image-type venv"
 ]
 },
 {
@@ -48,7 +48,7 @@
 "outputs": [],
 "source": [
 "from llama_stack_client import LlamaStackClient, Agent\n",
-"from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
+"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
 "from rich.pretty import pprint\n",
 "import json\n",
 "import uuid\n",
@@ -57,7 +57,7 @@
 "outputs": [],
 "source": [
 "# NBVAL_SKIP\n",
-"!UV_SYSTEM_PYTHON=1 llama stack build --distro together --image-type venv"
+"!UV_SYSTEM_PYTHON=1 llama stack build --template together --image-type venv"
 ]
 },
 {
@@ -661,7 +661,7 @@
 "except ImportError:\n",
 " print(\"Not in Google Colab environment\")\n",
 "\n",
-"from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
+"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
 "\n",
 "client = LlamaStackAsLibraryClient(\"together\")\n",
 "_ = client.initialize()"
@@ -35,7 +35,7 @@
 ],
 "source": [
 "from llama_stack_client import LlamaStackClient, Agent\n",
-"from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
+"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
 "from rich.pretty import pprint\n",
 "import json\n",
 "import uuid\n",
@@ -92,7 +92,7 @@
 "metadata": {},
 "source": [
 "```bash\n",
-"LLAMA_STACK_DIR=$(pwd) llama stack build --distro nvidia --image-type venv\n",
+"LLAMA_STACK_DIR=$(pwd) llama stack build --template nvidia --image-type venv\n",
 "```"
 ]
 },
@@ -194,7 +194,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
+"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
 "\n",
 "client = LlamaStackAsLibraryClient(\"nvidia\")\n",
 "client.initialize()"
@@ -81,7 +81,7 @@
 "metadata": {},
 "source": [
 "```bash\n",
-"LLAMA_STACK_DIR=$(pwd) llama stack build --distro nvidia --image-type venv\n",
+"LLAMA_STACK_DIR=$(pwd) llama stack build --template nvidia --image-type venv\n",
 "```"
 ]
 },
@@ -56,7 +56,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
+"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
 "\n",
 "client = LlamaStackAsLibraryClient(\"nvidia\")\n",
 "client.initialize()"
@@ -56,7 +56,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
+"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
 "\n",
 "client = LlamaStackAsLibraryClient(\"nvidia\")\n",
 "client.initialize()"
@@ -56,7 +56,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from llama_stack.core.library_client import LlamaStackAsLibraryClient\n",
+"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
 "\n",
 "client = LlamaStackAsLibraryClient(\"nvidia\")\n",
 "client.initialize()"
@@ -1 +1 @@
-The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack.core/server/endpoints.py` using the `generate.py` utility.
+The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/distribution/server/endpoints.py` using the `generate.py` utility.
@@ -17,7 +17,7 @@ import fire
 import ruamel.yaml as yaml

 from llama_stack.apis.version import LLAMA_STACK_API_VERSION # noqa: E402
-from llama_stack.core.stack import LlamaStack # noqa: E402
+from llama_stack.distribution.stack import LlamaStack # noqa: E402

 from .pyopenapi.options import Options # noqa: E402
 from .pyopenapi.specification import Info, Server # noqa: E402
@@ -12,7 +12,7 @@ from typing import TextIO
 from typing import Any, List, Optional, Union, get_type_hints, get_origin, get_args

 from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
-from llama_stack.core.resolver import api_protocol_map
+from llama_stack.distribution.resolver import api_protocol_map

 from .generator import Generator
 from .options import Options
@@ -73,7 +73,7 @@ The API is defined in the [YAML](_static/llama-stack-spec.yaml) and [HTML](_stat

 To prove out the API, we implemented a handful of use cases to make things more concrete. The [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps) repository contains [6 different examples](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) ranging from very basic to a multi turn agent.

-There is also a sample inference endpoint implementation in the [llama-stack](https://github.com/meta-llama/llama-stack/blob/main/llama_stack.core/server/server.py) repository.
+There is also a sample inference endpoint implementation in the [llama-stack](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/distribution/server/server.py) repository.

 ## Limitations

@@ -145,12 +145,12 @@
 " del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
 "\n",
 "# this command installs all the dependencies needed for the llama stack server with the ollama inference provider\n",
-"!uv run --with llama-stack llama stack build --distro starter --image-type venv\n",
+"!uv run --with llama-stack llama stack build --template starter --image-type venv\n",
 "\n",
 "def run_llama_stack_server_background():\n",
 " log_file = open(\"llama_stack_server.log\", \"w\")\n",
 " process = subprocess.Popen(\n",
-" f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter --image-type venv",
+" f\"uv run --with llama-stack llama stack run starter --image-type venv --env INFERENCE_MODEL=llama3.2:3b\",\n",
 " shell=True,\n",
 " stdout=log_file,\n",
 " stderr=log_file,\n",
@@ -187,7 +187,7 @@
 "# use this helper if needed to kill the server \n",
 "def kill_llama_stack_server():\n",
 " # Kill any existing llama stack server processes\n",
-" os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
+" os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n"
 ]
 },
 {
@@ -1,7 +1,3 @@
----
-orphan: true
----
-
 # inline::meta-reference

 ## Description
@@ -1,7 +1,3 @@
----
-orphan: true
----
-
 # remote::nvidia

 ## Description
@@ -43,7 +43,7 @@ We have built-in functionality to run the supported open-benckmarks using llama-

 Spin up llama stack server with 'open-benchmark' template
 ```
-llama stack run llama_stack/distributions/open-benchmark/run.yaml
+llama stack run llama_stack/templates/open-benchmark/run.yaml

 ```

@@ -23,7 +23,7 @@ To use the HF SFTTrainer in your Llama Stack project, follow these steps:
 You can access the HuggingFace trainer via the `ollama` distribution:

 ```bash
-llama stack build --distro starter --image-type venv
+llama stack build --template starter --image-type venv
 llama stack run --image-type venv ~/.llama/distributions/ollama/ollama-run.yaml
 ```

@@ -1,7 +1,3 @@
----
-orphan: true
----
-
 # inline::huggingface

 ## Description
@@ -1,7 +1,3 @@
----
-orphan: true
----
-
 # inline::torchtune

 ## Description
@@ -1,7 +1,3 @@
----
-orphan: true
----
-
 # remote::nvidia

 ## Description
@@ -1,7 +1,3 @@
----
-orphan: true
----
-
 # inline::basic

 ## Description
@@ -1,7 +1,3 @@
----
-orphan: true
----
-
 # inline::braintrust

 ## Description
@@ -1,7 +1,3 @@
----
-orphan: true
----
-
 # inline::llm-as-judge

 ## Description
@@ -111,7 +111,7 @@ name = "llama-stack-api-weather"
 version = "0.1.0"
 description = "Weather API for Llama Stack"
 readme = "README.md"
-requires-python = ">=3.12"
+requires-python = ">=3.10"
 dependencies = ["llama-stack", "pydantic"]

 [build-system]
@@ -231,7 +231,7 @@ name = "llama-stack-provider-kaze"
 version = "0.1.0"
 description = "Kaze weather provider for Llama Stack"
 readme = "README.md"
-requires-python = ">=3.12"
+requires-python = ">=3.10"
 dependencies = ["llama-stack", "pydantic", "aiohttp"]

 [build-system]
@@ -355,7 +355,7 @@ server:
 8. Run the server:

 ```bash
-python -m llama_stack.core.server.server --yaml-config ~/.llama/run-byoa.yaml
+python -m llama_stack.distribution.server.server --yaml-config ~/.llama/run-byoa.yaml
 ```

 9. Test the API:
@ -97,11 +97,11 @@ To start the Llama Stack Playground, run the following commands:
|
||||||
1. Start up the Llama Stack API server
|
1. Start up the Llama Stack API server
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
llama stack build --distro together --image-type venv
|
llama stack build --template together --image-type conda
|
||||||
llama stack run together
|
llama stack run together
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Start Streamlit UI
|
2. Start Streamlit UI
|
||||||
```bash
|
```bash
|
||||||
uv run --with ".[ui]" streamlit run llama_stack/core/ui/app.py
|
uv run --with ".[ui]" streamlit run llama_stack/distribution/ui/app.py
|
||||||
```
|
```
|
||||||
|
|
|
@ -2,9 +2,7 @@
|
||||||
|
|
||||||
Llama Stack (LLS) provides two different APIs for building AI applications with tool calling capabilities: the **Agents API** and the **OpenAI Responses API**. While both enable AI systems to use tools and maintain full conversation history, they serve different use cases and have distinct characteristics.
|
Llama Stack (LLS) provides two different APIs for building AI applications with tool calling capabilities: the **Agents API** and the **OpenAI Responses API**. While both enable AI systems to use tools and maintain full conversation history, they serve different use cases and have distinct characteristics.
|
||||||
|
|
||||||
```{note}
|
> **Note:** For simple and basic inferencing, you may want to use the [Chat Completions API](https://llama-stack.readthedocs.io/en/latest/providers/index.html#chat-completions) directly, before progressing to Agents or Responses API.
|
||||||
For simple and basic inferencing, you may want to use the [Chat Completions API](https://llama-stack.readthedocs.io/en/latest/providers/index.html#chat-completions) directly, before progressing to Agents or Responses API.
|
|
||||||
```
|
|
||||||
|
|
||||||
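For basic inference along those lines, the OpenAI-compatible endpoint exposed by a running stack can be used directly with the stock `openai` client; the base URL, port, and model id below are assumptions for a local setup.

```python
from openai import OpenAI

# Llama Stack serves an OpenAI-compatible API; path and port are assumed here.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

response = client.chat.completions.create(
    model="meta-llama/Llama-3.2-3B-Instruct",  # assumed model id
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.choices[0].message.content)
```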
## Overview
|
## Overview
|
||||||
|
|
||||||
|
|
|
@ -76,9 +76,7 @@ Features:
|
||||||
- Context retrieval with token limits
|
- Context retrieval with token limits
|
||||||
|
|
||||||
|
|
||||||
```{note}
|
> **Note:** By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, which are provided by the tavily-search, wolfram-alpha and rag providers.
|
||||||
By default, llama stack run.yaml defines toolgroups for web search, wolfram alpha and rag, which are provided by the tavily-search, wolfram-alpha and rag providers.
|
|
||||||
```
|
|
||||||
|
|
||||||
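To see which of those default toolgroups a running stack actually exposes, they can be listed (and extra ones registered) through the client. A minimal sketch, assuming a local server and that the matching tool runtime providers are enabled:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local server

# Inspect the toolgroups defined by the distribution's run.yaml.
for toolgroup in client.toolgroups.list():
    print(toolgroup.identifier, "->", toolgroup.provider_id)

# Registering a builtin toolgroup against its provider (assumes tavily-search is configured).
client.toolgroups.register(
    toolgroup_id="builtin::websearch",
    provider_id="tavily-search",
)
```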
## Model Context Protocol (MCP)
|
## Model Context Protocol (MCP)
|
||||||
|
|
||||||
|
|
|
@ -18,4 +18,3 @@ We are working on adding a few more APIs to complete the application lifecycle.
|
||||||
- **Batch Inference**: run inference on a dataset of inputs
|
- **Batch Inference**: run inference on a dataset of inputs
|
||||||
- **Batch Agents**: run agents on a dataset of inputs
|
- **Batch Agents**: run agents on a dataset of inputs
|
||||||
- **Synthetic Data Generation**: generate synthetic data for model development
|
- **Synthetic Data Generation**: generate synthetic data for model development
|
||||||
- **Batches**: OpenAI-compatible batch management for inference
|
|
||||||
|
|
|
@ -131,7 +131,6 @@ html_static_path = ["../_static"]
|
||||||
def setup(app):
|
def setup(app):
|
||||||
app.add_css_file("css/my_theme.css")
|
app.add_css_file("css/my_theme.css")
|
||||||
app.add_js_file("js/detect_theme.js")
|
app.add_js_file("js/detect_theme.js")
|
||||||
app.add_js_file("js/keyboard_shortcuts.js")
|
|
||||||
|
|
||||||
def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
|
def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
|
||||||
url = f"https://hub.docker.com/r/llamastack/{text}"
|
url = f"https://hub.docker.com/r/llamastack/{text}"
|
||||||
|
|
|
@ -2,38 +2,13 @@
|
||||||
```{include} ../../../CONTRIBUTING.md
|
```{include} ../../../CONTRIBUTING.md
|
||||||
```
|
```
|
||||||
|
|
||||||
## Adding a New Provider
|
See the [Adding a New API Provider](new_api_provider.md) which describes how to add new API providers to the Stack.
|
||||||
|
|
||||||
|
|
||||||
See:
|
|
||||||
- [Adding a New API Provider Page](new_api_provider.md) which describes how to add new API providers to the Stack.
|
|
||||||
- [Vector Database Page](new_vector_database.md) which describes how to add a new vector database to Llama Stack.
|
|
||||||
- [External Provider Page](../providers/external/index.md) which describes how to add external providers to the Stack.
|
|
||||||
|
|
||||||
```{toctree}
|
```{toctree}
|
||||||
:maxdepth: 1
|
:maxdepth: 1
|
||||||
:hidden:
|
:hidden:
|
||||||
|
|
||||||
new_api_provider
|
new_api_provider
|
||||||
new_vector_database
|
|
||||||
```
|
|
||||||
|
|
||||||
## Testing
|
|
||||||
|
|
||||||
|
|
||||||
```{include} ../../../tests/README.md
|
|
||||||
```
|
|
||||||
|
|
||||||
## Advanced Topics
|
|
||||||
|
|
||||||
For developers who need deeper understanding of the testing system internals:
|
|
||||||
|
|
||||||
```{toctree}
|
|
||||||
:maxdepth: 1
|
|
||||||
|
|
||||||
testing/record-replay
|
|
||||||
```
|
|
||||||
|
|
||||||
### Benchmarking
|
|
||||||
|
|
||||||
```{include} ../../../docs/source/distributions/k8s-benchmark/README.md
|
|
||||||
```
|
```
|
||||||
|
|
|
@ -6,7 +6,7 @@ This guide will walk you through the process of adding a new API provider to Lla
|
||||||
- Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
|
- Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
|
||||||
- Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
|
- Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
|
||||||
- Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
|
- Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
|
||||||
- Update any distribution {repopath}`Templates::llama_stack/distributions/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
|
- Update any distribution {repopath}`Templates::llama_stack/templates/` `build.yaml` and `run.yaml` files if they should include your provider by default. Run {repopath}`./scripts/distro_codegen.py` if necessary. Note that `distro_codegen.py` will fail if the new provider causes any distribution template to attempt to import provider-specific dependencies. This usually means the distribution's `get_distribution_template()` code path should only import any necessary Config or model alias definitions from each provider and not the provider's actual implementation.
|
||||||
|
|
||||||
|
|
||||||
Here are some example PRs to help you get started:
|
Here are some example PRs to help you get started:
|
||||||
|
@ -52,7 +52,7 @@ def get_base_url(self) -> str:
|
||||||
|
|
||||||
## Testing the Provider
|
## Testing the Provider
|
||||||
|
|
||||||
Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --distro together`.
|
Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --template together`.
|
||||||
|
|
||||||
### 1. Integration Testing
|
### 1. Integration Testing
|
||||||
|
|
||||||
|
|
|
@ -1,75 +0,0 @@
|
||||||
# Adding a New Vector Database
|
|
||||||
|
|
||||||
This guide will walk you through the process of adding a new vector database to Llama Stack.
|
|
||||||
|
|
||||||
> **_NOTE:_** Here's an example Pull Request for the [Milvus Vector Database Provider](https://github.com/meta-llama/llama-stack/pull/1467).
|
|
||||||
|
|
||||||
Vector Database providers are used to store and retrieve vector embeddings. Vector databases are not limited to vector
|
|
||||||
search but can support keyword and hybrid search. Additionally, vector databases can support operations like
|
|
||||||
filtering, sorting, and aggregating vectors.
|
|
||||||
|
|
||||||
## Steps to Add a New Vector Database Provider
|
|
||||||
1. **Choose the Database Type**: Determine if your vector database is a remote service, inline, or both.
|
|
||||||
- Remote databases make requests to external services, while inline databases execute locally. Some providers support both.
|
|
||||||
2. **Implement the Provider**: Create a new provider class that inherits from `VectorDatabaseProvider` and implements the required methods.
|
|
||||||
- Implement methods for vector storage, retrieval, search, and any additional features your database supports.
|
|
||||||
- You will need to implement the following methods for `YourVectorIndex`:
|
|
||||||
- `YourVectorIndex.create()`
|
|
||||||
- `YourVectorIndex.initialize()`
|
|
||||||
- `YourVectorIndex.add_chunks()`
|
|
||||||
- `YourVectorIndex.delete_chunk()`
|
|
||||||
- `YourVectorIndex.query_vector()`
|
|
||||||
- `YourVectorIndex.query_keyword()`
|
|
||||||
- `YourVectorIndex.query_hybrid()`
|
|
||||||
- You will need to implement the following methods for `YourVectorIOAdapter`:
|
|
||||||
- `YourVectorIOAdapter.initialize()`
|
|
||||||
- `YourVectorIOAdapter.shutdown()`
|
|
||||||
- `YourVectorIOAdapter.list_vector_dbs()`
|
|
||||||
- `YourVectorIOAdapter.register_vector_db()`
|
|
||||||
- `YourVectorIOAdapter.unregister_vector_db()`
|
|
||||||
- `YourVectorIOAdapter.insert_chunks()`
|
|
||||||
- `YourVectorIOAdapter.query_chunks()`
|
|
||||||
- `YourVectorIOAdapter.delete_chunks()`
|
|
||||||
3. **Add to Registry**: Register your provider in the appropriate registry file.
|
|
||||||
- Update {repopath}`llama_stack/providers/registry/vector_io.py` to include your new provider.
|
|
||||||
```python
|
|
||||||
from llama_stack.providers.registry.specs import InlineProviderSpec
|
|
||||||
from llama_stack.providers.registry.api import Api
|
|
||||||
|
|
||||||
InlineProviderSpec(
|
|
||||||
api=Api.vector_io,
|
|
||||||
provider_type="inline::milvus",
|
|
||||||
pip_packages=["pymilvus>=2.4.10"],
|
|
||||||
module="llama_stack.providers.inline.vector_io.milvus",
|
|
||||||
config_class="llama_stack.providers.inline.vector_io.milvus.MilvusVectorIOConfig",
|
|
||||||
api_dependencies=[Api.inference],
|
|
||||||
optional_api_dependencies=[Api.files],
|
|
||||||
description="",
|
|
||||||
),
|
|
||||||
```
|
|
||||||
4. **Add Tests**: Create unit tests and integration tests for your provider in the `tests/` directory.
|
|
||||||
- Unit Tests
|
|
||||||
- By following the structure of the class methods, you will be able to easily run unit and integration tests for your database.
|
|
||||||
1. You have to configure the tests for your provider in `/tests/unit/providers/vector_io/conftest.py` (a fixture sketch is included after this guide).
|
|
||||||
2. Update the `vector_provider` fixture to include your provider if it is an inline provider.
|
|
||||||
3. Create a `your_vectorprovider_index` fixture that initializes your vector index.
|
|
||||||
4. Create a `your_vectorprovider_adapter` fixture that initializes your vector adapter.
|
|
||||||
5. Add your provider to the `vector_io_providers` fixture dictionary.
|
|
||||||
- Please follow the naming convention of `your_vectorprovider_index` and `your_vectorprovider_adapter` as the tests require this to execute properly.
|
|
||||||
- Integration Tests
|
|
||||||
- Integration tests are located in {repopath}`tests/integration`. These tests use the Python client SDK APIs (from the `llama_stack_client` package) to test functionality.
|
|
||||||
- The two sets of integration tests are:
|
|
||||||
- `tests/integration/vector_io/test_vector_io.py`: This file tests registration, insertion, and retrieval.
|
|
||||||
- `tests/integration/vector_io/test_openai_vector_stores.py`: These tests are for OpenAI-compatible vector stores and test the OpenAI API compatibility.
|
|
||||||
- You will need to update `skip_if_provider_doesnt_support_openai_vector_stores` to include your provider as well as `skip_if_provider_doesnt_support_openai_vector_stores_search` to test the appropriate search functionality.
|
|
||||||
- Running the tests in the GitHub CI
|
|
||||||
- You will need to update the `.github/workflows/integration-vector-io-tests.yml` file to include your provider.
|
|
||||||
- If your provider is a remote provider, you will also have to add a container to spin up and run it in the action.
|
|
||||||
- Updating the pyproject.toml
|
|
||||||
- If you are adding tests for the `inline` provider you will have to update the `unit` group.
|
|
||||||
- `uv add new_pip_package --group unit`
|
|
||||||
- If you are adding tests for the `remote` provider you will have to update the `test` group, which is used in the GitHub CI for integration tests.
|
|
||||||
- `uv add new_pip_package --group test`
|
|
||||||
5. **Update Documentation**: Please update the documentation for end users
|
|
||||||
- Generate the provider documentation by running {repopath}`./scripts/provider_codegen.py`.
|
|
||||||
- Update the autogenerated content in the registry/vector_io.py file with information about your provider. Please see other providers for examples.
|
|
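As referenced in the unit-test step above, here is a minimal sketch of the conftest wiring, under assumptions: the `yourdb` module, config class, and constructor arguments are placeholders for your implementation, and shared fixtures such as `embedding_dimension` and `mock_inference_api` are assumed to exist in the suite's conftest.

```python
# Sketch of tests/unit/providers/vector_io/conftest.py additions; all names are placeholders.
import pytest

from llama_stack.providers.inline.vector_io.yourdb import YourDBVectorIOConfig  # hypothetical module
from llama_stack.providers.inline.vector_io.yourdb.yourdb import (  # hypothetical module
    YourDBIndex,
    YourDBVectorIOAdapter,
)


@pytest.fixture
def yourdb_index(embedding_dimension, tmp_path_factory):
    # Low-level index used by the adapter; constructor arguments are illustrative.
    db_dir = tmp_path_factory.mktemp("yourdb")
    return YourDBIndex(dimension=embedding_dimension, db_path=str(db_dir / "index.db"))


@pytest.fixture
async def yourdb_adapter(mock_inference_api):
    # Adapter-level fixture exercising the full initialize/shutdown lifecycle.
    adapter = YourDBVectorIOAdapter(config=YourDBVectorIOConfig(), inference_api=mock_inference_api)
    await adapter.initialize()
    yield adapter
    await adapter.shutdown()
```

The new provider is then added to the `vector_provider` params and the `vector_io_providers` dictionary alongside the existing entries.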
6
docs/source/contributing/testing.md
Normal file
6
docs/source/contributing/testing.md
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
# Testing Llama Stack
|
||||||
|
|
||||||
|
Tests are of three different kinds:
|
||||||
|
- Unit tests
|
||||||
|
- Provider focused integration tests
|
||||||
|
- Client SDK tests
|
|
@ -1,234 +0,0 @@
|
||||||
# Record-Replay System
|
|
||||||
|
|
||||||
Understanding how Llama Stack captures and replays API interactions for testing.
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
The record-replay system solves a fundamental challenge in AI testing: how do you test against expensive, non-deterministic APIs without breaking the bank or dealing with flaky tests?
|
|
||||||
|
|
||||||
The solution: intercept API calls, store real responses, and replay them later. This gives you real API behavior without the cost or variability.
|
|
||||||
|
|
||||||
## How It Works
|
|
||||||
|
|
||||||
### Request Hashing
|
|
||||||
|
|
||||||
Every API request gets converted to a deterministic hash for lookup:
|
|
||||||
|
|
||||||
```python
|
|
||||||
def normalize_request(method: str, url: str, headers: dict, body: dict) -> str:
|
|
||||||
normalized = {
|
|
||||||
"method": method.upper(),
|
|
||||||
"endpoint": urlparse(url).path, # Just the path, not full URL
|
|
||||||
"body": body, # Request parameters
|
|
||||||
}
|
|
||||||
return hashlib.sha256(json.dumps(normalized, sort_keys=True).encode()).hexdigest()
|
|
||||||
```
|
|
||||||
|
|
||||||
**Key insight:** The hashing is intentionally precise. Different whitespace, float precision, or parameter order produces different hashes. This prevents subtle bugs from false cache hits.
|
|
||||||
|
|
||||||
```python
|
|
||||||
# These produce DIFFERENT hashes:
|
|
||||||
{"content": "Hello world"}
|
|
||||||
{"content": "Hello world\n"}
|
|
||||||
{"temperature": 0.7}
|
|
||||||
{"temperature": 0.7000001}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Client Interception
|
|
||||||
|
|
||||||
The system patches OpenAI and Ollama client methods to intercept calls before they leave your application. This happens transparently - your test code doesn't change.
|
|
||||||
|
|
||||||
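The patching itself is ordinary monkey-patching of the client classes. A simplified, generic sketch (not the stack's actual recorder) of wrapping one async method:

```python
# Generic illustration of intercepting a client method; the real recorder also
# hashes the request and consults the recording storage before/after the call.
from openai.resources.chat.completions import AsyncCompletions

_original_create = AsyncCompletions.create


async def _intercepted_create(self, *args, **kwargs):
    # The record/replay decision would happen here, keyed by the request hash.
    response = await _original_create(self, *args, **kwargs)
    return response


AsyncCompletions.create = _intercepted_create
```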
### Storage Architecture
|
|
||||||
|
|
||||||
Recordings use a two-tier storage system optimized for both speed and debuggability:
|
|
||||||
|
|
||||||
```
|
|
||||||
recordings/
|
|
||||||
├── index.sqlite # Fast lookup by request hash
|
|
||||||
└── responses/
|
|
||||||
├── abc123def456.json # Individual response files
|
|
||||||
└── def789ghi012.json
|
|
||||||
```
|
|
||||||
|
|
||||||
**SQLite index** enables O(log n) hash lookups and metadata queries without loading response bodies.
|
|
||||||
|
|
||||||
**JSON files** store complete request/response pairs in human-readable format for debugging.
|
|
||||||
|
|
||||||
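A sketch of how a replay lookup spans the two tiers; the `recordings` table name matches the queries in the debugging section below, while the `request_hash` and `response_file` column names are assumptions:

```python
import json
import sqlite3


def load_recording(storage_dir: str, request_hash: str):
    # Tier 1: the SQLite index maps a request hash to its response file (assumed schema).
    con = sqlite3.connect(f"{storage_dir}/index.sqlite")
    try:
        row = con.execute(
            "SELECT response_file FROM recordings WHERE request_hash = ?",
            (request_hash,),
        ).fetchone()
    finally:
        con.close()
    if row is None:
        return None  # caller raises an error in REPLAY mode
    # Tier 2: the JSON file holds the full request/response pair.
    with open(f"{storage_dir}/responses/{row[0]}") as f:
        return json.load(f)
```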
## Recording Modes
|
|
||||||
|
|
||||||
### LIVE Mode
|
|
||||||
|
|
||||||
Direct API calls with no recording or replay:
|
|
||||||
|
|
||||||
```python
|
|
||||||
with inference_recording(mode=InferenceMode.LIVE):
|
|
||||||
response = await client.chat.completions.create(...)
|
|
||||||
```
|
|
||||||
|
|
||||||
Use for initial development and debugging against real APIs.
|
|
||||||
|
|
||||||
### RECORD Mode
|
|
||||||
|
|
||||||
Captures API interactions while passing through real responses:
|
|
||||||
|
|
||||||
```python
|
|
||||||
with inference_recording(mode=InferenceMode.RECORD, storage_dir="./recordings"):
|
|
||||||
response = await client.chat.completions.create(...)
|
|
||||||
# Real API call made, response captured AND returned
|
|
||||||
```
|
|
||||||
|
|
||||||
The recording process:
|
|
||||||
1. Request intercepted and hashed
|
|
||||||
2. Real API call executed
|
|
||||||
3. Response captured and serialized
|
|
||||||
4. Recording stored to disk
|
|
||||||
5. Original response returned to caller
|
|
||||||
|
|
||||||
### REPLAY Mode
|
|
||||||
|
|
||||||
Returns stored responses instead of making API calls:
|
|
||||||
|
|
||||||
```python
|
|
||||||
with inference_recording(mode=InferenceMode.REPLAY, storage_dir="./recordings"):
|
|
||||||
response = await client.chat.completions.create(...)
|
|
||||||
# No API call made, cached response returned instantly
|
|
||||||
```
|
|
||||||
|
|
||||||
The replay process:
|
|
||||||
1. Request intercepted and hashed
|
|
||||||
2. Hash looked up in SQLite index
|
|
||||||
3. Response loaded from JSON file
|
|
||||||
4. Response deserialized and returned
|
|
||||||
5. Error if no recording found
|
|
||||||
|
|
||||||
## Streaming Support
|
|
||||||
|
|
||||||
Streaming APIs present a unique challenge: how do you capture an async generator?
|
|
||||||
|
|
||||||
### The Problem
|
|
||||||
|
|
||||||
```python
|
|
||||||
# How do you record this?
|
|
||||||
async for chunk in client.chat.completions.create(stream=True):
|
|
||||||
process(chunk)
|
|
||||||
```
|
|
||||||
|
|
||||||
### The Solution
|
|
||||||
|
|
||||||
The system captures all chunks immediately before yielding any:
|
|
||||||
|
|
||||||
```python
|
|
||||||
async def handle_streaming_record(response):
|
|
||||||
# Capture complete stream first
|
|
||||||
chunks = []
|
|
||||||
async for chunk in response:
|
|
||||||
chunks.append(chunk)
|
|
||||||
|
|
||||||
# Store complete recording
|
|
||||||
storage.store_recording(
|
|
||||||
request_hash, request_data, {"body": chunks, "is_streaming": True}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Return generator that replays captured chunks
|
|
||||||
async def replay_stream():
|
|
||||||
for chunk in chunks:
|
|
||||||
yield chunk
|
|
||||||
|
|
||||||
return replay_stream()
|
|
||||||
```
|
|
||||||
|
|
||||||
This ensures:
|
|
||||||
- **Complete capture** - The entire stream is saved atomically
|
|
||||||
- **Interface preservation** - The returned object behaves like the original API
|
|
||||||
- **Deterministic replay** - Same chunks in the same order every time
|
|
||||||
|
|
||||||
## Serialization
|
|
||||||
|
|
||||||
API responses contain complex Pydantic objects that need careful serialization:
|
|
||||||
|
|
||||||
```python
|
|
||||||
def _serialize_response(response):
|
|
||||||
if hasattr(response, "model_dump"):
|
|
||||||
# Preserve type information for proper deserialization
|
|
||||||
return {
|
|
||||||
"__type__": f"{response.__class__.__module__}.{response.__class__.__qualname__}",
|
|
||||||
"__data__": response.model_dump(mode="json"),
|
|
||||||
}
|
|
||||||
return response
|
|
||||||
```
|
|
||||||
|
|
||||||
This preserves type safety - when replayed, you get the same Pydantic objects with all their validation and methods.
|
|
||||||
|
|
||||||
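The replay path reverses this. An illustrative helper (not the stack's actual function, and simplified to top-level classes) showing how the `__type__` marker can be turned back into a validated Pydantic object:

```python
import importlib
from typing import Any


def _deserialize_response(data: Any) -> Any:
    # Pass through anything that wasn't wrapped with the type marker above.
    if not (isinstance(data, dict) and "__type__" in data and "__data__" in data):
        return data
    module_name, _, class_name = data["__type__"].rpartition(".")
    cls = getattr(importlib.import_module(module_name), class_name)
    # Pydantic v2 re-validates the stored JSON, restoring methods and validation.
    return cls.model_validate(data["__data__"])
```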
## Environment Integration
|
|
||||||
|
|
||||||
### Environment Variables
|
|
||||||
|
|
||||||
Control recording behavior globally:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
export LLAMA_STACK_TEST_INFERENCE_MODE=replay
|
|
||||||
export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings
|
|
||||||
pytest tests/integration/
|
|
||||||
```
|
|
||||||
|
|
||||||
### Pytest Integration
|
|
||||||
|
|
||||||
The system integrates automatically based on environment variables, requiring no changes to test code.
|
|
||||||
|
|
||||||
## Debugging Recordings
|
|
||||||
|
|
||||||
### Inspecting Storage
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# See what's recorded
|
|
||||||
sqlite3 recordings/index.sqlite "SELECT endpoint, model, timestamp FROM recordings LIMIT 10;"
|
|
||||||
|
|
||||||
# View specific response
|
|
||||||
cat recordings/responses/abc123def456.json | jq '.response.body'
|
|
||||||
|
|
||||||
# Find recordings by endpoint
|
|
||||||
sqlite3 recordings/index.sqlite "SELECT * FROM recordings WHERE endpoint='/v1/chat/completions';"
|
|
||||||
```
|
|
||||||
|
|
||||||
### Common Issues
|
|
||||||
|
|
||||||
**Hash mismatches:** Request parameters changed slightly between record and replay
|
|
||||||
```bash
|
|
||||||
# Compare request details
|
|
||||||
cat recordings/responses/abc123.json | jq '.request'
|
|
||||||
```
|
|
||||||
|
|
||||||
**Serialization errors:** Response types changed between versions
|
|
||||||
```bash
|
|
||||||
# Re-record with updated types
|
|
||||||
rm recordings/responses/failing_hash.json
|
|
||||||
LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_failing.py
|
|
||||||
```
|
|
||||||
|
|
||||||
**Missing recordings:** New test or changed parameters
|
|
||||||
```bash
|
|
||||||
# Record the missing interaction
|
|
||||||
LLAMA_STACK_TEST_INFERENCE_MODE=record pytest test_new.py
|
|
||||||
```
|
|
||||||
|
|
||||||
## Design Decisions
|
|
||||||
|
|
||||||
### Why Not Mocks?
|
|
||||||
|
|
||||||
Traditional mocking breaks down with AI APIs because:
|
|
||||||
- Response structures are complex and evolve frequently
|
|
||||||
- Streaming behavior is hard to mock correctly
|
|
||||||
- Edge cases in real APIs get missed
|
|
||||||
- Mocks become brittle maintenance burdens
|
|
||||||
|
|
||||||
### Why Precise Hashing?
|
|
||||||
|
|
||||||
Loose hashing (normalizing whitespace, rounding floats) seems convenient but hides bugs. If a test changes slightly, you want to know about it rather than accidentally getting the wrong cached response.
|
|
||||||
|
|
||||||
### Why JSON + SQLite?
|
|
||||||
|
|
||||||
- **JSON** - Human readable, diff-friendly, easy to inspect and modify
|
|
||||||
- **SQLite** - Fast indexed lookups without loading response bodies
|
|
||||||
- **Hybrid** - Best of both worlds for different use cases
|
|
||||||
|
|
||||||
This system provides reliable, fast testing against real AI APIs while maintaining the ability to debug issues when they arise.
|
|
|
@ -174,7 +174,7 @@ spec:
|
||||||
- name: llama-stack
|
- name: llama-stack
|
||||||
image: localhost/llama-stack-run-k8s:latest
|
image: localhost/llama-stack-run-k8s:latest
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: IfNotPresent
|
||||||
command: ["python", "-m", "llama_stack.core.server.server", "--config", "/app/config.yaml"]
|
command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/app/config.yaml"]
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 5000
|
- containerPort: 5000
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
|
|
|
@ -47,37 +47,30 @@ pip install -e .
|
||||||
```
|
```
|
||||||
Use the CLI to build your distribution.
|
Use the CLI to build your distribution.
|
||||||
The main points to consider are:
|
The main points to consider are:
|
||||||
1. **Image Type** - Do you want a venv environment or a Container (e.g. Docker)
|
1. **Image Type** - Do you want a Conda / venv environment or a Container (e.g. Docker)
|
||||||
2. **Template** - Do you want to use a template to build your distribution, or start from scratch?
|
2. **Template** - Do you want to use a template to build your distribution, or start from scratch?
|
||||||
3. **Config** - Do you want to use a pre-existing config file to build your distribution?
|
3. **Config** - Do you want to use a pre-existing config file to build your distribution?
|
||||||
|
|
||||||
```
|
```
|
||||||
llama stack build -h
|
llama stack build -h
|
||||||
usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--distro DISTRIBUTION] [--list-distros] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only]
|
usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates] [--image-type {conda,container,venv}] [--image-name IMAGE_NAME] [--print-deps-only] [--run]
|
||||||
[--run] [--providers PROVIDERS]
|
|
||||||
|
|
||||||
Build a Llama stack container
|
Build a Llama stack container
|
||||||
|
|
||||||
options:
|
options:
|
||||||
-h, --help show this help message and exit
|
-h, --help show this help message and exit
|
||||||
--config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack.cores/**/build.yaml. If this argument is not provided, you will be prompted to
|
--config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack/distributions/**/build.yaml. If this argument is not provided, you will
|
||||||
enter information interactively (default: None)
|
be prompted to enter information interactively (default: None)
|
||||||
--template TEMPLATE (deprecated) Name of the example template config to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default:
|
--template TEMPLATE Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates (default: None)
|
||||||
None)
|
--list-templates Show the available templates for building a Llama Stack distribution (default: False)
|
||||||
--distro DISTRIBUTION, --distribution DISTRIBUTION
|
--image-type {conda,container,venv}
|
||||||
Name of the distribution to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default: None)
|
|
||||||
--list-distros, --list-distributions
|
|
||||||
Show the available distributions for building a Llama Stack distribution (default: False)
|
|
||||||
--image-type {container,venv}
|
|
||||||
Image Type to use for the build. If not specified, will use the image type from the template config. (default: None)
|
Image Type to use for the build. If not specified, will use the image type from the template config. (default: None)
|
||||||
--image-name IMAGE_NAME
|
--image-name IMAGE_NAME
|
||||||
[for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if found. (default:
|
[for image-type=conda|container|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active environment will be used if
|
||||||
None)
|
found. (default: None)
|
||||||
--print-deps-only Print the dependencies for the stack only, without building the stack (default: False)
|
--print-deps-only Print the dependencies for the stack only, without building the stack (default: False)
|
||||||
--run Run the stack after building using the same image type, name, and other applicable arguments (default: False)
|
--run Run the stack after building using the same image type, name, and other applicable arguments (default: False)
|
||||||
--providers PROVIDERS
|
|
||||||
Build a config for a list of providers and only those providers. This list is formatted like: api1=provider1,api2=provider2. Where there can be multiple providers per
|
|
||||||
API. (default: None)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.
|
After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.
|
||||||
|
@ -148,7 +141,7 @@ You may then pick a template to build your distribution with providers fitted to
|
||||||
|
|
||||||
For example, to build a distribution with TGI as the inference provider, you can run:
|
For example, to build a distribution with TGI as the inference provider, you can run:
|
||||||
```
|
```
|
||||||
$ llama stack build --distro starter
|
$ llama stack build --template starter
|
||||||
...
|
...
|
||||||
You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml`
|
You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml`
|
||||||
```
|
```
|
||||||
|
@ -166,7 +159,7 @@ It would be best to start with a template and understand the structure of the co
|
||||||
llama stack build
|
llama stack build
|
||||||
|
|
||||||
> Enter a name for your Llama Stack (e.g. my-local-stack): my-stack
|
> Enter a name for your Llama Stack (e.g. my-local-stack): my-stack
|
||||||
> Enter the image type you want your Llama Stack to be built as (container or venv): venv
|
> Enter the image type you want your Llama Stack to be built as (container or conda or venv): conda
|
||||||
|
|
||||||
Llama Stack is composed of several APIs working together. Let's select
|
Llama Stack is composed of several APIs working together. Let's select
|
||||||
the provider types (implementations) you want to use for these APIs.
|
the provider types (implementations) you want to use for these APIs.
|
||||||
|
@ -191,10 +184,10 @@ You can now edit ~/.llama/distributions/llamastack-my-local-stack/my-local-stack
|
||||||
:::{tab-item} Building from a pre-existing build config file
|
:::{tab-item} Building from a pre-existing build config file
|
||||||
- In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command.
|
- In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command.
|
||||||
|
|
||||||
- The config file will be of contents like the ones in `llama_stack/distributions/*build.yaml`.
|
- The config file will be of contents like the ones in `llama_stack/templates/*build.yaml`.
|
||||||
|
|
||||||
```
|
```
|
||||||
llama stack build --config llama_stack/distributions/starter/build.yaml
|
llama stack build --config llama_stack/templates/starter/build.yaml
|
||||||
```
|
```
|
||||||
:::
|
:::
|
||||||
|
|
||||||
|
@ -260,11 +253,11 @@ Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podm
|
||||||
To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type.
|
To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type.
|
||||||
|
|
||||||
```
|
```
|
||||||
llama stack build --distro starter --image-type container
|
llama stack build --template starter --image-type container
|
||||||
```
|
```
|
||||||
|
|
||||||
```
|
```
|
||||||
$ llama stack build --distro starter --image-type container
|
$ llama stack build --template starter --image-type container
|
||||||
...
|
...
|
||||||
Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/Containerfile
FROM python:3.10-slim
|
Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/Containerfile
FROM python:3.10-slim
|
||||||
...
|
...
|
||||||
|
@ -319,7 +312,7 @@ Now, let's start the Llama Stack Distribution Server. You will need the YAML con
|
||||||
```
|
```
|
||||||
llama stack run -h
|
llama stack run -h
|
||||||
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--env KEY=VALUE]
|
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--env KEY=VALUE]
|
||||||
[--image-type {venv}] [--enable-ui]
|
[--image-type {conda,venv}] [--enable-ui]
|
||||||
[config | template]
|
[config | template]
|
||||||
|
|
||||||
Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
|
Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
|
||||||
|
@ -333,8 +326,8 @@ options:
|
||||||
--image-name IMAGE_NAME
|
--image-name IMAGE_NAME
|
||||||
Name of the image to run. Defaults to the current environment (default: None)
|
Name of the image to run. Defaults to the current environment (default: None)
|
||||||
--env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: None)
|
--env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: None)
|
||||||
--image-type {venv}
|
--image-type {conda,venv}
|
||||||
Image Type used during the build. This should be venv. (default: None)
|
Image Type used during the build. This can be either conda or venv. (default: None)
|
||||||
--enable-ui Start the UI server (default: False)
|
--enable-ui Start the UI server (default: False)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -349,6 +342,9 @@ llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-
|
||||||
|
|
||||||
# Start using a venv
|
# Start using a venv
|
||||||
llama stack run --image-type venv ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
|
llama stack run --image-type venv ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
|
||||||
|
|
||||||
|
# Start using a conda environment
|
||||||
|
llama stack run --image-type conda ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
|
@ -10,6 +10,7 @@ The default `run.yaml` files generated by templates are starting points for your
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
version: 2
|
version: 2
|
||||||
|
conda_env: ollama
|
||||||
apis:
|
apis:
|
||||||
- agents
|
- agents
|
||||||
- inference
|
- inference
|
||||||
|
|
|
@ -6,14 +6,14 @@ This avoids the overhead of setting up a server.
|
||||||
```bash
|
```bash
|
||||||
# setup
|
# setup
|
||||||
uv pip install llama-stack
|
uv pip install llama-stack
|
||||||
llama stack build --distro starter --image-type venv
|
llama stack build --template starter --image-type venv
|
||||||
```
|
```
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import os

from llama_stack.core.library_client import LlamaStackAsLibraryClient
|
import os

from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
|
||||||
|
|
||||||
client = LlamaStackAsLibraryClient(
|
client = LlamaStackAsLibraryClient(
|
||||||
"starter",
|
"ollama",
|
||||||
# provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
|
# provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
|
||||||
provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
|
provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
|
||||||
)
|
)
|
||||||
|
|
|
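As a follow-up to the snippet above, the in-process client is then initialized and used like the HTTP client; the `initialize()` and `models.list()` calls below reflect the client SDK as commonly documented and should be treated as assumptions for your installed version.

```python
client.initialize()

# The library client exposes the same API surface as the HTTP client.
for model in client.models.list():
    print(model.identifier)
```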
@ -9,7 +9,6 @@ This section provides an overview of the distributions available in Llama Stack.
|
||||||
list_of_distributions
|
list_of_distributions
|
||||||
building_distro
|
building_distro
|
||||||
customizing_run_yaml
|
customizing_run_yaml
|
||||||
starting_llama_stack_server
|
|
||||||
importing_as_library
|
importing_as_library
|
||||||
configuration
|
configuration
|
||||||
```
|
```
|
||||||
|
|
|
@ -1,156 +0,0 @@
|
||||||
# Llama Stack Benchmark Suite on Kubernetes
|
|
||||||
|
|
||||||
## Motivation
|
|
||||||
|
|
||||||
Performance benchmarking is critical for understanding the overhead and characteristics of the Llama Stack abstraction layer compared to direct inference engines like vLLM.
|
|
||||||
|
|
||||||
### Why This Benchmark Suite Exists
|
|
||||||
|
|
||||||
**Performance Validation**: The Llama Stack provides a unified API layer across multiple inference providers, but this abstraction introduces potential overhead. This benchmark suite quantifies the performance impact by comparing:
|
|
||||||
- Llama Stack inference (with vLLM backend)
|
|
||||||
- Direct vLLM inference calls
|
|
||||||
- Both under identical Kubernetes deployment conditions
|
|
||||||
|
|
||||||
**Production Readiness Assessment**: Real-world deployments require understanding performance characteristics under load. This suite simulates concurrent user scenarios with configurable parameters (duration, concurrency, request patterns) to validate production readiness.
|
|
||||||
|
|
||||||
**Regression Detection (TODO)**: As the Llama Stack evolves, this benchmark provides automated regression detection for performance changes. CI/CD pipelines can leverage these benchmarks to catch performance degradations before production deployments.
|
|
||||||
|
|
||||||
**Resource Planning**: By measuring throughput, latency percentiles, and resource utilization patterns, teams can make informed decisions about:
|
|
||||||
- Kubernetes resource allocation (CPU, memory, GPU)
|
|
||||||
- Auto-scaling configurations
|
|
||||||
- Cost optimization strategies
|
|
||||||
|
|
||||||
### Key Metrics Captured
|
|
||||||
|
|
||||||
The benchmark suite measures critical performance indicators:
|
|
||||||
- **Throughput**: Requests per second under sustained load
|
|
||||||
- **Latency Distribution**: P50, P95, P99 response times
|
|
||||||
- **Time to First Token (TTFT)**: Critical for streaming applications
|
|
||||||
- **Error Rates**: Request failures and timeout analysis
|
|
||||||
|
|
||||||
This data enables data-driven architectural decisions and performance optimization efforts.
|
|
||||||
|
|
||||||
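For reference, the latency percentiles listed above are derived from raw per-request timings with a simple index-based calculation; the sketch below mirrors the arithmetic used in `benchmark.py` later in this document.

```python
def percentile(samples: list[float], p: float) -> float:
    # Index-based percentile over sorted response times (same arithmetic as benchmark.py).
    ordered = sorted(samples)
    idx = max(0, min(int(len(ordered) * p / 100) - 1, len(ordered) - 1))
    return ordered[idx]


response_times = [0.21, 0.35, 0.30, 1.02, 0.44]  # seconds, illustrative
print(f"P95: {percentile(response_times, 95):.3f}s")
```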
## Setup
|
|
||||||
|
|
||||||
**1. Deploy base k8s infrastructure:**
|
|
||||||
```bash
|
|
||||||
cd ../k8s
|
|
||||||
./apply.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
**2. Deploy benchmark components:**
|
|
||||||
```bash
|
|
||||||
cd ../k8s-benchmark
|
|
||||||
./apply.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
**3. Verify deployment:**
|
|
||||||
```bash
|
|
||||||
kubectl get pods
|
|
||||||
# Should see: llama-stack-benchmark-server, vllm-server, etc.
|
|
||||||
```
|
|
||||||
|
|
||||||
## Quick Start
|
|
||||||
|
|
||||||
### Basic Benchmarks
|
|
||||||
|
|
||||||
**Benchmark Llama Stack (default):**
|
|
||||||
```bash
|
|
||||||
cd docs/source/distributions/k8s-benchmark/
|
|
||||||
./run-benchmark.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
**Benchmark vLLM direct:**
|
|
||||||
```bash
|
|
||||||
./run-benchmark.sh --target vllm
|
|
||||||
```
|
|
||||||
|
|
||||||
### Custom Configuration
|
|
||||||
|
|
||||||
**Extended benchmark with high concurrency:**
|
|
||||||
```bash
|
|
||||||
./run-benchmark.sh --target vllm --duration 120 --concurrent 20
|
|
||||||
```
|
|
||||||
|
|
||||||
**Short test run:**
|
|
||||||
```bash
|
|
||||||
./run-benchmark.sh --target stack --duration 30 --concurrent 5
|
|
||||||
```
|
|
||||||
|
|
||||||
## Command Reference
|
|
||||||
|
|
||||||
### run-benchmark.sh Options
|
|
||||||
|
|
||||||
```bash
|
|
||||||
./run-benchmark.sh [options]
|
|
||||||
|
|
||||||
Options:
|
|
||||||
-t, --target <stack|vllm> Target to benchmark (default: stack)
|
|
||||||
-d, --duration <seconds> Duration in seconds (default: 60)
|
|
||||||
-c, --concurrent <users> Number of concurrent users (default: 10)
|
|
||||||
-h, --help Show help message
|
|
||||||
|
|
||||||
Examples:
|
|
||||||
./run-benchmark.sh --target vllm # Benchmark vLLM direct
|
|
||||||
./run-benchmark.sh --target stack # Benchmark Llama Stack
|
|
||||||
./run-benchmark.sh -t vllm -d 120 -c 20 # vLLM with 120s, 20 users
|
|
||||||
```
|
|
||||||
|
|
||||||
## Local Testing
|
|
||||||
|
|
||||||
### Running Benchmark Locally
|
|
||||||
|
|
||||||
For local development without Kubernetes:
|
|
||||||
|
|
||||||
**1. Start OpenAI mock server:**
|
|
||||||
```bash
|
|
||||||
uv run python openai-mock-server.py --port 8080
|
|
||||||
```
|
|
||||||
|
|
||||||
**2. Run benchmark against mock server:**
|
|
||||||
```bash
|
|
||||||
uv run python benchmark.py \
|
|
||||||
--base-url http://localhost:8080/v1 \
|
|
||||||
--model mock-inference \
|
|
||||||
--duration 30 \
|
|
||||||
--concurrent 5
|
|
||||||
```
|
|
||||||
|
|
||||||
**3. Test against local vLLM server:**
|
|
||||||
```bash
|
|
||||||
# If you have vLLM running locally on port 8000
|
|
||||||
uv run python benchmark.py \
|
|
||||||
--base-url http://localhost:8000/v1 \
|
|
||||||
--model meta-llama/Llama-3.2-3B-Instruct \
|
|
||||||
--duration 30 \
|
|
||||||
--concurrent 5
|
|
||||||
```
|
|
||||||
|
|
||||||
**4. Profile the running server:**
|
|
||||||
```bash
|
|
||||||
./profile_running_server.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### OpenAI Mock Server
|
|
||||||
|
|
||||||
The `openai-mock-server.py` provides:
|
|
||||||
- **OpenAI-compatible API** for testing without real models
|
|
||||||
- **Configurable streaming delay** via `STREAM_DELAY_SECONDS` env var
|
|
||||||
- **Consistent responses** for reproducible benchmarks
|
|
||||||
- **Lightweight testing** without GPU requirements
|
|
||||||
|
|
||||||
**Mock server usage:**
|
|
||||||
```bash
|
|
||||||
uv run python openai-mock-server.py --port 8080
|
|
||||||
```
|
|
||||||
|
|
||||||
The mock server is also deployed in k8s as `openai-mock-service:8080` and can be used by changing the Llama Stack configuration to use the `mock-vllm-inference` provider.
|
|
||||||
|
|
||||||
## Files in this Directory
|
|
||||||
|
|
||||||
- `benchmark.py` - Core benchmark script with async streaming support
|
|
||||||
- `run-benchmark.sh` - Main script with target selection and configuration
|
|
||||||
- `openai-mock-server.py` - Mock OpenAI API server for local testing
|
|
||||||
- `README.md` - This documentation file
|
|
|
@ -1,36 +0,0 @@
|
||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
|
||||||
# the root directory of this source tree.
|
|
||||||
|
|
||||||
# Deploys the benchmark-specific components on top of the base k8s deployment (../k8s/apply.sh).
|
|
||||||
|
|
||||||
export STREAM_DELAY_SECONDS=0.005
|
|
||||||
|
|
||||||
export POSTGRES_USER=llamastack
|
|
||||||
export POSTGRES_DB=llamastack
|
|
||||||
export POSTGRES_PASSWORD=llamastack
|
|
||||||
|
|
||||||
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
|
||||||
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
|
||||||
|
|
||||||
export MOCK_INFERENCE_MODEL=mock-inference
|
|
||||||
|
|
||||||
export MOCK_INFERENCE_URL=openai-mock-service:8080
|
|
||||||
|
|
||||||
export BENCHMARK_INFERENCE_MODEL=$INFERENCE_MODEL
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
set -x
|
|
||||||
|
|
||||||
# Deploy benchmark-specific components
|
|
||||||
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
|
|
||||||
--dry-run=client -o yaml > stack-configmap.yaml
|
|
||||||
|
|
||||||
kubectl apply --validate=false -f stack-configmap.yaml
|
|
||||||
|
|
||||||
# Deploy our custom llama stack server (overriding the base one)
|
|
||||||
envsubst < stack-k8s.yaml.template | kubectl apply --validate=false -f -
|
|
|
@ -1,267 +0,0 @@
|
||||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
||||||
# All rights reserved.
|
|
||||||
#
|
|
||||||
# This source code is licensed under the terms described in the LICENSE file in
|
|
||||||
# the root directory of this source tree.
|
|
||||||
|
|
||||||
"""
|
|
||||||
Simple benchmark script for Llama Stack with OpenAI API compatibility.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import asyncio
|
|
||||||
import os
|
|
||||||
import random
|
|
||||||
import statistics
|
|
||||||
import time
|
|
||||||
from typing import Tuple
|
|
||||||
import aiohttp
|
|
||||||
|
|
||||||
|
|
||||||
class BenchmarkStats:
|
|
||||||
def __init__(self):
|
|
||||||
self.response_times = []
|
|
||||||
self.ttft_times = []
|
|
||||||
self.chunks_received = []
|
|
||||||
self.errors = []
|
|
||||||
self.success_count = 0
|
|
||||||
self.total_requests = 0
|
|
||||||
self.concurrent_users = 0
|
|
||||||
self.start_time = None
|
|
||||||
self.end_time = None
|
|
||||||
self._lock = asyncio.Lock()
|
|
||||||
|
|
||||||
async def add_result(self, response_time: float, chunks: int, ttft: float = None, error: str = None):
|
|
||||||
async with self._lock:
|
|
||||||
self.total_requests += 1
|
|
||||||
if error:
|
|
||||||
self.errors.append(error)
|
|
||||||
else:
|
|
||||||
self.success_count += 1
|
|
||||||
self.response_times.append(response_time)
|
|
||||||
self.chunks_received.append(chunks)
|
|
||||||
if ttft is not None:
|
|
||||||
self.ttft_times.append(ttft)
|
|
||||||
|
|
||||||
def print_summary(self):
|
|
||||||
if not self.response_times:
|
|
||||||
print("No successful requests to report")
|
|
||||||
if self.errors:
|
|
||||||
print(f"Total errors: {len(self.errors)}")
|
|
||||||
print("First 5 errors:")
|
|
||||||
for error in self.errors[:5]:
|
|
||||||
print(f" {error}")
|
|
||||||
return
|
|
||||||
|
|
||||||
total_time = self.end_time - self.start_time
|
|
||||||
success_rate = (self.success_count / self.total_requests) * 100
|
|
||||||
|
|
||||||
print(f"\n{'='*60}")
|
|
||||||
print(f"BENCHMARK RESULTS")
|
|
||||||
print(f"{'='*60}")
|
|
||||||
print(f"Total time: {total_time:.2f}s")
|
|
||||||
print(f"Concurrent users: {self.concurrent_users}")
|
|
||||||
print(f"Total requests: {self.total_requests}")
|
|
||||||
print(f"Successful requests: {self.success_count}")
|
|
||||||
print(f"Failed requests: {len(self.errors)}")
|
|
||||||
print(f"Success rate: {success_rate:.1f}%")
|
|
||||||
print(f"Requests per second: {self.success_count / total_time:.2f}")
|
|
||||||
|
|
||||||
print(f"\nResponse Time Statistics:")
|
|
||||||
print(f" Mean: {statistics.mean(self.response_times):.3f}s")
|
|
||||||
print(f" Median: {statistics.median(self.response_times):.3f}s")
|
|
||||||
print(f" Min: {min(self.response_times):.3f}s")
|
|
||||||
print(f" Max: {max(self.response_times):.3f}s")
|
|
||||||
|
|
||||||
if len(self.response_times) > 1:
|
|
||||||
print(f" Std Dev: {statistics.stdev(self.response_times):.3f}s")
|
|
||||||
|
|
||||||
percentiles = [50, 90, 95, 99]
|
|
||||||
sorted_times = sorted(self.response_times)
|
|
||||||
print(f"\nPercentiles:")
|
|
||||||
for p in percentiles:
|
|
||||||
idx = int(len(sorted_times) * p / 100) - 1
|
|
||||||
idx = max(0, min(idx, len(sorted_times) - 1))
|
|
||||||
print(f" P{p}: {sorted_times[idx]:.3f}s")
|
|
||||||
|
|
||||||
if self.ttft_times:
|
|
||||||
print(f"\nTime to First Token (TTFT) Statistics:")
|
|
||||||
print(f" Mean: {statistics.mean(self.ttft_times):.3f}s")
|
|
||||||
print(f" Median: {statistics.median(self.ttft_times):.3f}s")
|
|
||||||
print(f" Min: {min(self.ttft_times):.3f}s")
|
|
||||||
print(f" Max: {max(self.ttft_times):.3f}s")
|
|
||||||
|
|
||||||
if len(self.ttft_times) > 1:
|
|
||||||
print(f" Std Dev: {statistics.stdev(self.ttft_times):.3f}s")
|
|
||||||
|
|
||||||
sorted_ttft = sorted(self.ttft_times)
|
|
||||||
print(f"\nTTFT Percentiles:")
|
|
||||||
for p in percentiles:
|
|
||||||
idx = int(len(sorted_ttft) * p / 100) - 1
|
|
||||||
idx = max(0, min(idx, len(sorted_ttft) - 1))
|
|
||||||
print(f" P{p}: {sorted_ttft[idx]:.3f}s")
|
|
||||||
|
|
||||||
if self.chunks_received:
|
|
||||||
print(f"\nStreaming Statistics:")
|
|
||||||
print(f" Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
|
|
||||||
print(f" Total chunks received: {sum(self.chunks_received)}")
|
|
||||||
|
|
||||||
if self.errors:
|
|
||||||
print(f"\nErrors (showing first 5):")
|
|
||||||
for error in self.errors[:5]:
|
|
||||||
print(f" {error}")
|
|
||||||
|
|
||||||
|
|
||||||
class LlamaStackBenchmark:
|
|
||||||
def __init__(self, base_url: str, model_id: str):
|
|
||||||
self.base_url = base_url.rstrip('/')
|
|
||||||
self.model_id = model_id
|
|
||||||
self.headers = {"Content-Type": "application/json"}
|
|
||||||
self.test_messages = [
|
|
||||||
[{"role": "user", "content": "Hi"}],
|
|
||||||
[{"role": "user", "content": "What is the capital of France?"}],
|
|
||||||
[{"role": "user", "content": "Explain quantum physics in simple terms."}],
|
|
||||||
[{"role": "user", "content": "Write a short story about a robot learning to paint."}],
|
|
||||||
[
|
|
||||||
{"role": "user", "content": "What is machine learning?"},
|
|
||||||
{"role": "assistant", "content": "Machine learning is a subset of AI..."},
|
|
||||||
{"role": "user", "content": "Can you give me a practical example?"}
|
|
||||||
]
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
|
|
||||||
"""Make a single async streaming chat completion request."""
|
|
||||||
messages = random.choice(self.test_messages)
|
|
||||||
payload = {
|
|
||||||
"model": self.model_id,
|
|
||||||
"messages": messages,
|
|
||||||
"stream": True,
|
|
||||||
"max_tokens": 100
|
|
||||||
}
|
|
||||||
|
|
||||||
start_time = time.time()
|
|
||||||
chunks_received = 0
|
|
||||||
ttft = None
|
|
||||||
error = None
|
|
||||||
|
|
||||||
session = aiohttp.ClientSession()
|
|
||||||
|
|
||||||
try:
|
|
||||||
async with session.post(
|
|
||||||
f"{self.base_url}/chat/completions",
|
|
||||||
headers=self.headers,
|
|
||||||
json=payload,
|
|
||||||
timeout=aiohttp.ClientTimeout(total=30)
|
|
||||||
) as response:
|
|
||||||
if response.status == 200:
|
|
||||||
async for line in response.content:
|
|
||||||
if line:
|
|
||||||
line_str = line.decode('utf-8').strip()
|
|
||||||
if line_str.startswith('data: '):
|
|
||||||
chunks_received += 1
|
|
||||||
if ttft is None:
|
|
||||||
ttft = time.time() - start_time
|
|
||||||
if line_str == 'data: [DONE]':
|
|
||||||
break
|
|
||||||
|
|
||||||
if chunks_received == 0:
|
|
||||||
error = "No streaming chunks received"
|
|
||||||
else:
|
|
||||||
text = await response.text()
|
|
||||||
error = f"HTTP {response.status}: {text[:100]}"
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
error = f"Request error: {str(e)}"
|
|
||||||
finally:
|
|
||||||
await session.close()
|
|
||||||
|
|
||||||
response_time = time.time() - start_time
|
|
||||||
return response_time, chunks_received, ttft, error
|
|
||||||
|
|
||||||
|
|
||||||
async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
|
|
||||||
"""Run benchmark using async requests for specified duration."""
|
|
||||||
stats = BenchmarkStats()
|
|
||||||
stats.concurrent_users = concurrent_users
|
|
||||||
stats.start_time = time.time()
|
|
||||||
|
|
||||||
print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
|
|
||||||
print(f"Target URL: {self.base_url}/chat/completions")
|
|
||||||
print(f"Model: {self.model_id}")
|
|
||||||
|
|
||||||
connector = aiohttp.TCPConnector(limit=concurrent_users)
|
|
||||||
async with aiohttp.ClientSession(connector=connector) as session:
|
|
||||||
|
|
||||||
async def worker(worker_id: int):
|
|
||||||
"""Worker that sends requests sequentially until canceled."""
|
|
||||||
request_count = 0
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
response_time, chunks, ttft, error = await self.make_async_streaming_request()
|
|
||||||
await stats.add_result(response_time, chunks, ttft, error)
|
|
||||||
request_count += 1
|
|
||||||
|
|
||||||
except asyncio.CancelledError:
|
|
||||||
break
|
|
||||||
except Exception as e:
|
|
||||||
await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")
|
|
||||||
|
|
||||||
# Progress reporting task
|
|
||||||
async def progress_reporter():
|
|
||||||
last_report_time = time.time()
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
await asyncio.sleep(1) # Report every second
|
|
||||||
if time.time() >= last_report_time + 10: # Report every 10 seconds
|
|
||||||
elapsed = time.time() - stats.start_time
|
|
||||||
print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
|
|
||||||
last_report_time = time.time()
|
|
||||||
except asyncio.CancelledError:
|
|
||||||
break
|
|
||||||
|
|
||||||
# Spawn concurrent workers
|
|
||||||
tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
|
|
||||||
progress_task = asyncio.create_task(progress_reporter())
|
|
||||||
tasks.append(progress_task)
|
|
||||||
|
|
||||||
# Wait for duration then cancel all tasks
|
|
||||||
await asyncio.sleep(duration)
|
|
||||||
|
|
||||||
for task in tasks:
|
|
||||||
task.cancel()
|
|
||||||
|
|
||||||
# Wait for all tasks to complete
|
|
||||||
await asyncio.gather(*tasks, return_exceptions=True)
|
|
||||||
|
|
||||||
stats.end_time = time.time()
|
|
||||||
return stats
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
|
|
||||||
parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
|
|
||||||
help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
|
|
||||||
parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
|
|
||||||
help="Model ID to use for requests")
|
|
||||||
parser.add_argument("--duration", type=int, default=60,
|
|
||||||
help="Duration in seconds to run benchmark (default: 60)")
|
|
||||||
parser.add_argument("--concurrent", type=int, default=10,
|
|
||||||
help="Number of concurrent users (default: 10)")
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
benchmark = LlamaStackBenchmark(args.base_url, args.model)
|
|
||||||
|
|
||||||
try:
|
|
||||||
stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent))
|
|
||||||
stats.print_summary()
|
|
||||||
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
print("\nBenchmark interrupted by user")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Benchmark failed: {e}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
|
@@ -1,190 +0,0 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""
OpenAI-compatible mock server that returns:
- Hardcoded /models response for consistent validation
- Valid OpenAI-formatted chat completion responses with dynamic content
"""

from flask import Flask, request, jsonify, Response
import time
import random
import uuid
import json
import argparse
import os

app = Flask(__name__)

# Models from environment variables
def get_models():
    models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
    model_ids = [m.strip() for m in models_str.split(",") if m.strip()]

    return {
        "object": "list",
        "data": [
            {
                "id": model_id,
                "object": "model",
                "created": 1234567890,
                "owned_by": "vllm"
            }
            for model_id in model_ids
        ]
    }

def generate_random_text(length=50):
    """Generate random but coherent text for responses."""
    words = [
        "Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
        "with", "your", "questions", "and", "tasks", "today", "Let", "me", "know", "what",
        "you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
        "with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more"
    ]
    return " ".join(random.choices(words, k=length))

@app.route('/v1/models', methods=['GET'])
def list_models():
    models = get_models()
    print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
    return jsonify(models)

@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
    """Return OpenAI-formatted chat completion responses."""
    data = request.get_json()
    default_model = get_models()['data'][0]['id']
    model = data.get('model', default_model)
    messages = data.get('messages', [])
    stream = data.get('stream', False)

    print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")

    if stream:
        return handle_streaming_completion(model, messages)
    else:
        return handle_non_streaming_completion(model, messages)

def handle_non_streaming_completion(model, messages):
    response_text = generate_random_text(random.randint(20, 80))

    # Calculate realistic token counts
    prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages)
    completion_tokens = len(response_text.split())

    response = {
        "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": response_text
                },
                "finish_reason": "stop"
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens
        }
    }

    return jsonify(response)

def handle_streaming_completion(model, messages):
    def generate_stream():
        # Generate response text
        full_response = generate_random_text(random.randint(30, 100))
        words = full_response.split()

        # Send initial chunk
        initial_chunk = {
            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "delta": {"role": "assistant", "content": ""}
                }
            ]
        }
        yield f"data: {json.dumps(initial_chunk)}\n\n"

        # Send word by word
        for i, word in enumerate(words):
            chunk = {
                "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
                "object": "chat.completion.chunk",
                "created": int(time.time()),
                "model": model,
                "choices": [
                    {
                        "index": 0,
                        "delta": {"content": f"{word} " if i < len(words) - 1 else word}
                    }
                ]
            }
            yield f"data: {json.dumps(chunk)}\n\n"
            # Configurable delay to simulate realistic streaming
            stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005"))
            time.sleep(stream_delay)

        # Send final chunk
        final_chunk = {
            "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": model,
            "choices": [
                {
                    "index": 0,
                    "delta": {"content": ""},
                    "finish_reason": "stop"
                }
            ]
        }
        yield f"data: {json.dumps(final_chunk)}\n\n"
        yield "data: [DONE]\n\n"

    return Response(
        generate_stream(),
        mimetype='text/event-stream',
        headers={
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Access-Control-Allow-Origin': '*',
        }
    )

@app.route('/health', methods=['GET'])
def health():
    return jsonify({"status": "healthy", "type": "openai-mock"})

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='OpenAI-compatible mock server')
    parser.add_argument('--port', type=int, default=8081,
                        help='Port to run the server on (default: 8081)')
    args = parser.parse_args()

    port = args.port

    models = get_models()
    print("Starting OpenAI-compatible mock server...")
    print(f"- /models endpoint with: {[m['id'] for m in models['data']]}")
    print("- OpenAI-formatted chat/completion responses with dynamic content")
    print("- Streaming support with valid SSE format")
    print(f"- Listening on: http://0.0.0.0:{port}")
    app.run(host='0.0.0.0', port=port, debug=False)
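The mock server exposes only the three routes shown above (`/health`, `/v1/models`, `/v1/chat/completions`), so it can be sanity-checked by hand with plain curl. A short hedged session sketch; the response bodies are randomized by design, so only the shapes are predictable:

```bash
python openai-mock-server.py --port 8081 &

# Health check and model listing
curl -s http://localhost:8081/health
curl -s http://localhost:8081/v1/models

# Streaming chat completion: a series of SSE chunks terminated by "data: [DONE]"
curl -s -N http://localhost:8081/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "meta-llama/Llama-3.2-3B-Instruct", "messages": [{"role": "user", "content": "Hi"}], "stream": true}'
```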
@@ -1,52 +0,0 @@
#!/bin/bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# Script to profile an already running Llama Stack server
# Usage: ./profile_running_server.sh [duration_seconds] [output_file]

DURATION=${1:-60}  # Default 60 seconds
OUTPUT_FILE=${2:-"llama_stack_profile"}  # Default output file

echo "Looking for running Llama Stack server..."

# Find the server PID
SERVER_PID=$(ps aux | grep "llama_stack.core.server.server" | grep -v grep | awk '{print $2}' | head -1)

if [ -z "$SERVER_PID" ]; then
    echo "Error: No running Llama Stack server found"
    echo "Please start your server first with:"
    echo "LLAMA_STACK_LOGGING=\"all=ERROR\" MOCK_INFERENCE_URL=http://localhost:8080 SAFETY_MODEL=llama-guard3:1b uv run --with llama-stack python -m llama_stack.core.server.server docs/source/distributions/k8s-benchmark/stack_run_config.yaml"
    exit 1
fi

echo "Found Llama Stack server with PID: $SERVER_PID"

# Start py-spy profiling
echo "Starting py-spy profiling for ${DURATION} seconds..."
echo "Output will be saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "You can now run your load test..."
echo ""

# Get the full path to py-spy
PYSPY_PATH=$(which py-spy)

# Check if running as root, if not, use sudo
if [ "$EUID" -ne 0 ]; then
    echo "py-spy requires root permissions on macOS. Running with sudo..."
    sudo "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
else
    "$PYSPY_PATH" record -o "${OUTPUT_FILE}.svg" -d ${DURATION} -p $SERVER_PID
fi

echo ""
echo "Profiling completed! Results saved to: ${OUTPUT_FILE}.svg"
echo ""
echo "To view the flame graph:"
echo "open ${OUTPUT_FILE}.svg"
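The script takes the two positional arguments documented in its usage comment, so a typical profiling session looks roughly like the sketch below (the output name is arbitrary):

```bash
# Profile the running server for 2 minutes while a load test runs in another shell.
./profile_running_server.sh 120 stack_under_load

# Inspect the resulting flame graph (macOS; use xdg-open on Linux).
open stack_under_load.svg
```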
@@ -1,148 +0,0 @@
#!/usr/bin/env bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

set -euo pipefail

# Default values
TARGET="stack"
DURATION=60
CONCURRENT=10

# Parse command line arguments
usage() {
    echo "Usage: $0 [options]"
    echo "Options:"
    echo "  -t, --target <stack|vllm>     Target to benchmark (default: stack)"
    echo "  -d, --duration <seconds>      Duration in seconds (default: 60)"
    echo "  -c, --concurrent <users>      Number of concurrent users (default: 10)"
    echo "  -h, --help                    Show this help message"
    echo ""
    echo "Examples:"
    echo "  $0 --target vllm              # Benchmark vLLM direct"
    echo "  $0 --target stack             # Benchmark Llama Stack (default)"
    echo "  $0 -t vllm -d 120 -c 20       # vLLM with 120s duration, 20 users"
}

while [[ $# -gt 0 ]]; do
    case $1 in
        -t|--target)
            TARGET="$2"
            shift 2
            ;;
        -d|--duration)
            DURATION="$2"
            shift 2
            ;;
        -c|--concurrent)
            CONCURRENT="$2"
            shift 2
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            usage
            exit 1
            ;;
    esac
done

# Validate target
if [[ "$TARGET" != "stack" && "$TARGET" != "vllm" ]]; then
    echo "Error: Target must be 'stack' or 'vllm'"
    usage
    exit 1
fi

# Set configuration based on target
if [[ "$TARGET" == "vllm" ]]; then
    BASE_URL="http://vllm-server:8000/v1"
    JOB_NAME="vllm-benchmark-job"
    echo "Benchmarking vLLM direct..."
else
    BASE_URL="http://llama-stack-benchmark-service:8323/v1/openai/v1"
    JOB_NAME="stack-benchmark-job"
    echo "Benchmarking Llama Stack..."
fi

echo "Configuration:"
echo "  Target: $TARGET"
echo "  Base URL: $BASE_URL"
echo "  Duration: ${DURATION}s"
echo "  Concurrent users: $CONCURRENT"
echo ""

# Create temporary job yaml
TEMP_YAML="/tmp/benchmark-job-temp-$(date +%s).yaml"
cat > "$TEMP_YAML" << EOF
apiVersion: batch/v1
kind: Job
metadata:
  name: $JOB_NAME
  namespace: default
spec:
  template:
    spec:
      containers:
      - name: benchmark
        image: python:3.11-slim
        command: ["/bin/bash"]
        args:
        - "-c"
        - |
          pip install aiohttp &&
          python3 /benchmark/benchmark.py \\
            --base-url $BASE_URL \\
            --model \${INFERENCE_MODEL} \\
            --duration $DURATION \\
            --concurrent $CONCURRENT
        env:
        - name: INFERENCE_MODEL
          value: "meta-llama/Llama-3.2-3B-Instruct"
        volumeMounts:
        - name: benchmark-script
          mountPath: /benchmark
        resources:
          requests:
            memory: "256Mi"
            cpu: "250m"
          limits:
            memory: "512Mi"
            cpu: "500m"
      volumes:
      - name: benchmark-script
        configMap:
          name: benchmark-script
      restartPolicy: Never
  backoffLimit: 3
EOF

echo "Creating benchmark ConfigMap..."
kubectl create configmap benchmark-script \
  --from-file=benchmark.py=benchmark.py \
  --dry-run=client -o yaml | kubectl apply -f -

echo "Cleaning up any existing benchmark job..."
kubectl delete job $JOB_NAME 2>/dev/null || true

echo "Deploying benchmark Job..."
kubectl apply -f "$TEMP_YAML"

echo "Waiting for job to start..."
kubectl wait --for=condition=Ready pod -l job-name=$JOB_NAME --timeout=60s

echo "Following benchmark logs..."
kubectl logs -f job/$JOB_NAME

echo "Job completed. Checking final status..."
kubectl get job $JOB_NAME

# Clean up temporary file
rm -f "$TEMP_YAML"
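Because the script parameterizes only the target, duration, and concurrency, a like-for-like comparison of the stack path versus raw vLLM is just two invocations with the same load shape, assuming kubectl is already pointed at the cluster where both services run:

```bash
# Benchmark Llama Stack, then vLLM directly, with identical settings.
./run-benchmark.sh --target stack --duration 120 --concurrent 20
./run-benchmark.sh --target vllm  --duration 120 --concurrent 20
```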
@@ -1,133 +0,0 @@
apiVersion: v1
data:
  stack_run_config.yaml: |
    version: '2'
    image_name: kubernetes-benchmark-demo
    apis:
    - agents
    - inference
    - safety
    - telemetry
    - tool_runtime
    - vector_io
    providers:
      inference:
      - provider_id: vllm-inference
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: vllm-safety
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: sentence-transformers
        provider_type: inline::sentence-transformers
        config: {}
      vector_io:
      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
        provider_type: remote::chromadb
        config:
          url: ${env.CHROMADB_URL:=}
          kvstore:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      safety:
      - provider_id: llama-guard
        provider_type: inline::llama-guard
        config:
          excluded_categories: []
      agents:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          persistence_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
          responses_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      telemetry:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
          sinks: ${env.TELEMETRY_SINKS:=console}
      tool_runtime:
      - provider_id: brave-search
        provider_type: remote::brave-search
        config:
          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: tavily-search
        provider_type: remote::tavily-search
        config:
          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: rag-runtime
        provider_type: inline::rag-runtime
        config: {}
      - provider_id: model-context-protocol
        provider_type: remote::model-context-protocol
        config: {}
    metadata_store:
      type: postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
      table_name: llamastack_kvstore
    inference_store:
      type: postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
    models:
    - metadata:
        embedding_dimension: 384
      model_id: all-MiniLM-L6-v2
      provider_id: sentence-transformers
      model_type: embedding
    - model_id: ${env.INFERENCE_MODEL}
      provider_id: vllm-inference
      model_type: llm
    - model_id: ${env.SAFETY_MODEL}
      provider_id: vllm-safety
      model_type: llm
    shields:
    - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
    vector_dbs: []
    datasets: []
    scoring_fns: []
    benchmarks: []
    tool_groups:
    - toolgroup_id: builtin::websearch
      provider_id: tavily-search
    - toolgroup_id: builtin::rag
      provider_id: rag-runtime
    server:
      port: 8323
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: llama-stack-config
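If you want to resurrect this ConfigMap from the diff, applying it and spot-checking the embedded run config is enough to confirm the reconstruction parses as YAML. A hedged sketch, assuming the manifest is saved as stack-configmap.yaml:

```bash
# Apply the ConfigMap and print the first lines of the embedded run config.
kubectl apply -f stack-configmap.yaml
kubectl get configmap llama-stack-config \
  -o jsonpath='{.data.stack_run_config\.yaml}' | head -n 20
```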
@@ -1,83 +0,0 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-benchmark-pvc
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-stack-benchmark-server
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: llama-stack-benchmark
      app.kubernetes.io/component: server
  template:
    metadata:
      labels:
        app.kubernetes.io/name: llama-stack-benchmark
        app.kubernetes.io/component: server
    spec:
      containers:
      - name: llama-stack-benchmark
        image: llamastack/distribution-starter:latest
        imagePullPolicy: Always # since we have specified latest instead of a version
        env:
        - name: ENABLE_CHROMADB
          value: "true"
        - name: CHROMADB_URL
          value: http://chromadb.default.svc.cluster.local:6000
        - name: POSTGRES_HOST
          value: postgres-server.default.svc.cluster.local
        - name: POSTGRES_PORT
          value: "5432"
        - name: INFERENCE_MODEL
          value: "${INFERENCE_MODEL}"
        - name: SAFETY_MODEL
          value: "${SAFETY_MODEL}"
        - name: TAVILY_SEARCH_API_KEY
          value: "${TAVILY_SEARCH_API_KEY}"
        - name: VLLM_URL
          value: http://vllm-server.default.svc.cluster.local:8000/v1
        - name: VLLM_MAX_TOKENS
          value: "3072"
        - name: VLLM_SAFETY_URL
          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
        - name: VLLM_TLS_VERIFY
          value: "false"
        command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8323"]
        ports:
          - containerPort: 8323
        volumeMounts:
          - name: llama-storage
            mountPath: /root/.llama
          - name: llama-config
            mountPath: /etc/config
      volumes:
      - name: llama-storage
        persistentVolumeClaim:
          claimName: llama-benchmark-pvc
      - name: llama-config
        configMap:
          name: llama-stack-config
---
apiVersion: v1
kind: Service
metadata:
  name: llama-stack-benchmark-service
spec:
  selector:
    app.kubernetes.io/name: llama-stack-benchmark
    app.kubernetes.io/component: server
  ports:
    - name: http
      port: 8323
      targetPort: 8323
  type: ClusterIP
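The manifest above is a template: `${INFERENCE_MODEL}`, `${SAFETY_MODEL}`, and `${TAVILY_SEARCH_API_KEY}` are placeholders that must be substituted before applying. One way to render it is `envsubst`; that tool choice and the example model names are assumptions, not something the original diff prescribes:

```bash
# Fill in the placeholders and deploy the benchmark server.
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export TAVILY_SEARCH_API_KEY=dummy-key
envsubst < stack-k8s.yaml.template | kubectl apply -f -

# Confirm the pod comes up and the service is reachable on port 8323.
kubectl get pods -l app.kubernetes.io/name=llama-stack-benchmark
```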
@@ -1,108 +0,0 @@
version: '2'
image_name: kubernetes-benchmark-demo
apis:
- agents
- inference
- telemetry
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: vllm-inference
    provider_type: remote::vllm
    config:
      url: ${env.VLLM_URL:=http://localhost:8000/v1}
      max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
      api_token: ${env.VLLM_API_TOKEN:=fake}
      tls_verify: ${env.VLLM_TLS_VERIFY:=true}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
    config: {}
  vector_io:
  - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
    provider_type: remote::chromadb
    config:
      url: ${env.CHROMADB_URL:=}
      kvstore:
        type: postgres
        host: ${env.POSTGRES_HOST:=localhost}
        port: ${env.POSTGRES_PORT:=5432}
        db: ${env.POSTGRES_DB:=llamastack}
        user: ${env.POSTGRES_USER:=llamastack}
        password: ${env.POSTGRES_PASSWORD:=llamastack}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: postgres
        host: ${env.POSTGRES_HOST:=localhost}
        port: ${env.POSTGRES_PORT:=5432}
        db: ${env.POSTGRES_DB:=llamastack}
        user: ${env.POSTGRES_USER:=llamastack}
        password: ${env.POSTGRES_PASSWORD:=llamastack}
      responses_store:
        type: postgres
        host: ${env.POSTGRES_HOST:=localhost}
        port: ${env.POSTGRES_PORT:=5432}
        db: ${env.POSTGRES_DB:=llamastack}
        user: ${env.POSTGRES_USER:=llamastack}
        password: ${env.POSTGRES_PASSWORD:=llamastack}
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
      sinks: ${env.TELEMETRY_SINKS:=console}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
      max_results: 3
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
    config: {}
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
    config: {}
metadata_store:
  type: postgres
  host: ${env.POSTGRES_HOST:=localhost}
  port: ${env.POSTGRES_PORT:=5432}
  db: ${env.POSTGRES_DB:=llamastack}
  user: ${env.POSTGRES_USER:=llamastack}
  password: ${env.POSTGRES_PASSWORD:=llamastack}
  table_name: llamastack_kvstore
inference_store:
  type: postgres
  host: ${env.POSTGRES_HOST:=localhost}
  port: ${env.POSTGRES_PORT:=5432}
  db: ${env.POSTGRES_DB:=llamastack}
  user: ${env.POSTGRES_USER:=llamastack}
  password: ${env.POSTGRES_PASSWORD:=llamastack}
models:
- metadata:
    embedding_dimension: 384
  model_id: all-MiniLM-L6-v2
  provider_id: sentence-transformers
  model_type: embedding
- model_id: ${env.INFERENCE_MODEL}
  provider_id: vllm-inference
  model_type: llm
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
  provider_id: tavily-search
- toolgroup_id: builtin::rag
  provider_id: rag-runtime
server:
  port: 8323
@@ -34,13 +34,6 @@ data:
         provider_type: remote::chromadb
         config:
           url: ${env.CHROMADB_URL:=}
-          kvstore:
-            type: postgres
-            host: ${env.POSTGRES_HOST:=localhost}
-            port: ${env.POSTGRES_PORT:=5432}
-            db: ${env.POSTGRES_DB:=llamastack}
-            user: ${env.POSTGRES_USER:=llamastack}
-            password: ${env.POSTGRES_PASSWORD:=llamastack}
       safety:
       - provider_id: llama-guard
         provider_type: inline::llama-guard
@@ -40,19 +40,19 @@ spec:
           value: "3072"
         - name: VLLM_SAFETY_URL
           value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
-        - name: VLLM_TLS_VERIFY
-          value: "false"
         - name: POSTGRES_HOST
           value: postgres-server.default.svc.cluster.local
         - name: POSTGRES_PORT
           value: "5432"
+        - name: VLLM_TLS_VERIFY
+          value: "false"
         - name: INFERENCE_MODEL
           value: "${INFERENCE_MODEL}"
        - name: SAFETY_MODEL
           value: "${SAFETY_MODEL}"
         - name: TAVILY_SEARCH_API_KEY
           value: "${TAVILY_SEARCH_API_KEY}"
-        command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8321"]
+        command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]
         ports:
           - containerPort: 8321
         volumeMounts:
@@ -31,13 +31,6 @@ providers:
     provider_type: remote::chromadb
     config:
       url: ${env.CHROMADB_URL:=}
-      kvstore:
-        type: postgres
-        host: ${env.POSTGRES_HOST:=localhost}
-        port: ${env.POSTGRES_PORT:=5432}
-        db: ${env.POSTGRES_DB:=llamastack}
-        user: ${env.POSTGRES_USER:=llamastack}
-        password: ${env.POSTGRES_PASSWORD:=llamastack}
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
@@ -56,12 +56,12 @@ Breaking down the demo app, this section will show the core pieces that are used
 ### Setup Remote Inferencing
 Start a Llama Stack server on localhost. Here is an example of how you can do this using the firework.ai distribution:
 ```
-uv venv starter --python 3.12
-source starter/bin/activate # On Windows: starter\Scripts\activate
+conda create -n stack-fireworks python=3.10
+conda activate stack-fireworks
 pip install --no-cache llama-stack==0.2.2
-llama stack build --distro starter --image-type venv
+llama stack build --template fireworks --image-type conda
 export FIREWORKS_API_KEY=<SOME_KEY>
-llama stack run starter --port 5050
+llama stack run fireworks --port 5050
 ```
 
 Ensure the Llama Stack server version is the same as the Kotlin SDK Library for maximum compatibility.
@@ -57,7 +57,7 @@ Make sure you have access to a watsonx API Key. You can get one by referring [wa
 
 ## Running Llama Stack with watsonx
 
-You can do this via venv or Docker which has a pre-built image.
+You can do this via Conda (build code), venv or Docker which has a pre-built image.
 
 ### Via Docker
 
@@ -76,3 +76,13 @@ docker run \
   --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
   --env WATSONX_BASE_URL=$WATSONX_BASE_URL
 ```
+
+### Via Conda
+
+```bash
+llama stack build --template watsonx --image-type conda
+llama stack run ./run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env WATSONX_API_KEY=$WATSONX_API_KEY \
+  --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID
+```
@@ -114,7 +114,7 @@ podman run --rm -it \
 
 ## Running Llama Stack
 
-Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via venv or Docker which has a pre-built image.
+Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
 
 ### Via Docker
 
@@ -153,7 +153,7 @@ docker run \
   --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v $HOME/.llama:/root/.llama \
-  -v ./llama_stack/distributions/tgi/run-with-safety.yaml:/root/my-run.yaml \
+  -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-dell \
   --config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
@@ -164,12 +164,12 @@ docker run \
   --env CHROMA_URL=$CHROMA_URL
 ```
 
-### Via venv
+### Via Conda
 
 Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
 
 ```bash
-llama stack build --distro dell --image-type venv
+llama stack build --template dell --image-type conda
 llama stack run dell
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
@@ -70,7 +70,7 @@ $ llama model list --downloaded
 
 ## Running the Distribution
 
-You can do this via venv or Docker which has a pre-built image.
+You can do this via Conda (build code) or Docker which has a pre-built image.
 
 ### Via Docker
 
@@ -104,12 +104,12 @@ docker run \
   --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 ```
 
-### Via venv
+### Via Conda
 
 Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
 
 ```bash
-llama stack build --distro meta-reference-gpu --image-type venv
+llama stack build --template meta-reference-gpu --image-type conda
 llama stack run distributions/meta-reference-gpu/run.yaml \
   --port 8321 \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
@@ -133,7 +133,7 @@ curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-inst
 
 ## Running Llama Stack with NVIDIA
 
-You can do this via venv (build code), or Docker which has a pre-built image.
+You can do this via Conda or venv (build code), or Docker which has a pre-built image.
 
 ### Via Docker
 
@@ -152,13 +152,24 @@ docker run \
   --env NVIDIA_API_KEY=$NVIDIA_API_KEY
 ```
 
+### Via Conda
+
+```bash
+INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
+llama stack build --template nvidia --image-type conda
+llama stack run ./run.yaml \
+  --port 8321 \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL
+```
+
 ### Via venv
 
 If you've set up your local development environment, you can also build the image using your local virtual environment.
 
 ```bash
-INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
-llama stack build --distro nvidia --image-type venv
+INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
+llama stack build --template nvidia --image-type venv
 llama stack run ./run.yaml \
   --port 8321 \
   --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
@@ -100,6 +100,10 @@ The following environment variables can be configured:
 ### Model Configuration
 - `INFERENCE_MODEL`: HuggingFace model for serverless inference
 - `INFERENCE_ENDPOINT_NAME`: HuggingFace endpoint name
+- `OLLAMA_INFERENCE_MODEL`: Ollama model name
+- `OLLAMA_EMBEDDING_MODEL`: Ollama embedding model name
+- `OLLAMA_EMBEDDING_DIMENSION`: Ollama embedding dimension (default: `384`)
+- `VLLM_INFERENCE_MODEL`: vLLM model name
 
 ### Vector Database Configuration
 - `SQLITE_STORE_DIR`: SQLite store directory (default: `~/.llama/distributions/starter`)
@@ -123,29 +127,47 @@ The following environment variables can be configured:
 
 ## Enabling Providers
 
-You can enable specific providers by setting appropriate environment variables. For example,
+You can enable specific providers by setting their provider ID to a valid value using environment variables. This is useful when you want to use certain providers or don't have the required API keys.
+
+### Examples of Enabling Providers
 
+#### Enable FAISS Vector Provider
 ```bash
-# self-hosted
-export OLLAMA_URL=http://localhost:11434 # enables the Ollama inference provider
-export VLLM_URL=http://localhost:8000/v1 # enables the vLLM inference provider
-export TGI_URL=http://localhost:8000/v1 # enables the TGI inference provider
-
-# cloud-hosted requiring API key configuration on the server
-export CEREBRAS_API_KEY=your_cerebras_api_key # enables the Cerebras inference provider
-export NVIDIA_API_KEY=your_nvidia_api_key # enables the NVIDIA inference provider
-
-# vector providers
-export MILVUS_URL=http://localhost:19530 # enables the Milvus vector provider
-export CHROMADB_URL=http://localhost:8000/v1 # enables the ChromaDB vector provider
-export PGVECTOR_DB=llama_stack_db # enables the PGVector vector provider
+export ENABLE_FAISS=faiss
 ```
 
-This distribution comes with a default "llama-guard" shield that can be enabled by setting the `SAFETY_MODEL` environment variable to point to an appropriate Llama Guard model id. Use `llama-stack-client models list` to see the list of available models.
+#### Enable Ollama Models
+```bash
+export ENABLE_OLLAMA=ollama
+```
+
+#### Disable vLLM Models
+```bash
+export VLLM_INFERENCE_MODEL=__disabled__
+```
+
+#### Disable Optional Vector Providers
+```bash
+export ENABLE_SQLITE_VEC=__disabled__
+export ENABLE_CHROMADB=__disabled__
+export ENABLE_PGVECTOR=__disabled__
+```
+
+### Provider ID Patterns
+
+The starter distribution uses several patterns for provider IDs:
+
+1. **Direct provider IDs**: `faiss`, `ollama`, `vllm`
+2. **Environment-based provider IDs**: `${env.ENABLE_SQLITE_VEC:+sqlite-vec}`
+3. **Model-based provider IDs**: `${env.OLLAMA_INFERENCE_MODEL:__disabled__}`
+
+When using the `+` pattern (like `${env.ENABLE_SQLITE_VEC+sqlite-vec}`), the provider is enabled by default and can be disabled by setting the environment variable to `__disabled__`.
+
+When using the `:` pattern (like `${env.OLLAMA_INFERENCE_MODEL:__disabled__}`), the provider is disabled by default and can be enabled by setting the environment variable to a valid value.
 
 ## Running the Distribution
 
-You can run the starter distribution via Docker or venv.
+You can run the starter distribution via Docker, Conda, or venv.
 
 ### Via Docker
 
@@ -164,12 +186,12 @@ docker run \
   --port $LLAMA_STACK_PORT
 ```
 
-### Via venv
+### Via Conda or venv
 
 Ensure you have configured the starter distribution using the environment variables explained above.
 
 ```bash
-uv run --with llama-stack llama stack build --distro starter --image-type venv --run
+uv run --with llama-stack llama stack build --template starter --image-type <conda|venv> --run
 ```
 
 ## Example Usage
 
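The enable/disable pattern documented in the hunk above comes down to exporting either a provider ID or the literal `__disabled__` before starting the stack. A short illustration, using only the variable names that appear in the documentation being diffed (the model name is an example value):

```bash
# Turn the Ollama provider on and the optional vector providers off,
# then start the starter distribution as usual.
export ENABLE_OLLAMA=ollama
export OLLAMA_INFERENCE_MODEL=llama3.2:3b
export ENABLE_SQLITE_VEC=__disabled__
export ENABLE_CHROMADB=__disabled__
export ENABLE_PGVECTOR=__disabled__
```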
@@ -11,6 +11,12 @@ This is the simplest way to get started. Using Llama Stack as a library means yo
 
 Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
 
+
+## Conda:
+
+If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
+
+
 ## Kubernetes:
 
 If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally. See [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
@@ -52,16 +52,11 @@ agent = Agent(
 prompt = "How do you do great work?"
 print("prompt>", prompt)
 
-use_stream = True
 response = agent.create_turn(
     messages=[{"role": "user", "content": prompt}],
     session_id=agent.create_session("rag_session"),
-    stream=use_stream,
+    stream=True,
 )
 
-# Only call `AgentEventLogger().log(response)` for streaming responses.
-if use_stream:
-    for log in AgentEventLogger().log(response):
-        log.print()
-else:
-    print(response)
+for log in AgentEventLogger().log(response):
+    log.print()
@@ -59,10 +59,10 @@ Now let's build and run the Llama Stack config for Ollama.
 We use `starter` as template. By default all providers are disabled, this requires enable ollama by passing environment variables.
 
 ```bash
-llama stack build --distro starter --image-type venv --run
+llama stack build --template starter --image-type venv --run
 ```
 :::
-:::{tab-item} Using `venv`
+:::{tab-item} Using `conda`
 You can use Python to build and run the Llama Stack server, which is useful for testing and development.
 
 Llama Stack uses a [YAML configuration file](../distributions/configuration.md) to specify the stack setup,
@@ -70,7 +70,7 @@ which defines the providers and their settings.
 Now let's build and run the Llama Stack config for Ollama.
 
 ```bash
-llama stack build --distro starter --image-type venv --run
+llama stack build --template starter --image-type conda --run
 ```
 :::
 :::{tab-item} Using a Container
@@ -150,7 +150,13 @@ pip install llama-stack-client
 ```
 :::
 
+:::{tab-item} Install with `conda`
+```bash
+yes | conda create -n stack-client python=3.12
+conda activate stack-client
+pip install llama-stack-client
+```
+:::
 ::::
 
 Now let's use the `llama-stack-client` [CLI](../references/llama_stack_client_cli_reference.md) to check the
@@ -16,13 +16,10 @@ as the inference [provider](../providers/inference/index) for a Llama Model.
 ```bash
 ollama run llama3.2:3b --keepalive 60m
 ```
-
 #### Step 2: Run the Llama Stack server
-
 We will use `uv` to run the Llama Stack server.
 ```bash
-OLLAMA_URL=http://localhost:11434 \
-uv run --with llama-stack llama stack build --distro starter --image-type venv --run
+uv run --with llama-stack llama stack build --template starter --image-type venv --run
 ```
 #### Step 3: Run the demo
 Now open up a new terminal and copy the following script into a file named `demo_script.py`.
@@ -1,22 +1,5 @@
-# Agents
+# Agents Providers
 
-## Overview
-
-Agents API for creating and interacting with agentic systems.
-
-Main functionalities provided by this API:
-- Create agents with specific instructions and ability to use tools.
-- Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
-- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
-- Agents can be provided with various shields (see the Safety API for more details).
-- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.
-
 This section contains documentation for all available providers for the **agents** API.
 
-## Providers
-
-```{toctree}
-:maxdepth: 1
-
-inline_meta-reference
-```
+- [inline::meta-reference](inline_meta-reference.md)
@@ -1,21 +0,0 @@
# Batches

## Overview

Protocol for batch processing API operations.

The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.

Note: This API is currently under active development and may undergo changes.

This section contains documentation for all available providers for the **batches** API.

## Providers

```{toctree}
:maxdepth: 1

inline_reference
```
@@ -1,23 +0,0 @@
# inline::reference

## Description

Reference implementation of batches API with KVStore persistence.

## Configuration

| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Configuration for the key-value store backend. |
| `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
| `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. |

## Sample Configuration

```yaml
kvstore:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/batches.db

```
Some files were not shown because too many files have changed in this diff.