Merge branch 'main' into fix/vector-db-mandatory-provider-id

This commit is contained in:
Habeb Nawatha 2025-09-11 12:02:37 +03:00 committed by GitHub
commit 4374da02f3
243 changed files with 21774 additions and 17408 deletions


@@ -2,26 +2,28 @@ name: 'Run and Record Tests'
 description: 'Run integration tests and handle recording/artifact upload'
 inputs:
-  test-subdirs:
-    description: 'Comma-separated list of test subdirectories to run'
-    required: true
-  test-pattern:
-    description: 'Regex pattern to pass to pytest -k'
-    required: false
-    default: ''
   stack-config:
     description: 'Stack configuration to use'
     required: true
-  provider:
-    description: 'Provider to use for tests'
-    required: true
+  setup:
+    description: 'Setup to use for tests (e.g., ollama, gpt, vllm)'
+    required: false
+    default: ''
   inference-mode:
     description: 'Inference mode (record or replay)'
     required: true
-  run-vision-tests:
-    description: 'Whether to run vision tests'
-    required: false
-    default: 'false'
+  suite:
+    description: 'Test suite to use: base, responses, vision, etc.'
+    required: false
+    default: ''
+  subdirs:
+    description: 'Comma-separated list of test subdirectories to run; overrides suite'
+    required: false
+    default: ''
+  pattern:
+    description: 'Regex pattern to pass to pytest -k'
+    required: false
+    default: ''

 runs:
   using: 'composite'

@@ -36,14 +38,23 @@ runs:
     - name: Run Integration Tests
       shell: bash
       run: |
-        uv run --no-sync ./scripts/integration-tests.sh \
-          --stack-config '${{ inputs.stack-config }}' \
-          --provider '${{ inputs.provider }}' \
-          --test-subdirs '${{ inputs.test-subdirs }}' \
-          --test-pattern '${{ inputs.test-pattern }}' \
-          --inference-mode '${{ inputs.inference-mode }}' \
-          ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \
-          | tee pytest-${{ inputs.inference-mode }}.log
+        SCRIPT_ARGS="--stack-config ${{ inputs.stack-config }} --inference-mode ${{ inputs.inference-mode }}"
+
+        # Add optional arguments only if they are provided
+        if [ -n '${{ inputs.setup }}' ]; then
+          SCRIPT_ARGS="$SCRIPT_ARGS --setup ${{ inputs.setup }}"
+        fi
+        if [ -n '${{ inputs.suite }}' ]; then
+          SCRIPT_ARGS="$SCRIPT_ARGS --suite ${{ inputs.suite }}"
+        fi
+        if [ -n '${{ inputs.subdirs }}' ]; then
+          SCRIPT_ARGS="$SCRIPT_ARGS --subdirs ${{ inputs.subdirs }}"
+        fi
+        if [ -n '${{ inputs.pattern }}' ]; then
+          SCRIPT_ARGS="$SCRIPT_ARGS --pattern ${{ inputs.pattern }}"
+        fi
+
+        uv run --no-sync ./scripts/integration-tests.sh $SCRIPT_ARGS | tee pytest-${{ inputs.inference-mode }}.log

     - name: Commit and push recordings

@@ -57,12 +68,7 @@ runs:
           echo "New recordings detected, committing and pushing"
           git add tests/integration/recordings/

-          if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
-            git commit -m "Recordings update from CI (vision)"
-          else
-            git commit -m "Recordings update from CI"
-          fi
+          git commit -m "Recordings update from CI (suite: ${{ inputs.suite }})"

           git fetch origin ${{ github.ref_name }}
           git rebase origin/${{ github.ref_name }}
           echo "Rebased successfully"


@@ -1,17 +1,17 @@
 name: Setup Ollama
 description: Start Ollama
 inputs:
-  run-vision-tests:
-    description: 'Run vision tests: "true" or "false"'
+  suite:
+    description: 'Test suite to use: base, responses, vision, etc.'
     required: false
-    default: 'false'
+    default: ''
 runs:
   using: "composite"
   steps:
     - name: Start Ollama
       shell: bash
       run: |
-        if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
+        if [ "${{ inputs.suite }}" == "vision" ]; then
           image="ollama-with-vision-model"
         else
           image="ollama-with-models"


@@ -8,14 +8,14 @@ inputs:
   client-version:
     description: 'Client version (latest or published)'
     required: true
-  provider:
-    description: 'Provider to setup (ollama or vllm)'
-    required: true
-    default: 'ollama'
-  run-vision-tests:
-    description: 'Whether to setup provider for vision tests'
-    required: false
-    default: 'false'
+  setup:
+    description: 'Setup to configure (ollama, vllm, gpt, etc.)'
+    required: false
+    default: 'ollama'
+  suite:
+    description: 'Test suite to use: base, responses, vision, etc.'
+    required: false
+    default: ''
   inference-mode:
     description: 'Inference mode (record or replay)'
     required: true

@@ -30,13 +30,13 @@ runs:
       client-version: ${{ inputs.client-version }}

     - name: Setup ollama
-      if: ${{ inputs.provider == 'ollama' && inputs.inference-mode == 'record' }}
+      if: ${{ (inputs.setup == 'ollama' || inputs.setup == 'ollama-vision') && inputs.inference-mode == 'record' }}
       uses: ./.github/actions/setup-ollama
       with:
-        run-vision-tests: ${{ inputs.run-vision-tests }}
+        suite: ${{ inputs.suite }}

     - name: Setup vllm
-      if: ${{ inputs.provider == 'vllm' && inputs.inference-mode == 'record' }}
+      if: ${{ inputs.setup == 'vllm' && inputs.inference-mode == 'record' }}
       uses: ./.github/actions/setup-vllm

     - name: Build Llama Stack


@@ -5,10 +5,11 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
 | Name | File | Purpose |
 | ---- | ---- | ------- |
 | Update Changelog | [changelog.yml](changelog.yml) | Creates PR for updating the CHANGELOG.md |
+| API Conformance Tests | [conformance.yml](conformance.yml) | Run the API Conformance test suite on the changes. |
 | Installer CI | [install-script-ci.yml](install-script-ci.yml) | Test the installation script |
 | Integration Auth Tests | [integration-auth-tests.yml](integration-auth-tests.yml) | Run the integration test suite with Kubernetes authentication |
 | SqlStore Integration Tests | [integration-sql-store-tests.yml](integration-sql-store-tests.yml) | Run the integration test suite with SqlStore |
-| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suite from tests/integration in replay mode |
+| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suites from tests/integration in replay mode |
 | Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
 | Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
 | Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |

.github/workflows/conformance.yml (new file)

@@ -0,0 +1,57 @@
# API Conformance Tests
# This workflow ensures that API changes maintain backward compatibility and don't break existing integrations
# It runs schema validation and OpenAPI diff checks to catch breaking changes early
name: API Conformance Tests
run-name: Run the API Conformance test suite on the changes.
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
types: [opened, synchronize, reopened]
paths:
- 'llama_stack/**'
- '!llama_stack/ui/**'
- 'tests/**'
- 'uv.lock'
- 'pyproject.toml'
- '.github/workflows/conformance.yml' # This workflow itself
concurrency:
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
# Cancel in-progress runs when new commits are pushed to avoid wasting CI resources
cancel-in-progress: true
jobs:
# Job to check if API schema changes maintain backward compatibility
check-schema-compatibility:
runs-on: ubuntu-latest
steps:
# Using specific version 4.1.7 because 5.0.0 fails when trying to run this locally using `act`
# This ensures consistent behavior between local testing and CI
- name: Checkout PR Code
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
# Checkout the base branch to compare against (usually main)
# This allows us to diff the current changes against the previous state
- name: Checkout Base Branch
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
ref: ${{ github.event.pull_request.base.ref }}
path: 'base'
# Install oasdiff: https://github.com/oasdiff/oasdiff, a tool for detecting breaking changes in OpenAPI specs.
- name: Install oasdiff
run: |
curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh
# Run oasdiff to detect breaking changes in the API specification
# This step will fail if incompatible changes are detected, preventing breaking changes from being merged
- name: Run OpenAPI Breaking Change Diff
run: |
oasdiff breaking --fail-on ERR base/docs/_static/llama-stack-spec.yaml docs/_static/llama-stack-spec.yaml --match-path '^/v1/openai/v1' \
--match-path '^/v1/vector-io' \
--match-path '^/v1/vector-dbs'
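The same comparison can be reproduced locally before pushing (a sketch that assumes the base branch has been checked out into ./base, mirroring the workflow steps above):

```bash
# Install oasdiff, then diff the base spec against the working-tree spec,
# limited to the same path prefixes the workflow checks.
curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh
oasdiff breaking --fail-on ERR \
  base/docs/_static/llama-stack-spec.yaml \
  docs/_static/llama-stack-spec.yaml \
  --match-path '^/v1/openai/v1' \
  --match-path '^/v1/vector-io' \
  --match-path '^/v1/vector-dbs'
```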


@@ -1,6 +1,6 @@
 name: Integration Tests (Replay)

-run-name: Run the integration test suite from tests/integration in replay mode
+run-name: Run the integration test suites from tests/integration in replay mode

 on:
   push:

@@ -28,18 +28,10 @@ on:
         description: 'Test against both the latest and published versions'
         type: boolean
         default: false
-      test-provider:
-        description: 'Test against a specific provider'
+      test-setup:
+        description: 'Test against a specific setup'
         type: string
         default: 'ollama'
-      test-subdirs:
-        description: 'Comma-separated list of test subdirectories to run'
-        type: string
-        default: ''
-      test-pattern:
-        description: 'Regex pattern to pass to pytest -k'
-        type: string
-        default: ''

 concurrency:
   # Skip concurrency for pushes to main - each commit should be tested independently

@@ -50,18 +42,18 @@ jobs:
   run-replay-mode-tests:
     runs-on: ubuntu-latest
-    name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}
+    name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.setup, matrix.python-version, matrix.client-version, matrix.suite) }}

     strategy:
       fail-fast: false
       matrix:
         client-type: [library, server]
-        # Use vllm on weekly schedule, otherwise use test-provider input (defaults to ollama)
-        provider: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-provider || 'ollama')) }}
+        # Use vllm on weekly schedule, otherwise use test-setup input (defaults to ollama)
+        setup: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-setup || 'ollama')) }}
         # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
         python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
         client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
-        run-vision-tests: [true, false]
+        suite: [base, vision]

     steps:
       - name: Checkout repository

@@ -72,16 +64,14 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
           client-version: ${{ matrix.client-version }}
-          provider: ${{ matrix.provider }}
-          run-vision-tests: ${{ matrix.run-vision-tests }}
+          setup: ${{ matrix.setup }}
+          suite: ${{ matrix.suite }}
           inference-mode: 'replay'

       - name: Run tests
         uses: ./.github/actions/run-and-record-tests
         with:
-          test-subdirs: ${{ inputs.test-subdirs }}
-          test-pattern: ${{ inputs.test-pattern }}
           stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
-          provider: ${{ matrix.provider }}
+          setup: ${{ matrix.setup }}
           inference-mode: 'replay'
-          run-vision-tests: ${{ matrix.run-vision-tests }}
+          suite: ${{ matrix.suite }}


@@ -28,7 +28,7 @@ jobs:
           fetch-depth: ${{ github.actor == 'dependabot[bot]' && 0 || 1 }}

       - name: Set up Python
-        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
         with:
           python-version: '3.12'
           cache: pip

@@ -37,7 +37,7 @@ jobs:
             .pre-commit-config.yaml

       - name: Set up Node.js
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+        uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
         with:
           node-version: '20'
           cache: 'npm'

@@ -48,7 +48,6 @@ jobs:
         working-directory: llama_stack/ui

       - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
-        continue-on-error: true
         env:
           SKIP: no-commit-to-branch
           RUFF_OUTPUT_FORMAT: github


@@ -24,7 +24,7 @@ jobs:
         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install uv
-        uses: astral-sh/setup-uv@4959332f0f014c5280e7eac8b70c90cb574c9f9b # v6.6.0
+        uses: astral-sh/setup-uv@557e51de59eb14aaaba2ed9621916900a91d50c6 # v6.6.1
         with:
           python-version: ${{ matrix.python-version }}
           activate-environment: true


@@ -10,19 +10,19 @@ run-name: Run the integration test suite from tests/integration
 on:
   workflow_dispatch:
     inputs:
-      test-subdirs:
-        description: 'Comma-separated list of test subdirectories to run'
-        type: string
-        default: ''
-      test-provider:
-        description: 'Test against a specific provider'
-        type: string
-        default: 'ollama'
-      run-vision-tests:
-        description: 'Whether to run vision tests'
-        type: boolean
-        default: false
-      test-pattern:
+      test-setup:
+        description: 'Test against a specific setup'
+        type: string
+        default: 'ollama'
+      suite:
+        description: 'Test suite to use: base, responses, vision, etc.'
+        type: string
+        default: ''
+      subdirs:
+        description: 'Comma-separated list of test subdirectories to run; overrides suite'
+        type: string
+        default: ''
+      pattern:
         description: 'Regex pattern to pass to pytest -k'
         type: string
         default: ''

@@ -38,11 +38,11 @@ jobs:
       - name: Echo workflow inputs
         run: |
           echo "::group::Workflow Inputs"
-          echo "test-subdirs: ${{ inputs.test-subdirs }}"
-          echo "test-provider: ${{ inputs.test-provider }}"
-          echo "run-vision-tests: ${{ inputs.run-vision-tests }}"
-          echo "test-pattern: ${{ inputs.test-pattern }}"
           echo "branch: ${{ github.ref_name }}"
+          echo "test-setup: ${{ inputs.test-setup }}"
+          echo "suite: ${{ inputs.suite }}"
+          echo "subdirs: ${{ inputs.subdirs }}"
+          echo "pattern: ${{ inputs.pattern }}"
           echo "::endgroup::"

       - name: Checkout repository

@@ -55,16 +55,16 @@ jobs:
         with:
           python-version: "3.12"  # Use single Python version for recording
           client-version: "latest"
-          provider: ${{ inputs.test-provider || 'ollama' }}
-          run-vision-tests: ${{ inputs.run-vision-tests }}
+          setup: ${{ inputs.test-setup || 'ollama' }}
+          suite: ${{ inputs.suite }}
           inference-mode: 'record'

       - name: Run and record tests
         uses: ./.github/actions/run-and-record-tests
         with:
-          test-pattern: ${{ inputs.test-pattern }}
-          test-subdirs: ${{ inputs.test-subdirs }}
           stack-config: 'server:ci-tests'  # recording must be done with server since more tests are run
-          provider: ${{ inputs.test-provider || 'ollama' }}
+          setup: ${{ inputs.test-setup || 'ollama' }}
           inference-mode: 'record'
-          run-vision-tests: ${{ inputs.run-vision-tests }}
+          suite: ${{ inputs.suite }}
+          subdirs: ${{ inputs.subdirs }}
+          pattern: ${{ inputs.pattern }}
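With the renamed inputs, a recording run can be dispatched from the CLI along these lines (illustrative; the workflow file name is assumed here, and any optional input can be dropped):

```bash
# Hypothetical dispatch of the recording workflow for the vision suite on the ollama setup.
gh workflow run record-integration-tests.yml \
  -f test-setup=ollama \
  -f suite=vision
```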


@@ -24,7 +24,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Stale Action
-        uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
+        uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
         with:
           stale-issue-label: 'stale'
           stale-issue-message: >


@@ -29,7 +29,7 @@ jobs:
         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Setup Node.js
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+        uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
         with:
           node-version: ${{ matrix.node-version }}
           cache: 'npm'

.gitignore

@@ -26,5 +26,7 @@ venv/
 pytest-report.xml
 .coverage
 .python-version
+AGENTS.md
+server.log
 CLAUDE.md
 .claude/


@@ -86,7 +86,7 @@ repos:
         language: python
         pass_filenames: false
         require_serial: true
-        files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
+        files: ^llama_stack/distributions/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
       - id: provider-codegen
         name: Provider Codegen
         additional_dependencies:
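Since the `files:` pattern decides when the codegen hook fires, a quick local check after touching anything under llama_stack/distributions/ is to run the hooks over the whole tree (a generic invocation; the specific hook ids live in .pre-commit-config.yaml):

```bash
# Run every configured hook against all files (slower, but exercises the new pattern).
pre-commit run --all-files
```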


@@ -1,5 +1,103 @@
# Changelog
# v0.2.20
Published on: 2025-08-29T22:25:32Z
Here are some key changes that are coming as part of this release.
### Build and Environment
- Environment improvements: fixed env var replacement to preserve types.
- Docker stability: fixed container startup failures for Fireworks AI provider.
- Removed absolute paths in build for better portability.
### Features
- UI Enhancements: Implemented file upload and VectorDB creation/configuration directly in UI.
- Vector Store Improvements: Added keyword, vector, and hybrid search inside vector store.
- Added S3 authorization support for file providers.
- SQL Store: Added inequality support to where clause.
### Documentation
- Fixed post-training docs.
- Added Contributor Guidelines for creating Internal vs. External providers.
### Fixes
- Removed unsupported bfcl scoring function.
- Multiple reliability and configuration fixes for providers and environment handling.
### Engineering / Chores
- Cleaner internal development setup with consistent paths.
- Incremental improvements to provider integration and vector store behavior.
### New Contributors
- @omertuc made their first contribution in #3270
- @r3v5 made their first contribution in vector store hybrid search
---
# v0.2.19
Published on: 2025-08-26T22:06:55Z
## Highlights
* feat: Add CORS configuration support for server by @skamenan7 in https://github.com/llamastack/llama-stack/pull/3201
* feat(api): introduce /rerank by @ehhuang in https://github.com/llamastack/llama-stack/pull/2940
* feat: Add S3 Files Provider by @mattf in https://github.com/llamastack/llama-stack/pull/3202
---
# v0.2.18
Published on: 2025-08-20T01:09:27Z
## Highlights
* Add moderations create API
* Hybrid search in Milvus
* Numerous Responses API improvements
* Documentation updates
---
# v0.2.17
Published on: 2025-08-05T01:51:14Z
## Highlights
* feat(tests): introduce inference record/replay to increase test reliability by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2941
* fix(library_client): improve initialization error handling and prevent AttributeError by @mattf in https://github.com/meta-llama/llama-stack/pull/2944
* fix: use OLLAMA_URL to activate Ollama provider in starter by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2963
* feat(UI): adding MVP playground UI by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/2828
* Standardization of errors (@nathan-weinberg)
* feat: Enable DPO training with HuggingFace inline provider by @Nehanth in https://github.com/meta-llama/llama-stack/pull/2825
* chore: rename templates to distributions by @ashwinb in https://github.com/meta-llama/llama-stack/pull/3035
---
# v0.2.16
Published on: 2025-07-28T23:35:23Z
## Highlights
* Automatic model registration for self-hosted providers (ollama and vllm currently). No need for `INFERENCE_MODEL` environment variables which need to be updated, etc.
* Much simplified starter distribution. Most `ENABLE_` env variables are now gone. When you set `VLLM_URL`, the `vllm` provider is auto-enabled. Similar for `MILVUS_URL`, `PGVECTOR_DB`, etc. Check the [run.yaml](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/templates/starter/run.yaml) for more details.
* All tests migrated to pytest now (thanks @Elbehery)
* DPO implementation in the post-training provider (thanks @Nehanth)
* (Huge!) Support for external APIs and providers thereof (thanks @leseb, @cdoern and others). This is a really big deal -- you can now add more APIs completely out of tree and experiment with them before (optionally) wanting to contribute back.
* `inline::vllm` provider is gone thank you very much
* several improvements to OpenAI inference implementations and LiteLLM backend (thanks @mattf)
* Chroma now supports Vector Store API (thanks @franciscojavierarceo).
* Authorization improvements: Vector Store/File APIs now supports access control (thanks @franciscojavierarceo); Telemetry read APIs are gated according to logged-in user's roles.
---
# v0.2.15
Published on: 2025-07-16T03:30:01Z


@@ -34,13 +34,12 @@ This data enables data-driven architectural decisions and performance optimizati
 **1. Deploy base k8s infrastructure:**
 ```bash
-cd ../k8s
+cd ../../docs/source/distributions/k8s
 ./apply.sh
 ```

 **2. Deploy benchmark components:**
 ```bash
-cd ../k8s-benchmark
 ./apply.sh
 ```

@@ -56,7 +55,6 @@ kubectl get pods
 **Benchmark Llama Stack (default):**
 ```bash
-cd docs/source/distributions/k8s-benchmark/
 ./run-benchmark.sh
 ```


@@ -14,7 +14,7 @@ import os
 import random
 import statistics
 import time
-from typing import Tuple

 import aiohttp

@@ -56,18 +56,10 @@ class BenchmarkStats:
         total_time = self.end_time - self.start_time
         success_rate = (self.success_count / self.total_requests) * 100

-        print(f"\n{'='*60}")
-        print(f"BENCHMARK RESULTS")
-        print(f"{'='*60}")
-        print(f"Total time: {total_time:.2f}s")
-        print(f"Concurrent users: {self.concurrent_users}")
-        print(f"Total requests: {self.total_requests}")
-        print(f"Successful requests: {self.success_count}")
-        print(f"Failed requests: {len(self.errors)}")
-        print(f"Success rate: {success_rate:.1f}%")
-        print(f"Requests per second: {self.success_count / total_time:.2f}")
+        print(f"\n{'=' * 60}")
+        print("BENCHMARK RESULTS")

-        print(f"\nResponse Time Statistics:")
+        print("\nResponse Time Statistics:")
         print(f"  Mean: {statistics.mean(self.response_times):.3f}s")
         print(f"  Median: {statistics.median(self.response_times):.3f}s")
         print(f"  Min: {min(self.response_times):.3f}s")

@@ -78,14 +70,14 @@ class BenchmarkStats:
             percentiles = [50, 90, 95, 99]
             sorted_times = sorted(self.response_times)
-            print(f"\nPercentiles:")
+            print("\nPercentiles:")
             for p in percentiles:
                 idx = int(len(sorted_times) * p / 100) - 1
                 idx = max(0, min(idx, len(sorted_times) - 1))
                 print(f"  P{p}: {sorted_times[idx]:.3f}s")

         if self.ttft_times:
-            print(f"\nTime to First Token (TTFT) Statistics:")
+            print("\nTime to First Token (TTFT) Statistics:")
             print(f"  Mean: {statistics.mean(self.ttft_times):.3f}s")
             print(f"  Median: {statistics.median(self.ttft_times):.3f}s")
             print(f"  Min: {min(self.ttft_times):.3f}s")

@@ -95,26 +87,35 @@ class BenchmarkStats:
             print(f"  Std Dev: {statistics.stdev(self.ttft_times):.3f}s")

             sorted_ttft = sorted(self.ttft_times)
-            print(f"\nTTFT Percentiles:")
+            print("\nTTFT Percentiles:")
             for p in percentiles:
                 idx = int(len(sorted_ttft) * p / 100) - 1
                 idx = max(0, min(idx, len(sorted_ttft) - 1))
                 print(f"  P{p}: {sorted_ttft[idx]:.3f}s")

         if self.chunks_received:
-            print(f"\nStreaming Statistics:")
+            print("\nStreaming Statistics:")
             print(f"  Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
             print(f"  Total chunks received: {sum(self.chunks_received)}")

+        print(f"{'=' * 60}")
+        print(f"Total time: {total_time:.2f}s")
+        print(f"Concurrent users: {self.concurrent_users}")
+        print(f"Total requests: {self.total_requests}")
+        print(f"Successful requests: {self.success_count}")
+        print(f"Failed requests: {len(self.errors)}")
+        print(f"Success rate: {success_rate:.1f}%")
+        print(f"Requests per second: {self.success_count / total_time:.2f}")
+
         if self.errors:
-            print(f"\nErrors (showing first 5):")
+            print("\nErrors (showing first 5):")
             for error in self.errors[:5]:
                 print(f"  {error}")


 class LlamaStackBenchmark:
     def __init__(self, base_url: str, model_id: str):
-        self.base_url = base_url.rstrip('/')
+        self.base_url = base_url.rstrip("/")
         self.model_id = model_id
         self.headers = {"Content-Type": "application/json"}
         self.test_messages = [

@@ -125,20 +126,14 @@ class LlamaStackBenchmark:
             [
                 {"role": "user", "content": "What is machine learning?"},
                 {"role": "assistant", "content": "Machine learning is a subset of AI..."},
-                {"role": "user", "content": "Can you give me a practical example?"}
-            ]
+                {"role": "user", "content": "Can you give me a practical example?"},
+            ],
         ]

-    async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
+    async def make_async_streaming_request(self) -> tuple[float, int, float | None, str | None]:
         """Make a single async streaming chat completion request."""
         messages = random.choice(self.test_messages)
-        payload = {
-            "model": self.model_id,
-            "messages": messages,
-            "stream": True,
-            "max_tokens": 100
-        }
+        payload = {"model": self.model_id, "messages": messages, "stream": True, "max_tokens": 100}

         start_time = time.time()
         chunks_received = 0

@@ -152,17 +147,17 @@ class LlamaStackBenchmark:
                     f"{self.base_url}/chat/completions",
                     headers=self.headers,
                     json=payload,
-                    timeout=aiohttp.ClientTimeout(total=30)
+                    timeout=aiohttp.ClientTimeout(total=30),
                 ) as response:
                     if response.status == 200:
                         async for line in response.content:
                             if line:
-                                line_str = line.decode('utf-8').strip()
-                                if line_str.startswith('data: '):
+                                line_str = line.decode("utf-8").strip()
+                                if line_str.startswith("data: "):
                                     chunks_received += 1
                                     if ttft is None:
                                         ttft = time.time() - start_time
-                                    if line_str == 'data: [DONE]':
+                                    if line_str == "data: [DONE]":
                                         break

                         if chunks_received == 0:

@@ -179,7 +174,6 @@ class LlamaStackBenchmark:
         response_time = time.time() - start_time
         return response_time, chunks_received, ttft, error

-
     async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
         """Run benchmark using async requests for specified duration."""
         stats = BenchmarkStats()

@@ -191,7 +185,7 @@ class LlamaStackBenchmark:
         print(f"Model: {self.model_id}")

         connector = aiohttp.TCPConnector(limit=concurrent_users)
-        async with aiohttp.ClientSession(connector=connector) as session:
+        async with aiohttp.ClientSession(connector=connector):

             async def worker(worker_id: int):
                 """Worker that sends requests sequentially until canceled."""

@@ -215,7 +209,9 @@ class LlamaStackBenchmark:
                     await asyncio.sleep(1)  # Report every second
                     if time.time() >= last_report_time + 10:  # Report every 10 seconds
                         elapsed = time.time() - stats.start_time
-                        print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
+                        print(
+                            f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}"
+                        )
                         last_report_time = time.time()
                 except asyncio.CancelledError:
                     break

@@ -240,14 +236,16 @@ class LlamaStackBenchmark:
 def main():
     parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
-    parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
-                        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
-    parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
-                        help="Model ID to use for requests")
-    parser.add_argument("--duration", type=int, default=60,
-                        help="Duration in seconds to run benchmark (default: 60)")
-    parser.add_argument("--concurrent", type=int, default=10,
-                        help="Number of concurrent users (default: 10)")
+    parser.add_argument(
+        "--base-url",
+        default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
+        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)",
+    )
+    parser.add_argument(
+        "--model", default=os.getenv("INFERENCE_MODEL", "test-model"), help="Model ID to use for requests"
+    )
+    parser.add_argument("--duration", type=int, default=60, help="Duration in seconds to run benchmark (default: 60)")
+    parser.add_argument("--concurrent", type=int, default=10, help="Number of concurrent users (default: 10)")

     args = parser.parse_args()
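The CLI surface is unchanged by the reformat, so a run still looks roughly like this (a sketch; the script file name is assumed, and BENCHMARK_BASE_URL / INFERENCE_MODEL are honored as env-var fallbacks):

```bash
# Hypothetical invocation of the benchmark script shown above.
python benchmark.py \
  --base-url http://localhost:8000/v1/openai/v1 \
  --model "${INFERENCE_MODEL:-test-model}" \
  --duration 60 \
  --concurrent 10
```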


@@ -11,16 +11,18 @@ OpenAI-compatible mock server that returns:
 - Valid OpenAI-formatted chat completion responses with dynamic content
 """

-from flask import Flask, request, jsonify, Response
-import time
-import random
-import uuid
-import json
 import argparse
+import json
 import os
+import random
+import time
+import uuid
+
+from flask import Flask, Response, jsonify, request

 app = Flask(__name__)

 # Models from environment variables
 def get_models():
     models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")

@@ -29,40 +31,72 @@ def get_models():
     return {
         "object": "list",
         "data": [
-            {
-                "id": model_id,
-                "object": "model",
-                "created": 1234567890,
-                "owned_by": "vllm"
-            }
-            for model_id in model_ids
-        ]
+            {"id": model_id, "object": "model", "created": 1234567890, "owned_by": "vllm"} for model_id in model_ids
+        ],
     }


 def generate_random_text(length=50):
     """Generate random but coherent text for responses."""
     words = [
-        "Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
-        "with", "your", "questions", "and", "tasks", "today", "Let", "me","know", "what",
-        "you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
-        "with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more"
+        "Hello",
+        "there",
+        "I'm",
+        "an",
+        "AI",
+        "assistant",
+        "ready",
+        "to",
+        "help",
+        "you",
+        "with",
+        "your",
+        "questions",
+        "and",
+        "tasks",
+        "today",
+        "Let",
+        "me",
+        "know",
+        "what",
+        "you'd",
+        "like",
+        "to",
+        "discuss",
+        "or",
+        "explore",
+        "together",
+        "I",
+        "can",
+        "assist",
+        "with",
+        "various",
+        "topics",
+        "including",
+        "coding",
+        "writing",
+        "analysis",
+        "and",
+        "more",
     ]
     return " ".join(random.choices(words, k=length))


-@app.route('/v1/models', methods=['GET'])
+@app.route("/v1/models", methods=["GET"])
 def list_models():
     models = get_models()
     print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
     return jsonify(models)


-@app.route('/v1/chat/completions', methods=['POST'])
+@app.route("/v1/chat/completions", methods=["POST"])
 def chat_completions():
     """Return OpenAI-formatted chat completion responses."""
     data = request.get_json()
-    default_model = get_models()['data'][0]['id']
-    model = data.get('model', default_model)
-    messages = data.get('messages', [])
-    stream = data.get('stream', False)
+    default_model = get_models()["data"][0]["id"]
+    model = data.get("model", default_model)
+    messages = data.get("messages", [])
+    stream = data.get("stream", False)

     print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")

@@ -71,11 +105,12 @@ def chat_completions():
     else:
         return handle_non_streaming_completion(model, messages)


 def handle_non_streaming_completion(model, messages):
     response_text = generate_random_text(random.randint(20, 80))

     # Calculate realistic token counts
-    prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages)
+    prompt_tokens = sum(len(str(msg.get("content", "")).split()) for msg in messages)
     completion_tokens = len(response_text.split())

     response = {

@@ -83,25 +118,17 @@ def handle_non_streaming_completion(model, messages):
         "object": "chat.completion",
         "created": int(time.time()),
         "model": model,
-        "choices": [
-            {
-                "index": 0,
-                "message": {
-                    "role": "assistant",
-                    "content": response_text
-                },
-                "finish_reason": "stop"
-            }
-        ],
+        "choices": [{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}],
         "usage": {
             "prompt_tokens": prompt_tokens,
             "completion_tokens": completion_tokens,
-            "total_tokens": prompt_tokens + completion_tokens
-        }
+            "total_tokens": prompt_tokens + completion_tokens,
+        },
     }

     return jsonify(response)


 def handle_streaming_completion(model, messages):
     def generate_stream():
         # Generate response text

@@ -114,12 +141,7 @@ def handle_streaming_completion(model, messages):
             "object": "chat.completion.chunk",
             "created": int(time.time()),
             "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {"role": "assistant", "content": ""}
-                }
-            ]
+            "choices": [{"index": 0, "delta": {"role": "assistant", "content": ""}}],
         }
         yield f"data: {json.dumps(initial_chunk)}\n\n"

@@ -130,12 +152,7 @@ def handle_streaming_completion(model, messages):
                 "object": "chat.completion.chunk",
                 "created": int(time.time()),
                 "model": model,
-                "choices": [
-                    {
-                        "index": 0,
-                        "delta": {"content": f"{word} " if i < len(words) - 1 else word}
-                    }
-                ]
+                "choices": [{"index": 0, "delta": {"content": f"{word} " if i < len(words) - 1 else word}}],
             }
             yield f"data: {json.dumps(chunk)}\n\n"
             # Configurable delay to simulate realistic streaming

@@ -148,35 +165,30 @@ def handle_streaming_completion(model, messages):
             "object": "chat.completion.chunk",
             "created": int(time.time()),
             "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {"content": ""},
-                    "finish_reason": "stop"
-                }
-            ]
+            "choices": [{"index": 0, "delta": {"content": ""}, "finish_reason": "stop"}],
         }
         yield f"data: {json.dumps(final_chunk)}\n\n"
         yield "data: [DONE]\n\n"

     return Response(
         generate_stream(),
-        mimetype='text/event-stream',
+        mimetype="text/event-stream",
         headers={
-            'Cache-Control': 'no-cache',
-            'Connection': 'keep-alive',
-            'Access-Control-Allow-Origin': '*',
-        }
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "Access-Control-Allow-Origin": "*",
+        },
     )


-@app.route('/health', methods=['GET'])
+@app.route("/health", methods=["GET"])
 def health():
     return jsonify({"status": "healthy", "type": "openai-mock"})


-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='OpenAI-compatible mock server')
-    parser.add_argument('--port', type=int, default=8081,
-                        help='Port to run the server on (default: 8081)')
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="OpenAI-compatible mock server")
+    parser.add_argument("--port", type=int, default=8081, help="Port to run the server on (default: 8081)")
     args = parser.parse_args()
     port = args.port

@@ -187,4 +199,4 @@ if __name__ == '__main__':
     print("- OpenAI-formatted chat/completion responses with dynamic content")
     print("- Streaming support with valid SSE format")
     print(f"- Listening on: http://0.0.0.0:{port}")
-    app.run(host='0.0.0.0', port=port, debug=False)
+    app.run(host="0.0.0.0", port=port, debug=False)
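A quick smoke test of the mock server after these formatting changes (assumes it was started with the default --port 8081; the model id shown is the MOCK_MODELS default):

```bash
curl -s http://localhost:8081/health
curl -s http://localhost:8081/v1/models
curl -s http://localhost:8081/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "meta-llama/Llama-3.2-3B-Instruct", "messages": [{"role": "user", "content": "Hello"}], "stream": false}'
```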


@@ -6,6 +6,7 @@ data:
     apis:
     - agents
     - inference
+    - files
     - safety
     - telemetry
     - tool_runtime

@@ -19,13 +20,6 @@ data:
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-      - provider_id: vllm-safety
-        provider_type: remote::vllm
-        config:
-          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
-          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-          api_token: ${env.VLLM_API_TOKEN:=fake}
-          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: sentence-transformers
        provider_type: inline::sentence-transformers
        config: {}

@@ -41,6 +35,14 @@ data:
          db: ${env.POSTGRES_DB:=llamastack}
          user: ${env.POSTGRES_USER:=llamastack}
          password: ${env.POSTGRES_PASSWORD:=llamastack}
+      files:
+      - provider_id: meta-reference-files
+        provider_type: inline::localfs
+        config:
+          storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+          metadata_store:
+            type: sqlite
+            db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
      safety:
      - provider_id: llama-guard
        provider_type: inline::llama-guard

@@ -111,9 +113,6 @@ data:
    - model_id: ${env.INFERENCE_MODEL}
      provider_id: vllm-inference
      model_type: llm
-    - model_id: ${env.SAFETY_MODEL}
-      provider_id: vllm-safety
-      model_type: llm
    shields:
    - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
    vector_dbs: []


@@ -2,7 +2,10 @@ version: '2'
 image_name: kubernetes-benchmark-demo
 apis:
 - agents
+- files
 - inference
+- files
+- safety
 - telemetry
 - tool_runtime
 - vector_io

@@ -18,6 +21,14 @@ providers:
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
   vector_io:
   - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
     provider_type: remote::chromadb

@@ -30,6 +41,19 @@ providers:
       db: ${env.POSTGRES_DB:=llamastack}
       user: ${env.POSTGRES_USER:=llamastack}
       password: ${env.POSTGRES_PASSWORD:=llamastack}
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
   agents:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -95,6 +119,8 @@ models:
 - model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm-inference
   model_type: llm
+shields:
+- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
 vector_dbs: []
 datasets: []
 scoring_fns: []


@@ -1,5 +1,106 @@
@import url("theme.css");
/* Horizontal Navigation Bar */
.horizontal-nav {
background-color: #ffffff;
border-bottom: 1px solid #e5e5e5;
padding: 0;
position: fixed;
top: 0;
left: 0;
right: 0;
z-index: 1050;
height: 50px;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
[data-theme="dark"] .horizontal-nav {
background-color: #1a1a1a;
border-bottom: 1px solid #333;
}
.horizontal-nav .nav-container {
max-width: 1200px;
margin: 0 auto;
display: flex;
align-items: center;
justify-content: space-between;
padding: 0 20px;
height: 100%;
}
.horizontal-nav .nav-brand {
font-size: 18px;
font-weight: 600;
color: #333;
text-decoration: none;
}
[data-theme="dark"] .horizontal-nav .nav-brand {
color: #fff;
}
.horizontal-nav .nav-links {
display: flex;
align-items: center;
gap: 30px;
list-style: none;
margin: 0;
padding: 0;
}
.horizontal-nav .nav-links a {
color: #666;
text-decoration: none;
font-size: 14px;
font-weight: 500;
padding: 8px 12px;
border-radius: 6px;
transition: all 0.2s ease;
}
.horizontal-nav .nav-links a:hover,
.horizontal-nav .nav-links a.active {
color: #333;
background-color: #f5f5f5;
}
.horizontal-nav .nav-links a.active {
font-weight: 600;
}
[data-theme="dark"] .horizontal-nav .nav-links a {
color: #ccc;
}
[data-theme="dark"] .horizontal-nav .nav-links a:hover,
[data-theme="dark"] .horizontal-nav .nav-links a.active {
color: #fff;
background-color: #333;
}
.horizontal-nav .nav-links .github-link {
display: flex;
align-items: center;
gap: 6px;
}
.horizontal-nav .nav-links .github-icon {
width: 16px;
height: 16px;
fill: currentColor;
}
/* Adjust main content to account for fixed nav */
.wy-nav-side {
top: 50px;
height: calc(100vh - 50px);
}
.wy-nav-content-wrap {
margin-top: 50px;
}
.wy-nav-content {
  max-width: 90%;
}

docs/_static/js/horizontal_nav.js (new file)

@@ -0,0 +1,44 @@
// Horizontal Navigation Bar for Llama Stack Documentation
document.addEventListener('DOMContentLoaded', function() {
// Create the horizontal navigation HTML
const navHTML = `
<nav class="horizontal-nav">
<div class="nav-container">
<a href="/" class="nav-brand">Llama Stack</a>
<ul class="nav-links">
<li><a href="/">Docs</a></li>
<li><a href="/references/api_reference/">API Reference</a></li>
<li><a href="https://github.com/meta-llama/llama-stack" target="_blank" class="github-link">
<svg class="github-icon" viewBox="0 0 16 16" aria-hidden="true">
<path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"/>
</svg>
GitHub
</a></li>
</ul>
</div>
</nav>
`;
// Insert the navigation at the beginning of the body
document.body.insertAdjacentHTML('afterbegin', navHTML);
// Update navigation links based on current page
updateActiveNav();
});
function updateActiveNav() {
const currentPath = window.location.pathname;
const navLinks = document.querySelectorAll('.horizontal-nav .nav-links a');
navLinks.forEach(link => {
// Remove any existing active classes
link.classList.remove('active');
// Add active class based on current path
if (currentPath === '/' && link.getAttribute('href') === '/') {
link.classList.add('active');
} else if (currentPath.includes('/references/api_reference/') && link.getAttribute('href').includes('api_reference')) {
link.classList.add('active');
}
});
}


@@ -633,6 +633,80 @@
}
}
},
"/v1/prompts": {
"get": {
"responses": {
"200": {
"description": "A ListPromptsResponse containing all prompts.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ListPromptsResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Prompts"
],
"description": "List all prompts.",
"parameters": []
},
"post": {
"responses": {
"200": {
"description": "The created Prompt resource.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Prompt"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Prompts"
],
"description": "Create a new prompt.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CreatePromptRequest"
}
}
},
"required": true
}
}
},
"/v1/agents/{agent_id}": { "/v1/agents/{agent_id}": {
"get": { "get": {
"responses": { "responses": {
@ -901,6 +975,143 @@
] ]
} }
}, },
"/v1/prompts/{prompt_id}": {
"get": {
"responses": {
"200": {
"description": "A Prompt resource.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Prompt"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Prompts"
],
"description": "Get a prompt by its identifier and optional version.",
"parameters": [
{
"name": "prompt_id",
"in": "path",
"description": "The identifier of the prompt to get.",
"required": true,
"schema": {
"type": "string"
}
},
{
"name": "version",
"in": "query",
"description": "The version of the prompt to get (defaults to latest).",
"required": false,
"schema": {
"type": "integer"
}
}
]
},
"post": {
"responses": {
"200": {
"description": "The updated Prompt resource with incremented version.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Prompt"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Prompts"
],
"description": "Update an existing prompt (increments version).",
"parameters": [
{
"name": "prompt_id",
"in": "path",
"description": "The identifier of the prompt to update.",
"required": true,
"schema": {
"type": "string"
}
}
],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/UpdatePromptRequest"
}
}
},
"required": true
}
},
"delete": {
"responses": {
"200": {
"description": "OK"
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Prompts"
],
"description": "Delete a prompt.",
"parameters": [
{
"name": "prompt_id",
"in": "path",
"description": "The identifier of the prompt to delete.",
"required": true,
"schema": {
"type": "string"
}
}
]
}
},
"/v1/inference/embeddings": { "/v1/inference/embeddings": {
"post": { "post": {
"responses": { "responses": {
@ -2836,6 +3047,49 @@
] ]
} }
}, },
"/v1/prompts/{prompt_id}/versions": {
"get": {
"responses": {
"200": {
"description": "A ListPromptsResponse containing all versions of the prompt.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ListPromptsResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Prompts"
],
"description": "List all versions of a specific prompt.",
"parameters": [
{
"name": "prompt_id",
"in": "path",
"description": "The identifier of the prompt to list versions for.",
"required": true,
"schema": {
"type": "string"
}
}
]
}
},
"/v1/providers": { "/v1/providers": {
"get": { "get": {
"responses": { "responses": {
@ -5007,6 +5261,59 @@
} }
} }
}, },
"/v1/prompts/{prompt_id}/set-default-version": {
"post": {
"responses": {
"200": {
"description": "The prompt with the specified version now set as default.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Prompt"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Prompts"
],
"description": "Set which version of a prompt should be the default in get_prompt (latest).",
"parameters": [
{
"name": "prompt_id",
"in": "path",
"description": "The identifier of the prompt.",
"required": true,
"schema": {
"type": "string"
}
}
],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SetDefaultVersionRequest"
}
}
},
"required": true
}
}
},
"/v1/post-training/supervised-fine-tune": { "/v1/post-training/supervised-fine-tune": {
"post": { "post": {
"responses": { "responses": {
@ -9670,6 +9977,65 @@
], ],
"title": "OpenAIResponseObjectStreamResponseWebSearchCallSearching" "title": "OpenAIResponseObjectStreamResponseWebSearchCallSearching"
}, },
"CreatePromptRequest": {
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "The prompt text content with variable placeholders."
},
"variables": {
"type": "array",
"items": {
"type": "string"
},
"description": "List of variable names that can be used in the prompt template."
}
},
"additionalProperties": false,
"required": [
"prompt"
],
"title": "CreatePromptRequest"
},
"Prompt": {
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "The system prompt text with variable placeholders. Variables are only supported when using the Responses API."
},
"version": {
"type": "integer",
"description": "Version (integer starting at 1, incremented on save)"
},
"prompt_id": {
"type": "string",
"description": "Unique identifier formatted as 'pmpt_<48-digit-hash>'"
},
"variables": {
"type": "array",
"items": {
"type": "string"
},
"description": "List of prompt variable names that can be used in the prompt template"
},
"is_default": {
"type": "boolean",
"default": false,
"description": "Boolean indicating whether this version is the default version for this prompt"
}
},
"additionalProperties": false,
"required": [
"version",
"prompt_id",
"variables",
"is_default"
],
"title": "Prompt",
"description": "A prompt resource representing a stored OpenAI Compatible prompt template in Llama Stack."
},
"OpenAIDeleteResponseObject": { "OpenAIDeleteResponseObject": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -10296,7 +10662,8 @@
"scoring_function", "scoring_function",
"benchmark", "benchmark",
"tool", "tool",
"tool_group" "tool_group",
"prompt"
], ],
"const": "benchmark", "const": "benchmark",
"default": "benchmark", "default": "benchmark",
@ -10923,7 +11290,8 @@
"scoring_function", "scoring_function",
"benchmark", "benchmark",
"tool", "tool",
"tool_group" "tool_group",
"prompt"
], ],
"const": "dataset", "const": "dataset",
"default": "dataset", "default": "dataset",
@ -11073,7 +11441,8 @@
"scoring_function", "scoring_function",
"benchmark", "benchmark",
"tool", "tool",
"tool_group" "tool_group",
"prompt"
], ],
"const": "model", "const": "model",
"default": "model", "default": "model",
@ -11338,7 +11707,8 @@
"scoring_function", "scoring_function",
"benchmark", "benchmark",
"tool", "tool",
"tool_group" "tool_group",
"prompt"
], ],
"const": "scoring_function", "const": "scoring_function",
"default": "scoring_function", "default": "scoring_function",
@ -11446,7 +11816,8 @@
"scoring_function", "scoring_function",
"benchmark", "benchmark",
"tool", "tool",
"tool_group" "tool_group",
"prompt"
], ],
"const": "shield", "const": "shield",
"default": "shield", "default": "shield",
@ -11691,7 +12062,8 @@
"scoring_function", "scoring_function",
"benchmark", "benchmark",
"tool", "tool",
"tool_group" "tool_group",
"prompt"
], ],
"const": "tool", "const": "tool",
"default": "tool", "default": "tool",
@ -11773,7 +12145,8 @@
"scoring_function", "scoring_function",
"benchmark", "benchmark",
"tool", "tool",
"tool_group" "tool_group",
"prompt"
], ],
"const": "tool_group", "const": "tool_group",
"default": "tool_group", "default": "tool_group",
@ -12067,7 +12440,8 @@
"scoring_function", "scoring_function",
"benchmark", "benchmark",
"tool", "tool",
"tool_group" "tool_group",
"prompt"
], ],
"const": "vector_db", "const": "vector_db",
"default": "vector_db", "default": "vector_db",
@ -12882,6 +13256,23 @@
"title": "OpenAIResponseObjectWithInput", "title": "OpenAIResponseObjectWithInput",
"description": "OpenAI response object extended with input context information." "description": "OpenAI response object extended with input context information."
}, },
"ListPromptsResponse": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Prompt"
}
}
},
"additionalProperties": false,
"required": [
"data"
],
"title": "ListPromptsResponse",
"description": "Response model to list prompts."
},
"ListProvidersResponse": { "ListProvidersResponse": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -17129,6 +17520,20 @@
"title": "ScoreBatchResponse", "title": "ScoreBatchResponse",
"description": "Response from batch scoring operations on datasets." "description": "Response from batch scoring operations on datasets."
}, },
"SetDefaultVersionRequest": {
"type": "object",
"properties": {
"version": {
"type": "integer",
"description": "The version to set as default."
}
},
"additionalProperties": false,
"required": [
"version"
],
"title": "SetDefaultVersionRequest"
},
"AlgorithmConfig": { "AlgorithmConfig": {
"oneOf": [ "oneOf": [
{ {
@ -17413,6 +17818,37 @@
"title": "SyntheticDataGenerationResponse", "title": "SyntheticDataGenerationResponse",
"description": "Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold." "description": "Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold."
}, },
"UpdatePromptRequest": {
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "The updated prompt text content."
},
"version": {
"type": "integer",
"description": "The current version of the prompt being updated."
},
"variables": {
"type": "array",
"items": {
"type": "string"
},
"description": "Updated list of variable names that can be used in the prompt template."
},
"set_as_default": {
"type": "boolean",
"description": "Set the new version as the default (default=True)."
}
},
"additionalProperties": false,
"required": [
"prompt",
"version",
"set_as_default"
],
"title": "UpdatePromptRequest"
},
"VersionInfo": { "VersionInfo": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -17538,6 +17974,10 @@
{ {
"name": "PostTraining (Coming Soon)" "name": "PostTraining (Coming Soon)"
}, },
{
"name": "Prompts",
"x-displayName": "Protocol for prompt management operations."
},
{ {
"name": "Providers", "name": "Providers",
"x-displayName": "Providers API for inspecting, listing, and modifying providers and their configurations." "x-displayName": "Providers API for inspecting, listing, and modifying providers and their configurations."
@ -17588,6 +18028,7 @@
"Inspect", "Inspect",
"Models", "Models",
"PostTraining (Coming Soon)", "PostTraining (Coming Soon)",
"Prompts",
"Providers", "Providers",
"Safety", "Safety",
"Scoring", "Scoring",

View file

@ -427,6 +427,58 @@ paths:
schema: schema:
$ref: '#/components/schemas/CreateOpenaiResponseRequest' $ref: '#/components/schemas/CreateOpenaiResponseRequest'
required: true required: true
/v1/prompts:
get:
responses:
'200':
description: >-
A ListPromptsResponse containing all prompts.
content:
application/json:
schema:
$ref: '#/components/schemas/ListPromptsResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
description: List all prompts.
parameters: []
post:
responses:
'200':
description: The created Prompt resource.
content:
application/json:
schema:
$ref: '#/components/schemas/Prompt'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
description: Create a new prompt.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/CreatePromptRequest'
required: true
/v1/agents/{agent_id}: /v1/agents/{agent_id}:
get: get:
responses: responses:
@ -616,6 +668,103 @@ paths:
required: true required: true
schema: schema:
type: string type: string
/v1/prompts/{prompt_id}:
get:
responses:
'200':
description: A Prompt resource.
content:
application/json:
schema:
$ref: '#/components/schemas/Prompt'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
description: >-
Get a prompt by its identifier and optional version.
parameters:
- name: prompt_id
in: path
description: The identifier of the prompt to get.
required: true
schema:
type: string
- name: version
in: query
description: >-
The version of the prompt to get (defaults to latest).
required: false
schema:
type: integer
post:
responses:
'200':
description: >-
The updated Prompt resource with incremented version.
content:
application/json:
schema:
$ref: '#/components/schemas/Prompt'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
description: >-
Update an existing prompt (increments version).
parameters:
- name: prompt_id
in: path
description: The identifier of the prompt to update.
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/UpdatePromptRequest'
required: true
delete:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
description: Delete a prompt.
parameters:
- name: prompt_id
in: path
description: The identifier of the prompt to delete.
required: true
schema:
type: string
/v1/inference/embeddings: /v1/inference/embeddings:
post: post:
responses: responses:
@ -1983,6 +2132,37 @@ paths:
required: false required: false
schema: schema:
$ref: '#/components/schemas/Order' $ref: '#/components/schemas/Order'
/v1/prompts/{prompt_id}/versions:
get:
responses:
'200':
description: >-
A ListPromptsResponse containing all versions of the prompt.
content:
application/json:
schema:
$ref: '#/components/schemas/ListPromptsResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
description: List all versions of a specific prompt.
parameters:
- name: prompt_id
in: path
description: >-
The identifier of the prompt to list versions for.
required: true
schema:
type: string
/v1/providers: /v1/providers:
get: get:
responses: responses:
@ -3546,6 +3726,43 @@ paths:
schema: schema:
$ref: '#/components/schemas/ScoreBatchRequest' $ref: '#/components/schemas/ScoreBatchRequest'
required: true required: true
/v1/prompts/{prompt_id}/set-default-version:
post:
responses:
'200':
description: >-
The prompt with the specified version now set as default.
content:
application/json:
schema:
$ref: '#/components/schemas/Prompt'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
description: >-
Set which version of a prompt should be the default in get_prompt (defaults to latest).
parameters:
- name: prompt_id
in: path
description: The identifier of the prompt.
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/SetDefaultVersionRequest'
required: true
/v1/post-training/supervised-fine-tune: /v1/post-training/supervised-fine-tune:
post: post:
responses: responses:
@ -7148,6 +7365,61 @@ components:
- type - type
title: >- title: >-
OpenAIResponseObjectStreamResponseWebSearchCallSearching OpenAIResponseObjectStreamResponseWebSearchCallSearching
CreatePromptRequest:
type: object
properties:
prompt:
type: string
description: >-
The prompt text content with variable placeholders.
variables:
type: array
items:
type: string
description: >-
List of variable names that can be used in the prompt template.
additionalProperties: false
required:
- prompt
title: CreatePromptRequest
Prompt:
type: object
properties:
prompt:
type: string
description: >-
The system prompt text with variable placeholders. Variables are only
supported when using the Responses API.
version:
type: integer
description: >-
Version (integer starting at 1, incremented on save)
prompt_id:
type: string
description: >-
Unique identifier formatted as 'pmpt_<48-digit-hash>'
variables:
type: array
items:
type: string
description: >-
List of prompt variable names that can be used in the prompt template
is_default:
type: boolean
default: false
description: >-
Boolean indicating whether this version is the default version for this
prompt
additionalProperties: false
required:
- version
- prompt_id
- variables
- is_default
title: Prompt
description: >-
A prompt resource representing a stored OpenAI Compatible prompt template
in Llama Stack.
OpenAIDeleteResponseObject: OpenAIDeleteResponseObject:
type: object type: object
properties: properties:
@ -7621,6 +7893,7 @@ components:
- benchmark - benchmark
- tool - tool
- tool_group - tool_group
- prompt
const: benchmark const: benchmark
default: benchmark default: benchmark
description: The resource type, always benchmark description: The resource type, always benchmark
@ -8107,6 +8380,7 @@ components:
- benchmark - benchmark
- tool - tool
- tool_group - tool_group
- prompt
const: dataset const: dataset
default: dataset default: dataset
description: >- description: >-
@ -8219,6 +8493,7 @@ components:
- benchmark - benchmark
- tool - tool
- tool_group - tool_group
- prompt
const: model const: model
default: model default: model
description: >- description: >-
@ -8410,6 +8685,7 @@ components:
- benchmark - benchmark
- tool - tool
- tool_group - tool_group
- prompt
const: scoring_function const: scoring_function
default: scoring_function default: scoring_function
description: >- description: >-
@ -8486,6 +8762,7 @@ components:
- benchmark - benchmark
- tool - tool
- tool_group - tool_group
- prompt
const: shield const: shield
default: shield default: shield
description: The resource type, always shield description: The resource type, always shield
@ -8665,6 +8942,7 @@ components:
- benchmark - benchmark
- tool - tool
- tool_group - tool_group
- prompt
const: tool const: tool
default: tool default: tool
description: Type of resource, always 'tool' description: Type of resource, always 'tool'
@ -8723,6 +9001,7 @@ components:
- benchmark - benchmark
- tool - tool
- tool_group - tool_group
- prompt
const: tool_group const: tool_group
default: tool_group default: tool_group
description: Type of resource, always 'tool_group' description: Type of resource, always 'tool_group'
@ -8951,6 +9230,7 @@ components:
- benchmark - benchmark
- tool - tool
- tool_group - tool_group
- prompt
const: vector_db const: vector_db
default: vector_db default: vector_db
description: >- description: >-
@ -9577,6 +9857,18 @@ components:
title: OpenAIResponseObjectWithInput title: OpenAIResponseObjectWithInput
description: >- description: >-
OpenAI response object extended with input context information. OpenAI response object extended with input context information.
ListPromptsResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/Prompt'
additionalProperties: false
required:
- data
title: ListPromptsResponse
description: Response model to list prompts.
ListProvidersResponse: ListProvidersResponse:
type: object type: object
properties: properties:
@ -12723,6 +13015,16 @@ components:
title: ScoreBatchResponse title: ScoreBatchResponse
description: >- description: >-
Response from batch scoring operations on datasets. Response from batch scoring operations on datasets.
SetDefaultVersionRequest:
type: object
properties:
version:
type: integer
description: The version to set as default.
additionalProperties: false
required:
- version
title: SetDefaultVersionRequest
AlgorithmConfig: AlgorithmConfig:
oneOf: oneOf:
- $ref: '#/components/schemas/LoraFinetuningConfig' - $ref: '#/components/schemas/LoraFinetuningConfig'
@ -12919,6 +13221,32 @@ components:
description: >- description: >-
Response from the synthetic data generation. Batch of (prompt, response, score) Response from the synthetic data generation. Batch of (prompt, response, score)
tuples that pass the threshold. tuples that pass the threshold.
UpdatePromptRequest:
type: object
properties:
prompt:
type: string
description: The updated prompt text content.
version:
type: integer
description: >-
The current version of the prompt being updated.
variables:
type: array
items:
type: string
description: >-
Updated list of variable names that can be used in the prompt template.
set_as_default:
type: boolean
description: >-
Set the new version as the default (default=True).
additionalProperties: false
required:
- prompt
- version
- set_as_default
title: UpdatePromptRequest
VersionInfo: VersionInfo:
type: object type: object
properties: properties:
@ -13030,6 +13358,9 @@ tags:
- name: Inspect - name: Inspect
- name: Models - name: Models
- name: PostTraining (Coming Soon) - name: PostTraining (Coming Soon)
- name: Prompts
x-displayName: >-
Protocol for prompt management operations.
- name: Providers - name: Providers
x-displayName: >- x-displayName: >-
Providers API for inspecting, listing, and modifying providers and their configurations. Providers API for inspecting, listing, and modifying providers and their configurations.
@ -13057,6 +13388,7 @@ x-tagGroups:
- Inspect - Inspect
- Models - Models
- PostTraining (Coming Soon) - PostTraining (Coming Soon)
- Prompts
- Providers - Providers
- Safety - Safety
- Scoring - Scoring

View file

@ -131,6 +131,7 @@ html_static_path = ["../_static"]
def setup(app): def setup(app):
app.add_css_file("css/my_theme.css") app.add_css_file("css/my_theme.css")
app.add_js_file("js/detect_theme.js") app.add_js_file("js/detect_theme.js")
app.add_js_file("js/horizontal_nav.js")
app.add_js_file("js/keyboard_shortcuts.js") app.add_js_file("js/keyboard_shortcuts.js")
def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]): def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):

View file

@ -35,5 +35,5 @@ testing/record-replay
### Benchmarking ### Benchmarking
```{include} ../../../docs/source/distributions/k8s-benchmark/README.md ```{include} ../../../benchmarking/k8s-benchmark/README.md
``` ```

View file

@ -40,18 +40,15 @@ The system patches OpenAI and Ollama client methods to intercept calls before th
### Storage Architecture ### Storage Architecture
Recordings use a two-tier storage system optimized for both speed and debuggability: Recordings are stored as JSON files in the recording directory. They are looked up by their request hash.
``` ```
recordings/ recordings/
├── index.sqlite # Fast lookup by request hash
└── responses/ └── responses/
├── abc123def456.json # Individual response files ├── abc123def456.json # Individual response files
└── def789ghi012.json └── def789ghi012.json
``` ```
**SQLite index** enables O(log n) hash lookups and metadata queries without loading response bodies.
**JSON files** store complete request/response pairs in human-readable format for debugging. **JSON files** store complete request/response pairs in human-readable format for debugging.
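Concretely, replay resolves a request to a recording file named after the request hash. The sketch below illustrates that lookup; the canonicalization and hash function shown here are assumptions for illustration only, not the recording system's actual implementation:

```python
# Sketch: resolve a request to its JSON recording by hash.
# Assumes SHA-256 over a sorted-key JSON dump purely for illustration.
import hashlib
import json
from pathlib import Path

RECORDINGS_DIR = Path("tests/integration/recordings/responses")


def request_hash(endpoint: str, body: dict) -> str:
    canonical = json.dumps({"endpoint": endpoint, "body": body}, sort_keys=True)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()


def load_recording(endpoint: str, body: dict) -> dict | None:
    path = RECORDINGS_DIR / f"{request_hash(endpoint, body)}.json"
    if not path.exists():
        return None  # replay mode would surface this as a missing recording
    return json.loads(path.read_text())
```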
## Recording Modes ## Recording Modes
@ -166,8 +163,8 @@ This preserves type safety - when replayed, you get the same Pydantic objects wi
Control recording behavior globally: Control recording behavior globally:
```bash ```bash
export LLAMA_STACK_TEST_INFERENCE_MODE=replay export LLAMA_STACK_TEST_INFERENCE_MODE=replay # this is the default
export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings # default is tests/integration/recordings
pytest tests/integration/ pytest tests/integration/
``` ```

View file

@ -354,6 +354,47 @@ You can easily validate a request by running:
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers
``` ```
#### Kubernetes Authentication Provider
The server can be configured to use the Kubernetes SelfSubjectReview API to validate tokens directly against the Kubernetes API server:
```yaml
server:
auth:
provider_config:
type: "kubernetes"
api_server_url: "https://kubernetes.default.svc"
claims_mapping:
username: "roles"
groups: "roles"
uid: "uid_attr"
verify_tls: true
tls_cafile: "/path/to/ca.crt"
```
Configuration options:
- `api_server_url`: The Kubernetes API server URL (e.g., https://kubernetes.default.svc:6443)
- `verify_tls`: Whether to verify TLS certificates (default: true)
- `tls_cafile`: Path to CA certificate file for TLS verification
- `claims_mapping`: Mapping of Kubernetes user claims to access attributes
The provider validates tokens by sending a SelfSubjectReview request to the Kubernetes API server at `/apis/authentication.k8s.io/v1/selfsubjectreviews` and extracts the following user information from the response (see the sketch after this list):
- Username from the `userInfo.username` field
- Groups from the `userInfo.groups` field
- UID from the `userInfo.uid` field
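For illustration, the validation flow amounts to a single authenticated POST against that endpoint. This is a simplified sketch, not the provider's actual implementation; the function name, timeout, and error handling are assumptions:

```python
# Sketch: validate a bearer token with a Kubernetes SelfSubjectReview request.
# Simplified for illustration; the real provider also applies claims_mapping.
import requests


def validate_token(api_server_url: str, token: str, ca_file: str | None = None) -> dict:
    resp = requests.post(
        f"{api_server_url}/apis/authentication.k8s.io/v1/selfsubjectreviews",
        headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json"},
        json={"apiVersion": "authentication.k8s.io/v1", "kind": "SelfSubjectReview"},
        verify=ca_file if ca_file else True,  # mirrors verify_tls / tls_cafile behavior
        timeout=10,
    )
    resp.raise_for_status()  # a non-2xx response means the token is not valid
    user_info = resp.json().get("status", {}).get("userInfo", {})
    return {
        "username": user_info.get("username"),
        "groups": user_info.get("groups", []),
        "uid": user_info.get("uid"),
    }
```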
To obtain a token for testing:
```bash
kubectl create namespace llama-stack
kubectl create serviceaccount llama-stack-auth -n llama-stack
kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
```
You can validate a request by running:
```bash
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers
```
#### GitHub Token Provider #### GitHub Token Provider
Validates GitHub personal access tokens or OAuth tokens directly: Validates GitHub personal access tokens or OAuth tokens directly:
```yaml ```yaml

View file

@ -1,137 +1,55 @@
apiVersion: v1 apiVersion: v1
data: data:
stack_run_config.yaml: | stack_run_config.yaml: "version: '2'\nimage_name: kubernetes-demo\napis:\n- agents\n-
version: '2' inference\n- files\n- safety\n- telemetry\n- tool_runtime\n- vector_io\nproviders:\n
image_name: kubernetes-demo \ inference:\n - provider_id: vllm-inference\n provider_type: remote::vllm\n
apis: \ config:\n url: ${env.VLLM_URL:=http://localhost:8000/v1}\n max_tokens:
- agents ${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n tls_verify:
- inference ${env.VLLM_TLS_VERIFY:=true}\n - provider_id: vllm-safety\n provider_type:
- safety remote::vllm\n config:\n url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}\n
- telemetry \ max_tokens: ${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n
- tool_runtime \ tls_verify: ${env.VLLM_TLS_VERIFY:=true}\n - provider_id: sentence-transformers\n
- vector_io \ provider_type: inline::sentence-transformers\n config: {}\n vector_io:\n
providers: \ - provider_id: ${env.ENABLE_CHROMADB:+chromadb}\n provider_type: remote::chromadb\n
inference: \ config:\n url: ${env.CHROMADB_URL:=}\n kvstore:\n type: postgres\n
- provider_id: vllm-inference \ host: ${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n
provider_type: remote::vllm \ db: ${env.POSTGRES_DB:=llamastack}\n user: ${env.POSTGRES_USER:=llamastack}\n
config: \ password: ${env.POSTGRES_PASSWORD:=llamastack}\n files:\n - provider_id:
url: ${env.VLLM_URL:=http://localhost:8000/v1} meta-reference-files\n provider_type: inline::localfs\n config:\n storage_dir:
max_tokens: ${env.VLLM_MAX_TOKENS:=4096} ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}\n metadata_store:\n
api_token: ${env.VLLM_API_TOKEN:=fake} \ type: sqlite\n db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
tls_verify: ${env.VLLM_TLS_VERIFY:=true} \ \n safety:\n - provider_id: llama-guard\n provider_type: inline::llama-guard\n
- provider_id: vllm-safety \ config:\n excluded_categories: []\n agents:\n - provider_id: meta-reference\n
provider_type: remote::vllm \ provider_type: inline::meta-reference\n config:\n persistence_store:\n
config: \ type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n port:
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1} ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user:
max_tokens: ${env.VLLM_MAX_TOKENS:=4096} ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
api_token: ${env.VLLM_API_TOKEN:=fake} \ responses_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n
tls_verify: ${env.VLLM_TLS_VERIFY:=true} \ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n
- provider_id: sentence-transformers \ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
provider_type: inline::sentence-transformers \ telemetry:\n - provider_id: meta-reference\n provider_type: inline::meta-reference\n
config: {} \ config:\n service_name: \"${env.OTEL_SERVICE_NAME:=\\u200B}\"\n sinks:
vector_io: ${env.TELEMETRY_SINKS:=console}\n tool_runtime:\n - provider_id: brave-search\n
- provider_id: ${env.ENABLE_CHROMADB:+chromadb} \ provider_type: remote::brave-search\n config:\n api_key: ${env.BRAVE_SEARCH_API_KEY:+}\n
provider_type: remote::chromadb \ max_results: 3\n - provider_id: tavily-search\n provider_type: remote::tavily-search\n
config: \ config:\n api_key: ${env.TAVILY_SEARCH_API_KEY:+}\n max_results:
url: ${env.CHROMADB_URL:=} 3\n - provider_id: rag-runtime\n provider_type: inline::rag-runtime\n config:
kvstore: {}\n - provider_id: model-context-protocol\n provider_type: remote::model-context-protocol\n
type: postgres \ config: {}\nmetadata_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n
host: ${env.POSTGRES_HOST:=localhost} \ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user:
port: ${env.POSTGRES_PORT:=5432} ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n
db: ${env.POSTGRES_DB:=llamastack} \ table_name: llamastack_kvstore\ninference_store:\n type: postgres\n host:
user: ${env.POSTGRES_USER:=llamastack} ${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n
password: ${env.POSTGRES_PASSWORD:=llamastack} \ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\nmodels:\n-
safety: metadata:\n embedding_dimension: 384\n model_id: all-MiniLM-L6-v2\n provider_id:
- provider_id: llama-guard sentence-transformers\n model_type: embedding\n- metadata: {}\n model_id: ${env.INFERENCE_MODEL}\n
provider_type: inline::llama-guard \ provider_id: vllm-inference\n model_type: llm\n- metadata: {}\n model_id:
config: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\n provider_id: vllm-safety\n
excluded_categories: [] \ model_type: llm\nshields:\n- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\nvector_dbs:
agents: []\ndatasets: []\nscoring_fns: []\nbenchmarks: []\ntool_groups:\n- toolgroup_id:
- provider_id: meta-reference builtin::websearch\n provider_id: tavily-search\n- toolgroup_id: builtin::rag\n
provider_type: inline::meta-reference \ provider_id: rag-runtime\nserver:\n port: 8321\n auth:\n provider_config:\n
config: \ type: github_token\n"
persistence_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:+}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:+}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
config: {}
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
metadata_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: llamastack_kvstore
inference_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
models:
- metadata:
embedding_dimension: 384
model_id: all-MiniLM-L6-v2
provider_id: sentence-transformers
model_type: embedding
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
- metadata: {}
model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
provider_id: vllm-safety
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
server:
port: 8321
auth:
provider_config:
type: github_token
kind: ConfigMap kind: ConfigMap
metadata: metadata:
creationTimestamp: null creationTimestamp: null

View file

@ -3,6 +3,7 @@ image_name: kubernetes-demo
apis: apis:
- agents - agents
- inference - inference
- files
- safety - safety
- telemetry - telemetry
- tool_runtime - tool_runtime
@ -38,6 +39,14 @@ providers:
db: ${env.POSTGRES_DB:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack} user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack} password: ${env.POSTGRES_PASSWORD:=llamastack}
files:
- provider_id: meta-reference-files
provider_type: inline::localfs
config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
metadata_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
safety: safety:
- provider_id: llama-guard - provider_id: llama-guard
provider_type: inline::llama-guard provider_type: inline::llama-guard

View file

@ -18,12 +18,13 @@ embedding_model_id = (
).identifier ).identifier
embedding_dimension = em.metadata["embedding_dimension"] embedding_dimension = em.metadata["embedding_dimension"]
_ = client.vector_dbs.register( vector_db = client.vector_dbs.register(
vector_db_id=vector_db_id, vector_db_id=vector_db_id,
embedding_model=embedding_model_id, embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension, embedding_dimension=embedding_dimension,
provider_id="faiss", provider_id="faiss",
) )
vector_db_id = vector_db.identifier
source = "https://www.paulgraham.com/greatwork.html" source = "https://www.paulgraham.com/greatwork.html"
print("rag_tool> Ingesting document:", source) print("rag_tool> Ingesting document:", source)
document = RAGDocument( document = RAGDocument(
@ -35,7 +36,7 @@ document = RAGDocument(
client.tool_runtime.rag_tool.insert( client.tool_runtime.rag_tool.insert(
documents=[document], documents=[document],
vector_db_id=vector_db_id, vector_db_id=vector_db_id,
chunk_size_in_tokens=50, chunk_size_in_tokens=100,
) )
agent = Agent( agent = Agent(
client, client,

View file

@ -8,3 +8,4 @@ Here's a list of known external providers that you can use with Llama Stack:
| KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Inline **and** Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) | | KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Inline **and** Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) |
| RamaLama | Inference models with RamaLama | Inference | Remote | [ramalama-stack](https://github.com/containers/ramalama-stack) | | RamaLama | Inference models with RamaLama | Inference | Remote | [ramalama-stack](https://github.com/containers/ramalama-stack) |
| TrustyAI LM-Eval | Evaluate models with TrustyAI LM-Eval | Eval | Remote | [llama-stack-provider-lmeval](https://github.com/trustyai-explainability/llama-stack-provider-lmeval) | | TrustyAI LM-Eval | Evaluate models with TrustyAI LM-Eval | Eval | Remote | [llama-stack-provider-lmeval](https://github.com/trustyai-explainability/llama-stack-provider-lmeval) |
| MongoDB | VectorIO with MongoDB | Vector_IO | Remote | [mongodb-llama-stack](https://github.com/mongodb-partners/mongodb-llama-stack) |

View file

@ -15,8 +15,8 @@ AWS Bedrock inference provider for accessing various AI models through AWS's man
| `profile_name` | `str \| None` | No | | The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE | | `profile_name` | `str \| None` | No | | The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE |
| `total_max_attempts` | `int \| None` | No | | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS | | `total_max_attempts` | `int \| None` | No | | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS |
| `retry_mode` | `str \| None` | No | | A string representing the type of retries Boto3 will perform.Default use environment variable: AWS_RETRY_MODE | | `retry_mode` | `str \| None` | No | | A string representing the type of retries Boto3 will perform.Default use environment variable: AWS_RETRY_MODE |
| `connect_timeout` | `float \| None` | No | 60 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. | | `connect_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
| `read_timeout` | `float \| None` | No | 60 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. | | `read_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. |
| `session_ttl` | `int \| None` | No | 3600 | The time in seconds till a session expires. The default is 3600 seconds (1 hour). | | `session_ttl` | `int \| None` | No | 3600 | The time in seconds till a session expires. The default is 3600 seconds (1 hour). |
## Sample Configuration ## Sample Configuration

View file

@ -15,8 +15,8 @@ AWS Bedrock safety provider for content moderation using AWS's safety services.
| `profile_name` | `str \| None` | No | | The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE | | `profile_name` | `str \| None` | No | | The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE |
| `total_max_attempts` | `int \| None` | No | | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS | | `total_max_attempts` | `int \| None` | No | | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS |
| `retry_mode` | `str \| None` | No | | A string representing the type of retries Boto3 will perform.Default use environment variable: AWS_RETRY_MODE | | `retry_mode` | `str \| None` | No | | A string representing the type of retries Boto3 will perform.Default use environment variable: AWS_RETRY_MODE |
| `connect_timeout` | `float \| None` | No | 60 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. | | `connect_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
| `read_timeout` | `float \| None` | No | 60 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. | | `read_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. |
| `session_ttl` | `int \| None` | No | 3600 | The time in seconds till a session expires. The default is 3600 seconds (1 hour). | | `session_ttl` | `int \| None` | No | 3600 | The time in seconds till a session expires. The default is 3600 seconds (1 hour). |
## Sample Configuration ## Sample Configuration

View file

@ -79,3 +79,10 @@ class ConflictError(ValueError):
def __init__(self, message: str) -> None: def __init__(self, message: str) -> None:
super().__init__(message) super().__init__(message)
class TokenValidationError(ValueError):
"""raised when token validation fails during authentication"""
def __init__(self, message: str) -> None:
super().__init__(message)

View file

@ -102,6 +102,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
:cvar benchmarks: Benchmark suite management :cvar benchmarks: Benchmark suite management
:cvar tool_groups: Tool group organization :cvar tool_groups: Tool group organization
:cvar files: File storage and management :cvar files: File storage and management
:cvar prompts: Prompt versions and management
:cvar inspect: Built-in system inspection and introspection :cvar inspect: Built-in system inspection and introspection
""" """
@ -127,6 +128,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
benchmarks = "benchmarks" benchmarks = "benchmarks"
tool_groups = "tool_groups" tool_groups = "tool_groups"
files = "files" files = "files"
prompts = "prompts"
# built-in API # built-in API
inspect = "inspect" inspect = "inspect"

View file

@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .prompts import ListPromptsResponse, Prompt, Prompts
__all__ = ["Prompt", "Prompts", "ListPromptsResponse"]

View file

@ -0,0 +1,189 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import re
import secrets
from typing import Protocol, runtime_checkable
from pydantic import BaseModel, Field, field_validator, model_validator
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod
@json_schema_type
class Prompt(BaseModel):
"""A prompt resource representing a stored OpenAI Compatible prompt template in Llama Stack.
:param prompt: The system prompt text with variable placeholders. Variables are only supported when using the Responses API.
:param version: Version (integer starting at 1, incremented on save)
:param prompt_id: Unique identifier formatted as 'pmpt_<48-digit-hash>'
:param variables: List of prompt variable names that can be used in the prompt template
:param is_default: Boolean indicating whether this version is the default version for this prompt
"""
prompt: str | None = Field(default=None, description="The system prompt with variable placeholders")
version: int = Field(description="Version (integer starting at 1, incremented on save)", ge=1)
prompt_id: str = Field(description="Unique identifier in format 'pmpt_<48-digit-hash>'")
variables: list[str] = Field(
default_factory=list, description="List of variable names that can be used in the prompt template"
)
is_default: bool = Field(
default=False, description="Boolean indicating whether this version is the default version"
)
@field_validator("prompt_id")
@classmethod
def validate_prompt_id(cls, prompt_id: str) -> str:
if not isinstance(prompt_id, str):
raise TypeError("prompt_id must be a string in format 'pmpt_<48-digit-hash>'")
if not prompt_id.startswith("pmpt_"):
raise ValueError("prompt_id must start with 'pmpt_' prefix")
hex_part = prompt_id[5:]
if len(hex_part) != 48:
raise ValueError("prompt_id must be in format 'pmpt_<48-digit-hash>' (48 lowercase hex chars)")
for char in hex_part:
if char not in "0123456789abcdef":
raise ValueError("prompt_id hex part must contain only lowercase hex characters [0-9a-f]")
return prompt_id
@field_validator("version")
@classmethod
def validate_version(cls, prompt_version: int) -> int:
if prompt_version < 1:
raise ValueError("version must be >= 1")
return prompt_version
@model_validator(mode="after")
def validate_prompt_variables(self):
"""Validate that all variables used in the prompt are declared in the variables list."""
if not self.prompt:
return self
prompt_variables = set(re.findall(r"{{\s*(\w+)\s*}}", self.prompt))
declared_variables = set(self.variables)
undeclared = prompt_variables - declared_variables
if undeclared:
raise ValueError(f"Prompt contains undeclared variables: {sorted(undeclared)}")
return self
@classmethod
def generate_prompt_id(cls) -> str:
# Generate 48 hex characters (24 bytes)
random_bytes = secrets.token_bytes(24)
hex_string = random_bytes.hex()
return f"pmpt_{hex_string}"
class ListPromptsResponse(BaseModel):
"""Response model to list prompts."""
data: list[Prompt]
@runtime_checkable
@trace_protocol
class Prompts(Protocol):
"""Protocol for prompt management operations."""
@webmethod(route="/prompts", method="GET")
async def list_prompts(self) -> ListPromptsResponse:
"""List all prompts.
:returns: A ListPromptsResponse containing all prompts.
"""
...
@webmethod(route="/prompts/{prompt_id}/versions", method="GET")
async def list_prompt_versions(
self,
prompt_id: str,
) -> ListPromptsResponse:
"""List all versions of a specific prompt.
:param prompt_id: The identifier of the prompt to list versions for.
:returns: A ListPromptsResponse containing all versions of the prompt.
"""
...
@webmethod(route="/prompts/{prompt_id}", method="GET")
async def get_prompt(
self,
prompt_id: str,
version: int | None = None,
) -> Prompt:
"""Get a prompt by its identifier and optional version.
:param prompt_id: The identifier of the prompt to get.
:param version: The version of the prompt to get (defaults to latest).
:returns: A Prompt resource.
"""
...
@webmethod(route="/prompts", method="POST")
async def create_prompt(
self,
prompt: str,
variables: list[str] | None = None,
) -> Prompt:
"""Create a new prompt.
:param prompt: The prompt text content with variable placeholders.
:param variables: List of variable names that can be used in the prompt template.
:returns: The created Prompt resource.
"""
...
@webmethod(route="/prompts/{prompt_id}", method="PUT")
async def update_prompt(
self,
prompt_id: str,
prompt: str,
version: int,
variables: list[str] | None = None,
set_as_default: bool = True,
) -> Prompt:
"""Update an existing prompt (increments version).
:param prompt_id: The identifier of the prompt to update.
:param prompt: The updated prompt text content.
:param version: The current version of the prompt being updated.
:param variables: Updated list of variable names that can be used in the prompt template.
:param set_as_default: Set the new version as the default (default=True).
:returns: The updated Prompt resource with incremented version.
"""
...
@webmethod(route="/prompts/{prompt_id}", method="DELETE")
async def delete_prompt(
self,
prompt_id: str,
) -> None:
"""Delete a prompt.
:param prompt_id: The identifier of the prompt to delete.
"""
...
@webmethod(route="/prompts/{prompt_id}/set-default-version", method="PUT")
async def set_default_version(
self,
prompt_id: str,
version: int,
) -> Prompt:
"""Set which version of a prompt should be the default in get_prompt (latest).
:param prompt_id: The identifier of the prompt.
:param version: The version to set as default.
:returns: The prompt with the specified version now set as default.
"""
...

View file

@ -19,6 +19,7 @@ class ResourceType(StrEnum):
benchmark = "benchmark" benchmark = "benchmark"
tool = "tool" tool = "tool"
tool_group = "tool_group" tool_group = "tool_group"
prompt = "prompt"
class Resource(BaseModel): class Resource(BaseModel):

View file

@ -45,6 +45,7 @@ from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.core.utils.exec import formulate_run_args, run_command from llama_stack.core.utils.exec import formulate_run_args, run_command
from llama_stack.core.utils.image_types import LlamaStackImageType from llama_stack.core.utils.image_types import LlamaStackImageType
from llama_stack.providers.datatypes import Api from llama_stack.providers.datatypes import Api
from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig
DISTRIBS_PATH = Path(__file__).parent.parent.parent / "distributions" DISTRIBS_PATH = Path(__file__).parent.parent.parent / "distributions"
@ -294,6 +295,12 @@ def _generate_run_config(
if build_config.external_providers_dir if build_config.external_providers_dir
else EXTERNAL_PROVIDERS_DIR, else EXTERNAL_PROVIDERS_DIR,
) )
if not run_config.inference_store:
run_config.inference_store = SqliteSqlStoreConfig(
**SqliteSqlStoreConfig.sample_run_config(
__distro_dir__=(DISTRIBS_BASE_DIR / image_name).as_posix(), db_name="inference_store.db"
)
)
# build providers dict # build providers dict
provider_registry = get_provider_registry(build_config) provider_registry = get_provider_registry(build_config)
for api in apis: for api in apis:

View file

@ -7,6 +7,7 @@
from enum import StrEnum from enum import StrEnum
from pathlib import Path from pathlib import Path
from typing import Annotated, Any, Literal, Self from typing import Annotated, Any, Literal, Self
from urllib.parse import urlparse
from pydantic import BaseModel, Field, field_validator, model_validator from pydantic import BaseModel, Field, field_validator, model_validator
@ -212,6 +213,7 @@ class AuthProviderType(StrEnum):
OAUTH2_TOKEN = "oauth2_token" OAUTH2_TOKEN = "oauth2_token"
GITHUB_TOKEN = "github_token" GITHUB_TOKEN = "github_token"
CUSTOM = "custom" CUSTOM = "custom"
KUBERNETES = "kubernetes"
class OAuth2TokenAuthConfig(BaseModel): class OAuth2TokenAuthConfig(BaseModel):
@ -282,8 +284,45 @@ class GitHubTokenAuthConfig(BaseModel):
) )
class KubernetesAuthProviderConfig(BaseModel):
"""Configuration for Kubernetes authentication provider."""
type: Literal[AuthProviderType.KUBERNETES] = AuthProviderType.KUBERNETES
api_server_url: str = Field(
default="https://kubernetes.default.svc",
description="Kubernetes API server URL (e.g., https://api.cluster.domain:6443)",
)
verify_tls: bool = Field(default=True, description="Whether to verify TLS certificates")
tls_cafile: Path | None = Field(default=None, description="Path to CA certificate file for TLS verification")
claims_mapping: dict[str, str] = Field(
default_factory=lambda: {
"username": "roles",
"groups": "roles",
},
description="Mapping of Kubernetes user claims to access attributes",
)
@field_validator("api_server_url")
@classmethod
def validate_api_server_url(cls, v):
parsed = urlparse(v)
if not parsed.scheme or not parsed.netloc:
raise ValueError(f"api_server_url must be a valid URL with scheme and host: {v}")
if parsed.scheme not in ["http", "https"]:
raise ValueError(f"api_server_url scheme must be http or https: {v}")
return v
@field_validator("claims_mapping")
@classmethod
def validate_claims_mapping(cls, v):
for key, value in v.items():
if not value:
raise ValueError(f"claims_mapping value cannot be empty: {key}")
return v
AuthProviderConfig = Annotated[ AuthProviderConfig = Annotated[
OAuth2TokenAuthConfig | GitHubTokenAuthConfig | CustomAuthConfig, OAuth2TokenAuthConfig | GitHubTokenAuthConfig | CustomAuthConfig | KubernetesAuthProviderConfig,
Field(discriminator="type"), Field(discriminator="type"),
] ]
@ -392,6 +431,12 @@ class ServerConfig(BaseModel):
) )
class InferenceStoreConfig(BaseModel):
sql_store_config: SqlStoreConfig
max_write_queue_size: int = Field(default=10000, description="Max queued writes for inference store")
num_writers: int = Field(default=4, description="Number of concurrent background writers")
class StackRunConfig(BaseModel): class StackRunConfig(BaseModel):
version: int = LLAMA_STACK_RUN_CONFIG_VERSION version: int = LLAMA_STACK_RUN_CONFIG_VERSION
@ -425,11 +470,12 @@ Configuration for the persistence store used by the distribution registry. If no
a default SQLite store will be used.""", a default SQLite store will be used.""",
) )
inference_store: SqlStoreConfig | None = Field( inference_store: InferenceStoreConfig | SqlStoreConfig | None = Field(
default=None, default=None,
description=""" description="""
Configuration for the persistence store used by the inference API. If not specified, Configuration for the persistence store used by the inference API. Can be either a
a default SQLite store will be used.""", InferenceStoreConfig (with queue tuning parameters) or a SqlStoreConfig (deprecated).
If not specified, a default SQLite store will be used.""",
) )
# registry of "resources" in the distribution # registry of "resources" in the distribution

View file

@ -10,7 +10,6 @@ import json
import logging # allow-direct-logging import logging # allow-direct-logging
import os import os
import sys import sys
from concurrent.futures import ThreadPoolExecutor
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
@ -148,7 +147,6 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
self.async_client = AsyncLlamaStackAsLibraryClient( self.async_client = AsyncLlamaStackAsLibraryClient(
config_path_or_distro_name, custom_provider_registry, provider_data, skip_logger_removal config_path_or_distro_name, custom_provider_registry, provider_data, skip_logger_removal
) )
self.pool_executor = ThreadPoolExecutor(max_workers=4)
self.provider_data = provider_data self.provider_data = provider_data
self.loop = asyncio.new_event_loop() self.loop = asyncio.new_event_loop()

View file

@ -0,0 +1,233 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
from typing import Any
from pydantic import BaseModel
from llama_stack.apis.prompts import ListPromptsResponse, Prompt, Prompts
from llama_stack.core.datatypes import StackRunConfig
from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR
from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
class PromptServiceConfig(BaseModel):
"""Configuration for the built-in prompt service.
:param run_config: Stack run configuration containing distribution info
"""
run_config: StackRunConfig
async def get_provider_impl(config: PromptServiceConfig, deps: dict[Any, Any]):
"""Get the prompt service implementation."""
impl = PromptServiceImpl(config, deps)
await impl.initialize()
return impl
class PromptServiceImpl(Prompts):
"""Built-in prompt service implementation using KVStore."""
def __init__(self, config: PromptServiceConfig, deps: dict[Any, Any]):
self.config = config
self.deps = deps
self.kvstore: KVStore
async def initialize(self) -> None:
kvstore_config = SqliteKVStoreConfig(
db_path=(DISTRIBS_BASE_DIR / self.config.run_config.image_name / "prompts.db").as_posix()
)
self.kvstore = await kvstore_impl(kvstore_config)
def _get_default_key(self, prompt_id: str) -> str:
"""Get the KVStore key that stores the default version number."""
return f"prompts:v1:{prompt_id}:default"
async def _get_prompt_key(self, prompt_id: str, version: int | None = None) -> str:
"""Get the KVStore key for prompt data, returning default version if applicable."""
if version:
return self._get_version_key(prompt_id, str(version))
default_key = self._get_default_key(prompt_id)
resolved_version = await self.kvstore.get(default_key)
if resolved_version is None:
raise ValueError(f"Prompt {prompt_id}:default not found")
return self._get_version_key(prompt_id, resolved_version)
def _get_version_key(self, prompt_id: str, version: str) -> str:
"""Get the KVStore key for a specific prompt version."""
return f"prompts:v1:{prompt_id}:{version}"
def _get_list_key_prefix(self) -> str:
"""Get the key prefix for listing prompts."""
return "prompts:v1:"
def _serialize_prompt(self, prompt: Prompt) -> str:
"""Serialize a prompt to JSON string for storage."""
return json.dumps(
{
"prompt_id": prompt.prompt_id,
"prompt": prompt.prompt,
"version": prompt.version,
"variables": prompt.variables or [],
"is_default": prompt.is_default,
}
)
def _deserialize_prompt(self, data: str) -> Prompt:
"""Deserialize a prompt from JSON string."""
obj = json.loads(data)
return Prompt(
prompt_id=obj["prompt_id"],
prompt=obj["prompt"],
version=obj["version"],
variables=obj.get("variables", []),
is_default=obj.get("is_default", False),
)
async def list_prompts(self) -> ListPromptsResponse:
"""List all prompts (default versions only)."""
prefix = self._get_list_key_prefix()
keys = await self.kvstore.keys_in_range(prefix, prefix + "\xff")
prompts = []
for key in keys:
if key.endswith(":default"):
try:
default_version = await self.kvstore.get(key)
if default_version:
prompt_id = key.replace(prefix, "").replace(":default", "")
version_key = self._get_version_key(prompt_id, default_version)
data = await self.kvstore.get(version_key)
if data:
prompt = self._deserialize_prompt(data)
prompts.append(prompt)
except (json.JSONDecodeError, KeyError):
continue
prompts.sort(key=lambda p: p.prompt_id or "", reverse=True)
return ListPromptsResponse(data=prompts)
async def get_prompt(self, prompt_id: str, version: int | None = None) -> Prompt:
"""Get a prompt by its identifier and optional version."""
key = await self._get_prompt_key(prompt_id, version)
data = await self.kvstore.get(key)
if data is None:
raise ValueError(f"Prompt {prompt_id}:{version if version else 'default'} not found")
return self._deserialize_prompt(data)
async def create_prompt(
self,
prompt: str,
variables: list[str] | None = None,
) -> Prompt:
"""Create a new prompt."""
if variables is None:
variables = []
prompt_obj = Prompt(
prompt_id=Prompt.generate_prompt_id(),
prompt=prompt,
version=1,
variables=variables,
)
version_key = self._get_version_key(prompt_obj.prompt_id, str(prompt_obj.version))
data = self._serialize_prompt(prompt_obj)
await self.kvstore.set(version_key, data)
default_key = self._get_default_key(prompt_obj.prompt_id)
await self.kvstore.set(default_key, str(prompt_obj.version))
return prompt_obj
async def update_prompt(
self,
prompt_id: str,
prompt: str,
version: int,
variables: list[str] | None = None,
set_as_default: bool = True,
) -> Prompt:
"""Update an existing prompt (increments version)."""
if version < 1:
raise ValueError("Version must be >= 1")
if variables is None:
variables = []
prompt_versions = await self.list_prompt_versions(prompt_id)
latest_prompt = max(prompt_versions.data, key=lambda x: int(x.version))
if version and latest_prompt.version != version:
raise ValueError(
f"'{version}' is not the latest prompt version for prompt_id='{prompt_id}'. Use the latest version '{latest_prompt.version}' in request."
)
current_version = latest_prompt.version if version is None else version
new_version = current_version + 1
updated_prompt = Prompt(prompt_id=prompt_id, prompt=prompt, version=new_version, variables=variables)
version_key = self._get_version_key(prompt_id, str(new_version))
data = self._serialize_prompt(updated_prompt)
await self.kvstore.set(version_key, data)
if set_as_default:
await self.set_default_version(prompt_id, new_version)
return updated_prompt
async def delete_prompt(self, prompt_id: str) -> None:
"""Delete a prompt and all its versions."""
await self.get_prompt(prompt_id)
prefix = f"prompts:v1:{prompt_id}:"
keys = await self.kvstore.keys_in_range(prefix, prefix + "\xff")
for key in keys:
await self.kvstore.delete(key)
async def list_prompt_versions(self, prompt_id: str) -> ListPromptsResponse:
"""List all versions of a specific prompt."""
prefix = f"prompts:v1:{prompt_id}:"
keys = await self.kvstore.keys_in_range(prefix, prefix + "\xff")
default_version = None
prompts = []
for key in keys:
data = await self.kvstore.get(key)
if key.endswith(":default"):
default_version = data
else:
if data:
prompt_obj = self._deserialize_prompt(data)
prompts.append(prompt_obj)
if not prompts:
raise ValueError(f"Prompt {prompt_id} not found")
for prompt in prompts:
prompt.is_default = str(prompt.version) == default_version
prompts.sort(key=lambda x: x.version)
return ListPromptsResponse(data=prompts)
async def set_default_version(self, prompt_id: str, version: int) -> Prompt:
"""Set which version of a prompt should be the default, If not set. the default is the latest."""
version_key = self._get_version_key(prompt_id, str(version))
data = await self.kvstore.get(version_key)
if data is None:
raise ValueError(f"Prompt {prompt_id} version {version} not found")
default_key = self._get_default_key(prompt_id)
await self.kvstore.set(default_key, str(version))
return self._deserialize_prompt(data)
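
For orientation, a minimal usage sketch of the prompt service above, assuming an initialized implementation (here named prompts) backed by a kvstore. The method names and version semantics are taken from the code; the scaffolding and prompt text are illustrative only, and variable templating syntax is not shown in this diff.

async def demo(prompts) -> None:
    # create_prompt stores version 1 and marks it as the default
    created = await prompts.create_prompt(prompt="Summarize the attached document in one paragraph.")

    # update_prompt must target the latest version and bumps it (1 -> 2), making it the new default
    updated = await prompts.update_prompt(
        prompt_id=created.prompt_id,
        prompt="Summarize the attached document in three bullet points.",
        version=created.version,
    )
    assert updated.version == created.version + 1

    # pin the default back to version 1; get_prompt without a version resolves the default
    await prompts.set_default_version(created.prompt_id, 1)
    default = await prompts.get_prompt(created.prompt_id)

    # list_prompts returns one entry per prompt: its current default version
    listing = await prompts.list_prompts()
    versions = await prompts.list_prompt_versions(created.prompt_id)
    print(default.version, [p.version for p in versions.data], len(listing.data))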

View file

@ -19,6 +19,7 @@ from llama_stack.apis.inference import Inference, InferenceProvider
from llama_stack.apis.inspect import Inspect from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models from llama_stack.apis.models import Models
from llama_stack.apis.post_training import PostTraining from llama_stack.apis.post_training import PostTraining
from llama_stack.apis.prompts import Prompts
from llama_stack.apis.providers import Providers as ProvidersAPI from llama_stack.apis.providers import Providers as ProvidersAPI
from llama_stack.apis.safety import Safety from llama_stack.apis.safety import Safety
from llama_stack.apis.scoring import Scoring from llama_stack.apis.scoring import Scoring
@ -93,6 +94,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
Api.tool_groups: ToolGroups, Api.tool_groups: ToolGroups,
Api.tool_runtime: ToolRuntime, Api.tool_runtime: ToolRuntime,
Api.files: Files, Api.files: Files,
Api.prompts: Prompts,
} }
if external_apis: if external_apis:
@ -284,7 +286,15 @@ async def instantiate_providers(
        if provider.provider_id is None:
            continue

-        deps = {a: impls[a] for a in provider.spec.api_dependencies}
+        try:
+            deps = {a: impls[a] for a in provider.spec.api_dependencies}
+        except KeyError as e:
+            missing_api = e.args[0]
+            raise RuntimeError(
+                f"Failed to resolve '{provider.spec.api.value}' provider '{provider.provider_id}' of type '{provider.spec.provider_type}': "
+                f"required dependency '{missing_api.value}' is not available. "
+                f"Please add a '{missing_api.value}' provider to your configuration or check if the provider is properly configured."
+            ) from e
        for a in provider.spec.optional_api_dependencies:
            if a in impls:
                deps[a] = impls[a]

View file

@ -78,7 +78,10 @@ async def get_auto_router_impl(
    # TODO: move pass configs to routers instead
    if api == Api.inference and run_config.inference_store:
-        inference_store = InferenceStore(run_config.inference_store, policy)
+        inference_store = InferenceStore(
+            config=run_config.inference_store,
+            policy=policy,
+        )
        await inference_store.initialize()
        api_to_dep_impl["store"] = inference_store

View file

@ -63,7 +63,7 @@ from llama_stack.models.llama.llama3.chat_format import ChatFormat
from llama_stack.models.llama.llama3.tokenizer import Tokenizer from llama_stack.models.llama.llama3.tokenizer import Tokenizer
from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
from llama_stack.providers.utils.inference.inference_store import InferenceStore from llama_stack.providers.utils.inference.inference_store import InferenceStore
from llama_stack.providers.utils.telemetry.tracing import get_current_span from llama_stack.providers.utils.telemetry.tracing import enqueue_event, get_current_span
logger = get_logger(name=__name__, category="core::routers") logger = get_logger(name=__name__, category="core::routers")
@ -90,6 +90,11 @@ class InferenceRouter(Inference):
async def shutdown(self) -> None: async def shutdown(self) -> None:
logger.debug("InferenceRouter.shutdown") logger.debug("InferenceRouter.shutdown")
if self.store:
try:
await self.store.shutdown()
except Exception as e:
logger.warning(f"Error during InferenceStore shutdown: {e}")
async def register_model( async def register_model(
self, self,
@ -160,7 +165,7 @@ class InferenceRouter(Inference):
metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model) metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model)
if self.telemetry: if self.telemetry:
for metric in metrics: for metric in metrics:
await self.telemetry.log_event(metric) enqueue_event(metric)
return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics] return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]
async def _count_tokens( async def _count_tokens(
@ -431,7 +436,7 @@ class InferenceRouter(Inference):
model=model_obj, model=model_obj,
) )
for metric in metrics: for metric in metrics:
await self.telemetry.log_event(metric) enqueue_event(metric)
# these metrics will show up in the client response. # these metrics will show up in the client response.
response.metrics = ( response.metrics = (
@ -527,7 +532,7 @@ class InferenceRouter(Inference):
# Store the response with the ID that will be returned to the client # Store the response with the ID that will be returned to the client
if self.store: if self.store:
await self.store.store_chat_completion(response, messages) asyncio.create_task(self.store.store_chat_completion(response, messages))
if self.telemetry: if self.telemetry:
metrics = self._construct_metrics( metrics = self._construct_metrics(
@ -537,7 +542,7 @@ class InferenceRouter(Inference):
model=model_obj, model=model_obj,
) )
for metric in metrics: for metric in metrics:
await self.telemetry.log_event(metric) enqueue_event(metric)
# these metrics will show up in the client response. # these metrics will show up in the client response.
response.metrics = ( response.metrics = (
metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
@ -664,7 +669,7 @@ class InferenceRouter(Inference):
"completion_tokens", "completion_tokens",
"total_tokens", "total_tokens",
]: # Only log completion and total tokens ]: # Only log completion and total tokens
await self.telemetry.log_event(metric) enqueue_event(metric)
# Return metrics in response # Return metrics in response
async_metrics = [ async_metrics = [
@ -710,7 +715,7 @@ class InferenceRouter(Inference):
) )
for metric in completion_metrics: for metric in completion_metrics:
if metric.metric in ["completion_tokens", "total_tokens"]: # Only log completion and total tokens if metric.metric in ["completion_tokens", "total_tokens"]: # Only log completion and total tokens
await self.telemetry.log_event(metric) enqueue_event(metric)
# Return metrics in response # Return metrics in response
return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics] return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics]
@ -755,7 +760,7 @@ class InferenceRouter(Inference):
choices_data[idx] = { choices_data[idx] = {
"content_parts": [], "content_parts": [],
"tool_calls_builder": {}, "tool_calls_builder": {},
"finish_reason": None, "finish_reason": "stop",
"logprobs_content_parts": [], "logprobs_content_parts": [],
} }
current_choice_data = choices_data[idx] current_choice_data = choices_data[idx]
@ -806,7 +811,7 @@ class InferenceRouter(Inference):
model=model, model=model,
) )
for metric in metrics: for metric in metrics:
await self.telemetry.log_event(metric) enqueue_event(metric)
yield chunk yield chunk
finally: finally:
@ -855,4 +860,4 @@ class InferenceRouter(Inference):
object="chat.completion", object="chat.completion",
) )
logger.debug(f"InferenceRouter.completion_response: {final_response}") logger.debug(f"InferenceRouter.completion_response: {final_response}")
await self.store.store_chat_completion(final_response, messages) asyncio.create_task(self.store.store_chat_completion(final_response, messages))

View file

@ -53,6 +53,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
vector_db_name: str | None = None, vector_db_name: str | None = None,
) -> VectorDB: ) -> VectorDB:
provider_vector_db_id = provider_vector_db_id or vector_db_id provider_vector_db_id = provider_vector_db_id or vector_db_id
model = await lookup_model(self, embedding_model) model = await lookup_model(self, embedding_model)
if model is None: if model is None:
raise ModelNotFoundError(embedding_model) raise ModelNotFoundError(embedding_model)
@ -60,14 +61,33 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
            raise ModelTypeError(embedding_model, model.model_type, ModelType.embedding)
        if "embedding_dimension" not in model.metadata:
            raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
+
+        provider = self.impls_by_provider_id[provider_id]
+        logger.warning(
+            "VectorDB is being deprecated in future releases in favor of VectorStore. Please migrate your usage accordingly."
+        )
+        vector_store = await provider.openai_create_vector_store(
+            name=vector_db_name or vector_db_id,
+            embedding_model=embedding_model,
+            embedding_dimension=model.metadata["embedding_dimension"],
+            provider_id=provider_id,
+            provider_vector_db_id=provider_vector_db_id,
+        )
+
+        vector_store_id = vector_store.id
+        actual_provider_vector_db_id = provider_vector_db_id or vector_store_id
+        logger.warning(
+            f"Ignoring vector_db_id {vector_db_id} and using vector_store_id {vector_store_id} instead. Setting VectorDB {vector_db_id} to VectorDB.vector_db_name"
+        )
+
        vector_db_data = {
-            "identifier": vector_db_id,
+            "identifier": vector_store_id,
            "type": ResourceType.vector_db.value,
            "provider_id": provider_id,
-            "provider_resource_id": provider_vector_db_id,
+            "provider_resource_id": actual_provider_vector_db_id,
            "embedding_model": embedding_model,
            "embedding_dimension": model.metadata["embedding_dimension"],
-            "vector_db_name": vector_db_name,
+            "vector_db_name": vector_store.name,
        }
        vector_db = TypeAdapter(VectorDBWithOwner).validate_python(vector_db_data)
        await self.register_object(vector_db)

View file

@ -8,16 +8,18 @@ import ssl
import time import time
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from asyncio import Lock from asyncio import Lock
from urllib.parse import parse_qs, urlparse from urllib.parse import parse_qs, urljoin, urlparse
import httpx import httpx
from jose import jwt from jose import jwt
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from llama_stack.apis.common.errors import TokenValidationError
from llama_stack.core.datatypes import ( from llama_stack.core.datatypes import (
AuthenticationConfig, AuthenticationConfig,
CustomAuthConfig, CustomAuthConfig,
GitHubTokenAuthConfig, GitHubTokenAuthConfig,
KubernetesAuthProviderConfig,
OAuth2TokenAuthConfig, OAuth2TokenAuthConfig,
User, User,
) )
@ -162,7 +164,7 @@ class OAuth2TokenAuthProvider(AuthProvider):
auth=auth, auth=auth,
timeout=10.0, # Add a reasonable timeout timeout=10.0, # Add a reasonable timeout
) )
if response.status_code != 200: if response.status_code != httpx.codes.OK:
logger.warning(f"Token introspection failed with status code: {response.status_code}") logger.warning(f"Token introspection failed with status code: {response.status_code}")
raise ValueError(f"Token introspection failed: {response.status_code}") raise ValueError(f"Token introspection failed: {response.status_code}")
@ -272,7 +274,7 @@ class CustomAuthProvider(AuthProvider):
json=auth_request.model_dump(), json=auth_request.model_dump(),
timeout=10.0, # Add a reasonable timeout timeout=10.0, # Add a reasonable timeout
) )
if response.status_code != 200: if response.status_code != httpx.codes.OK:
logger.warning(f"Authentication failed with status code: {response.status_code}") logger.warning(f"Authentication failed with status code: {response.status_code}")
raise ValueError(f"Authentication failed: {response.status_code}") raise ValueError(f"Authentication failed: {response.status_code}")
@ -374,6 +376,89 @@ async def _get_github_user_info(access_token: str, github_api_base_url: str) ->
} }
class KubernetesAuthProvider(AuthProvider):
"""
Kubernetes authentication provider that validates tokens using the Kubernetes SelfSubjectReview API.
This provider integrates with the Kubernetes API server, using the
/apis/authentication.k8s.io/v1/selfsubjectreviews endpoint to validate tokens and extract user information.
"""
def __init__(self, config: KubernetesAuthProviderConfig):
self.config = config
def _httpx_verify_value(self) -> bool | str:
"""
Build the value for httpx's `verify` parameter.
- False disables verification.
- Path string points to a CA bundle.
- True uses system defaults.
"""
if not self.config.verify_tls:
return False
if self.config.tls_cafile:
return self.config.tls_cafile.as_posix()
return True
async def validate_token(self, token: str, scope: dict | None = None) -> User:
"""Validate a token using Kubernetes SelfSubjectReview API endpoint."""
# Build the Kubernetes SelfSubjectReview API endpoint URL
review_api_url = urljoin(self.config.api_server_url, "/apis/authentication.k8s.io/v1/selfsubjectreviews")
# Create SelfSubjectReview request body
review_request = {"apiVersion": "authentication.k8s.io/v1", "kind": "SelfSubjectReview"}
verify = self._httpx_verify_value()
try:
async with httpx.AsyncClient(verify=verify, timeout=10.0) as client:
response = await client.post(
review_api_url,
json=review_request,
headers={
"Authorization": f"Bearer {token}",
"Content-Type": "application/json",
},
)
if response.status_code == httpx.codes.UNAUTHORIZED:
raise TokenValidationError("Invalid token")
if response.status_code != httpx.codes.CREATED:
logger.warning(f"Kubernetes SelfSubjectReview API failed with status code: {response.status_code}")
raise TokenValidationError(f"Token validation failed: {response.status_code}")
review_response = response.json()
# Extract user information from SelfSubjectReview response
status = review_response.get("status", {})
if not status:
raise ValueError("No status found in SelfSubjectReview response")
user_info = status.get("userInfo", {})
if not user_info:
raise ValueError("No userInfo found in SelfSubjectReview response")
username = user_info.get("username")
if not username:
raise ValueError("No username found in SelfSubjectReview response")
# Build user attributes from Kubernetes user info
user_attributes = get_attributes_from_claims(user_info, self.config.claims_mapping)
return User(
principal=username,
attributes=user_attributes,
)
except httpx.TimeoutException:
logger.warning("Kubernetes SelfSubjectReview API request timed out")
raise ValueError("Token validation timeout") from None
except Exception as e:
logger.warning(f"Error during token validation: {str(e)}")
raise ValueError(f"Token validation error: {str(e)}") from e
async def close(self):
"""Close any resources."""
pass
def create_auth_provider(config: AuthenticationConfig) -> AuthProvider: def create_auth_provider(config: AuthenticationConfig) -> AuthProvider:
"""Factory function to create the appropriate auth provider.""" """Factory function to create the appropriate auth provider."""
provider_config = config.provider_config provider_config = config.provider_config
@ -384,5 +469,7 @@ def create_auth_provider(config: AuthenticationConfig) -> AuthProvider:
return OAuth2TokenAuthProvider(provider_config) return OAuth2TokenAuthProvider(provider_config)
elif isinstance(provider_config, GitHubTokenAuthConfig): elif isinstance(provider_config, GitHubTokenAuthConfig):
return GitHubTokenAuthProvider(provider_config) return GitHubTokenAuthProvider(provider_config)
elif isinstance(provider_config, KubernetesAuthProviderConfig):
return KubernetesAuthProvider(provider_config)
else: else:
raise ValueError(f"Unknown authentication provider config type: {type(provider_config)}") raise ValueError(f"Unknown authentication provider config type: {type(provider_config)}")

View file

@ -132,15 +132,17 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationErro
            },
        )
    elif isinstance(exc, ConflictError):
-        return HTTPException(status_code=409, detail=str(exc))
+        return HTTPException(status_code=httpx.codes.CONFLICT, detail=str(exc))
    elif isinstance(exc, ResourceNotFoundError):
-        return HTTPException(status_code=404, detail=str(exc))
+        return HTTPException(status_code=httpx.codes.NOT_FOUND, detail=str(exc))
    elif isinstance(exc, ValueError):
        return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=f"Invalid value: {str(exc)}")
    elif isinstance(exc, BadRequestError):
        return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=str(exc))
    elif isinstance(exc, PermissionError | AccessDeniedError):
        return HTTPException(status_code=httpx.codes.FORBIDDEN, detail=f"Permission denied: {str(exc)}")
+    elif isinstance(exc, ConnectionError | httpx.ConnectError):
+        return HTTPException(status_code=httpx.codes.BAD_GATEWAY, detail=str(exc))
    elif isinstance(exc, asyncio.TimeoutError | TimeoutError):
        return HTTPException(status_code=httpx.codes.GATEWAY_TIMEOUT, detail=f"Operation timed out: {str(exc)}")
    elif isinstance(exc, NotImplementedError):
@ -513,6 +515,7 @@ def main(args: argparse.Namespace | None = None):
apis_to_serve.add("inspect") apis_to_serve.add("inspect")
apis_to_serve.add("providers") apis_to_serve.add("providers")
apis_to_serve.add("prompts")
for api_str in apis_to_serve: for api_str in apis_to_serve:
api = Api(api_str) api = Api(api_str)

View file

@ -24,6 +24,7 @@ from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models from llama_stack.apis.models import Models
from llama_stack.apis.post_training import PostTraining from llama_stack.apis.post_training import PostTraining
from llama_stack.apis.prompts import Prompts
from llama_stack.apis.providers import Providers from llama_stack.apis.providers import Providers
from llama_stack.apis.safety import Safety from llama_stack.apis.safety import Safety
from llama_stack.apis.scoring import Scoring from llama_stack.apis.scoring import Scoring
@ -37,6 +38,7 @@ from llama_stack.apis.vector_io import VectorIO
from llama_stack.core.datatypes import Provider, StackRunConfig from llama_stack.core.datatypes import Provider, StackRunConfig
from llama_stack.core.distribution import get_provider_registry from llama_stack.core.distribution import get_provider_registry
from llama_stack.core.inspect import DistributionInspectConfig, DistributionInspectImpl from llama_stack.core.inspect import DistributionInspectConfig, DistributionInspectImpl
from llama_stack.core.prompts.prompts import PromptServiceConfig, PromptServiceImpl
from llama_stack.core.providers import ProviderImpl, ProviderImplConfig from llama_stack.core.providers import ProviderImpl, ProviderImplConfig
from llama_stack.core.resolver import ProviderRegistry, resolve_impls from llama_stack.core.resolver import ProviderRegistry, resolve_impls
from llama_stack.core.routing_tables.common import CommonRoutingTableImpl from llama_stack.core.routing_tables.common import CommonRoutingTableImpl
@ -72,6 +74,7 @@ class LlamaStack(
ToolRuntime, ToolRuntime,
RAGToolRuntime, RAGToolRuntime,
Files, Files,
Prompts,
): ):
pass pass
@ -305,6 +308,12 @@ def add_internal_implementations(impls: dict[Api, Any], run_config: StackRunConf
) )
impls[Api.providers] = providers_impl impls[Api.providers] = providers_impl
prompts_impl = PromptServiceImpl(
PromptServiceConfig(run_config=run_config),
deps=impls,
)
impls[Api.prompts] = prompts_impl
# Produces a stack of providers for the given run config. Not all APIs may be # Produces a stack of providers for the given run config. Not all APIs may be
# asked for in the run config. # asked for in the run config.
@ -329,6 +338,9 @@ async def construct_stack(
# Add internal implementations after all other providers are resolved # Add internal implementations after all other providers are resolved
add_internal_implementations(impls, run_config) add_internal_implementations(impls, run_config)
if Api.prompts in impls:
await impls[Api.prompts].initialize()
await register_resources(run_config, impls) await register_resources(run_config, impls)
await refresh_registry_once(impls) await refresh_registry_once(impls)

View file

@ -11,9 +11,7 @@ from ..starter.starter import get_distribution_template as get_starter_distribut
def get_distribution_template() -> DistributionTemplate:
-    template = get_starter_distribution_template()
-    name = "ci-tests"
-    template.name = name
+    template = get_starter_distribution_template(name="ci-tests")
    template.description = "CI tests for Llama Stack"

    return template

View file

@ -89,28 +89,28 @@ providers:
config: config:
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/faiss_store.db
- provider_id: sqlite-vec - provider_id: sqlite-vec
provider_type: inline::sqlite-vec provider_type: inline::sqlite-vec
config: config:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/sqlite_vec.db
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/sqlite_vec_registry.db
- provider_id: ${env.MILVUS_URL:+milvus} - provider_id: ${env.MILVUS_URL:+milvus}
provider_type: inline::milvus provider_type: inline::milvus
config: config:
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/ci-tests}/milvus.db
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/milvus_registry.db
- provider_id: ${env.CHROMADB_URL:+chromadb} - provider_id: ${env.CHROMADB_URL:+chromadb}
provider_type: remote::chromadb provider_type: remote::chromadb
config: config:
url: ${env.CHROMADB_URL:=} url: ${env.CHROMADB_URL:=}
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests/}/chroma_remote_registry.db
- provider_id: ${env.PGVECTOR_DB:+pgvector} - provider_id: ${env.PGVECTOR_DB:+pgvector}
provider_type: remote::pgvector provider_type: remote::pgvector
config: config:
@ -121,15 +121,15 @@ providers:
password: ${env.PGVECTOR_PASSWORD:=} password: ${env.PGVECTOR_PASSWORD:=}
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/pgvector_registry.db
files: files:
- provider_id: meta-reference-files - provider_id: meta-reference-files
provider_type: inline::localfs provider_type: inline::localfs
config: config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/ci-tests/files}
metadata_store: metadata_store:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/files_metadata.db
safety: safety:
- provider_id: llama-guard - provider_id: llama-guard
provider_type: inline::llama-guard provider_type: inline::llama-guard

View file

@ -89,28 +89,28 @@ providers:
config: config:
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/faiss_store.db
- provider_id: sqlite-vec - provider_id: sqlite-vec
provider_type: inline::sqlite-vec provider_type: inline::sqlite-vec
config: config:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec.db
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec_registry.db
- provider_id: ${env.MILVUS_URL:+milvus} - provider_id: ${env.MILVUS_URL:+milvus}
provider_type: inline::milvus provider_type: inline::milvus
config: config:
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter-gpu}/milvus.db
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/milvus_registry.db
- provider_id: ${env.CHROMADB_URL:+chromadb} - provider_id: ${env.CHROMADB_URL:+chromadb}
provider_type: remote::chromadb provider_type: remote::chromadb
config: config:
url: ${env.CHROMADB_URL:=} url: ${env.CHROMADB_URL:=}
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu/}/chroma_remote_registry.db
- provider_id: ${env.PGVECTOR_DB:+pgvector} - provider_id: ${env.PGVECTOR_DB:+pgvector}
provider_type: remote::pgvector provider_type: remote::pgvector
config: config:
@ -121,15 +121,15 @@ providers:
password: ${env.PGVECTOR_PASSWORD:=} password: ${env.PGVECTOR_PASSWORD:=}
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/pgvector_registry.db
files: files:
- provider_id: meta-reference-files - provider_id: meta-reference-files
provider_type: inline::localfs provider_type: inline::localfs
config: config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter-gpu/files}
metadata_store: metadata_store:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/files_metadata.db
safety: safety:
- provider_id: llama-guard - provider_id: llama-guard
provider_type: inline::llama-guard provider_type: inline::llama-guard

View file

@ -11,9 +11,7 @@ from ..starter.starter import get_distribution_template as get_starter_distribut
def get_distribution_template() -> DistributionTemplate:
-    template = get_starter_distribution_template()
-    name = "starter-gpu"
-    template.name = name
+    template = get_starter_distribution_template(name="starter-gpu")
    template.description = "Quick start template for running Llama Stack with several popular providers. This distribution is intended for GPU-enabled environments."

    template.providers["post_training"] = [

View file

@ -99,9 +99,8 @@ def get_remote_inference_providers() -> list[Provider]:
    return inference_providers


-def get_distribution_template() -> DistributionTemplate:
+def get_distribution_template(name: str = "starter") -> DistributionTemplate:
    remote_inference_providers = get_remote_inference_providers()
-    name = "starter"
    providers = {
        "inference": [BuildProvider(provider_type=p.provider_type, module=p.module) for p in remote_inference_providers]

View file

@ -178,9 +178,9 @@ class ReferenceBatchesImpl(Batches):
# TODO: set expiration time for garbage collection # TODO: set expiration time for garbage collection
if endpoint not in ["/v1/chat/completions"]: if endpoint not in ["/v1/chat/completions", "/v1/completions"]:
raise ValueError( raise ValueError(
f"Invalid endpoint: {endpoint}. Supported values: /v1/chat/completions. Code: invalid_value. Param: endpoint", f"Invalid endpoint: {endpoint}. Supported values: /v1/chat/completions, /v1/completions. Code: invalid_value. Param: endpoint",
) )
if completion_window != "24h": if completion_window != "24h":
@ -424,13 +424,21 @@ class ReferenceBatchesImpl(Batches):
) )
valid = False valid = False
for param, expected_type, type_string in [ if batch.endpoint == "/v1/chat/completions":
("model", str, "a string"), required_params = [
# messages is specific to /v1/chat/completions ("model", str, "a string"),
# we could skip validating messages here and let inference fail. however, # messages is specific to /v1/chat/completions
# that would be a very expensive way to find out messages is wrong. # we could skip validating messages here and let inference fail. however,
("messages", list, "an array"), # TODO: allow messages to be a string? # that would be a very expensive way to find out messages is wrong.
]: ("messages", list, "an array"), # TODO: allow messages to be a string?
]
else: # /v1/completions
required_params = [
("model", str, "a string"),
("prompt", str, "a string"), # TODO: allow prompt to be a list of strings??
]
for param, expected_type, type_string in required_params:
if param not in body: if param not in body:
errors.append( errors.append(
BatchError( BatchError(
@ -591,20 +599,37 @@ class ReferenceBatchesImpl(Batches):
try: try:
# TODO(SECURITY): review body for security issues # TODO(SECURITY): review body for security issues
request.body["messages"] = [convert_to_openai_message_param(msg) for msg in request.body["messages"]] if request.url == "/v1/chat/completions":
chat_response = await self.inference_api.openai_chat_completion(**request.body) request.body["messages"] = [convert_to_openai_message_param(msg) for msg in request.body["messages"]]
chat_response = await self.inference_api.openai_chat_completion(**request.body)
# this is for mypy, we don't allow streaming so we'll get the right type # this is for mypy, we don't allow streaming so we'll get the right type
assert hasattr(chat_response, "model_dump_json"), "Chat response must have model_dump_json method" assert hasattr(chat_response, "model_dump_json"), "Chat response must have model_dump_json method"
return { return {
"id": request_id, "id": request_id,
"custom_id": request.custom_id, "custom_id": request.custom_id,
"response": { "response": {
"status_code": 200, "status_code": 200,
"request_id": request_id, # TODO: should this be different? "request_id": request_id, # TODO: should this be different?
"body": chat_response.model_dump_json(), "body": chat_response.model_dump_json(),
}, },
} }
else: # /v1/completions
completion_response = await self.inference_api.openai_completion(**request.body)
# this is for mypy, we don't allow streaming so we'll get the right type
assert hasattr(completion_response, "model_dump_json"), (
"Completion response must have model_dump_json method"
)
return {
"id": request_id,
"custom_id": request.custom_id,
"response": {
"status_code": 200,
"request_id": request_id,
"body": completion_response.model_dump_json(),
},
}
except Exception as e: except Exception as e:
logger.info(f"Error processing request {request.custom_id} in batch {batch_id}: {e}") logger.info(f"Error processing request {request.custom_id} in batch {batch_id}: {e}")
return { return {
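
With /v1/completions now accepted, a batch input line only needs model and prompt in its body to pass the validation above. A hypothetical entry, shown as the Python dict one .jsonl line decodes to; the custom_id and method fields and the file-upload flow follow the usual OpenAI batch input shape and are assumptions outside this hunk, as is the example model id.

import json

completion_line = {
    "custom_id": "req-1",          # caller-chosen id echoed back in the output file
    "method": "POST",
    "url": "/v1/completions",      # now supported alongside /v1/chat/completions
    "body": {
        "model": "llama3.2:3b",                   # must be a string (example model id)
        "prompt": "Write a haiku about autumn.",  # must be a string; list prompts are still a TODO
    },
}
print(json.dumps(completion_line))  # one JSON object per line in the uploaded batch input file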

View file

@ -14,6 +14,6 @@ from .config import RagToolRuntimeConfig
async def get_provider_impl(config: RagToolRuntimeConfig, deps: dict[Api, Any]): async def get_provider_impl(config: RagToolRuntimeConfig, deps: dict[Api, Any]):
from .memory import MemoryToolRuntimeImpl from .memory import MemoryToolRuntimeImpl
impl = MemoryToolRuntimeImpl(config, deps[Api.vector_io], deps[Api.inference]) impl = MemoryToolRuntimeImpl(config, deps[Api.vector_io], deps[Api.inference], deps[Api.files])
await impl.initialize() await impl.initialize()
return impl return impl

View file

@ -5,10 +5,15 @@
# the root directory of this source tree. # the root directory of this source tree.
import asyncio import asyncio
import base64
import io
import mimetypes
import secrets import secrets
import string import string
from typing import Any from typing import Any
import httpx
from fastapi import UploadFile
from pydantic import TypeAdapter from pydantic import TypeAdapter
from llama_stack.apis.common.content_types import ( from llama_stack.apis.common.content_types import (
@ -17,6 +22,7 @@ from llama_stack.apis.common.content_types import (
InterleavedContentItem, InterleavedContentItem,
TextContentItem, TextContentItem,
) )
from llama_stack.apis.files import Files, OpenAIFilePurpose
from llama_stack.apis.inference import Inference from llama_stack.apis.inference import Inference
from llama_stack.apis.tools import ( from llama_stack.apis.tools import (
ListToolDefsResponse, ListToolDefsResponse,
@ -30,13 +36,18 @@ from llama_stack.apis.tools import (
ToolParameter, ToolParameter,
ToolRuntime, ToolRuntime,
) )
from llama_stack.apis.vector_io import QueryChunksResponse, VectorIO from llama_stack.apis.vector_io import (
QueryChunksResponse,
VectorIO,
VectorStoreChunkingStrategyStatic,
VectorStoreChunkingStrategyStaticConfig,
)
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
from llama_stack.providers.utils.memory.vector_store import ( from llama_stack.providers.utils.memory.vector_store import (
content_from_doc, content_from_doc,
make_overlapped_chunks, parse_data_url,
) )
from .config import RagToolRuntimeConfig from .config import RagToolRuntimeConfig
@ -55,10 +66,12 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
config: RagToolRuntimeConfig, config: RagToolRuntimeConfig,
vector_io_api: VectorIO, vector_io_api: VectorIO,
inference_api: Inference, inference_api: Inference,
files_api: Files,
): ):
self.config = config self.config = config
self.vector_io_api = vector_io_api self.vector_io_api = vector_io_api
self.inference_api = inference_api self.inference_api = inference_api
self.files_api = files_api
async def initialize(self): async def initialize(self):
pass pass
@ -78,27 +91,50 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
vector_db_id: str, vector_db_id: str,
chunk_size_in_tokens: int = 512, chunk_size_in_tokens: int = 512,
) -> None: ) -> None:
chunks = [] if not documents:
return
for doc in documents: for doc in documents:
content = await content_from_doc(doc) if isinstance(doc.content, URL):
# TODO: we should add enrichment here as URLs won't be added to the metadata by default if doc.content.uri.startswith("data:"):
chunks.extend( parts = parse_data_url(doc.content.uri)
make_overlapped_chunks( file_data = base64.b64decode(parts["data"]) if parts["is_base64"] else parts["data"].encode()
doc.document_id, mime_type = parts["mimetype"]
content, else:
chunk_size_in_tokens, async with httpx.AsyncClient() as client:
chunk_size_in_tokens // 4, response = await client.get(doc.content.uri)
doc.metadata, file_data = response.content
mime_type = doc.mime_type or response.headers.get("content-type", "application/octet-stream")
else:
content_str = await content_from_doc(doc)
file_data = content_str.encode("utf-8")
mime_type = doc.mime_type or "text/plain"
file_extension = mimetypes.guess_extension(mime_type) or ".txt"
filename = doc.metadata.get("filename", f"{doc.document_id}{file_extension}")
file_obj = io.BytesIO(file_data)
file_obj.name = filename
upload_file = UploadFile(file=file_obj, filename=filename)
created_file = await self.files_api.openai_upload_file(
file=upload_file, purpose=OpenAIFilePurpose.ASSISTANTS
)
chunking_strategy = VectorStoreChunkingStrategyStatic(
static=VectorStoreChunkingStrategyStaticConfig(
max_chunk_size_tokens=chunk_size_in_tokens,
chunk_overlap_tokens=chunk_size_in_tokens // 4,
) )
) )
if not chunks: await self.vector_io_api.openai_attach_file_to_vector_store(
return vector_store_id=vector_db_id,
file_id=created_file.id,
await self.vector_io_api.insert_chunks( attributes=doc.metadata,
chunks=chunks, chunking_strategy=chunking_strategy,
vector_db_id=vector_db_id, )
)
async def query( async def query(
self, self,
@ -131,8 +167,18 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
for vector_db_id in vector_db_ids for vector_db_id in vector_db_ids
] ]
results: list[QueryChunksResponse] = await asyncio.gather(*tasks) results: list[QueryChunksResponse] = await asyncio.gather(*tasks)
chunks = [c for r in results for c in r.chunks]
scores = [s for r in results for s in r.scores] chunks = []
scores = []
for vector_db_id, result in zip(vector_db_ids, results, strict=False):
for chunk, score in zip(result.chunks, result.scores, strict=False):
if not hasattr(chunk, "metadata") or chunk.metadata is None:
chunk.metadata = {}
chunk.metadata["vector_db_id"] = vector_db_id
chunks.append(chunk)
scores.append(score)
if not chunks: if not chunks:
return RAGQueryResult(content=None) return RAGQueryResult(content=None)
@ -167,6 +213,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
metadata_keys_to_exclude_from_context = [ metadata_keys_to_exclude_from_context = [
"token_count", "token_count",
"metadata_token_count", "metadata_token_count",
"vector_db_id",
] ]
metadata_for_context = {} metadata_for_context = {}
for k in chunk_metadata_keys_to_include_from_context: for k in chunk_metadata_keys_to_include_from_context:
@ -191,6 +238,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
"document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]], "document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]],
"chunks": [c.content for c in chunks[: len(picked)]], "chunks": [c.content for c in chunks[: len(picked)]],
"scores": scores[: len(picked)], "scores": scores[: len(picked)],
"vector_db_ids": [c.metadata["vector_db_id"] for c in chunks[: len(picked)]],
}, },
) )
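
Taken together, ingestion now flows through the Files API instead of manual chunking: each document is uploaded, attached to the vector store with a static chunking strategy, and query results carry the originating vector_db_id per chunk. A rough caller-side sketch, assuming an initialized MemoryToolRuntimeImpl (rag_tool), an existing vector store id, and that RAGDocument exposes the attributes the implementation reads (document_id, content, mime_type, metadata); the query parameter names and the metadata field on the result are likewise assumed.

from llama_stack.apis.common.content_types import URL
from llama_stack.apis.tools import RAGDocument  # assumed export, matching the attributes used above

async def ingest_and_query(rag_tool, vector_store_id: str) -> None:
    docs = [
        RAGDocument(
            document_id="doc-1",
            content=URL(uri="https://example.com/handbook.txt"),  # fetched, then uploaded via the Files API
            mime_type="text/plain",
            metadata={"filename": "handbook.txt"},
        )
    ]
    await rag_tool.insert(documents=docs, vector_db_id=vector_store_id, chunk_size_in_tokens=512)

    result = await rag_tool.query(content="What is the vacation policy?", vector_db_ids=[vector_store_id])
    # Each picked chunk's source store is now surfaced alongside document_ids/chunks/scores.
    print(result.metadata.get("vector_db_ids") if result.metadata else None)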

View file

@ -30,11 +30,11 @@ from llama_stack.providers.utils.kvstore.api import KVStore
from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
from llama_stack.providers.utils.memory.vector_store import ( from llama_stack.providers.utils.memory.vector_store import (
RERANKER_TYPE_RRF, RERANKER_TYPE_RRF,
RERANKER_TYPE_WEIGHTED,
ChunkForDeletion, ChunkForDeletion,
EmbeddingIndex, EmbeddingIndex,
VectorDBWithIndex, VectorDBWithIndex,
) )
from llama_stack.providers.utils.vector_io.vector_utils import WeightedInMemoryAggregator
logger = get_logger(name=__name__, category="vector_io") logger = get_logger(name=__name__, category="vector_io")
@ -66,59 +66,6 @@ def _create_sqlite_connection(db_path):
return connection return connection
def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
"""Normalize scores to [0,1] range using min-max normalization."""
if not scores:
return {}
min_score = min(scores.values())
max_score = max(scores.values())
score_range = max_score - min_score
if score_range > 0:
return {doc_id: (score - min_score) / score_range for doc_id, score in scores.items()}
return dict.fromkeys(scores, 1.0)
def _weighted_rerank(
vector_scores: dict[str, float],
keyword_scores: dict[str, float],
alpha: float = 0.5,
) -> dict[str, float]:
"""ReRanker that uses weighted average of scores."""
all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
normalized_vector_scores = _normalize_scores(vector_scores)
normalized_keyword_scores = _normalize_scores(keyword_scores)
return {
doc_id: (alpha * normalized_keyword_scores.get(doc_id, 0.0))
+ ((1 - alpha) * normalized_vector_scores.get(doc_id, 0.0))
for doc_id in all_ids
}
def _rrf_rerank(
vector_scores: dict[str, float],
keyword_scores: dict[str, float],
impact_factor: float = 60.0,
) -> dict[str, float]:
"""ReRanker that uses Reciprocal Rank Fusion."""
# Convert scores to ranks
vector_ranks = {
doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(vector_scores.items(), key=lambda x: x[1], reverse=True))
}
keyword_ranks = {
doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True))
}
all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
rrf_scores = {}
for doc_id in all_ids:
vector_rank = vector_ranks.get(doc_id, float("inf"))
keyword_rank = keyword_ranks.get(doc_id, float("inf"))
# RRF formula: score = 1/(k + r) where k is impact_factor and r is the rank
rrf_scores[doc_id] = (1.0 / (impact_factor + vector_rank)) + (1.0 / (impact_factor + keyword_rank))
return rrf_scores
def _make_sql_identifier(name: str) -> str: def _make_sql_identifier(name: str) -> str:
return re.sub(r"[^a-zA-Z0-9_]", "_", name) return re.sub(r"[^a-zA-Z0-9_]", "_", name)
@ -398,14 +345,10 @@ class SQLiteVecIndex(EmbeddingIndex):
for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False) for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False)
} }
# Combine scores using the specified reranker # Combine scores using the reranking utility
if reranker_type == RERANKER_TYPE_WEIGHTED: combined_scores = WeightedInMemoryAggregator.combine_search_results(
alpha = reranker_params.get("alpha", 0.5) vector_scores, keyword_scores, reranker_type, reranker_params
combined_scores = _weighted_rerank(vector_scores, keyword_scores, alpha) )
else:
# Default to RRF for None, RRF, or any unknown types
impact_factor = reranker_params.get("impact_factor", 60.0)
combined_scores = _rrf_rerank(vector_scores, keyword_scores, impact_factor)
# Sort by combined score and get top k results # Sort by combined score and get top k results
sorted_items = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True) sorted_items = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
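
The removed _normalize_scores / _weighted_rerank / _rrf_rerank helpers are consolidated into WeightedInMemoryAggregator.combine_search_results, which takes both score maps plus the reranker type and its params. A toy invocation, assuming the utility returns a doc_id -> combined score mapping just like the helpers it replaces (which is how combined_scores is consumed above); the example scores are made up.

from llama_stack.providers.utils.memory.vector_store import RERANKER_TYPE_RRF
from llama_stack.providers.utils.vector_io.vector_utils import WeightedInMemoryAggregator

vector_scores = {"chunk-a": 0.82, "chunk-b": 0.75, "chunk-c": 0.40}   # semantic similarity scores
keyword_scores = {"chunk-b": 3.10, "chunk-d": 2.20}                   # full-text search scores

# Reciprocal Rank Fusion: each doc scores sum(1 / (impact_factor + rank)) over both result lists.
combined = WeightedInMemoryAggregator.combine_search_results(
    vector_scores, keyword_scores, RERANKER_TYPE_RRF, {"impact_factor": 60.0}
)
top = sorted(combined.items(), key=lambda item: item[1], reverse=True)[:3]
print(top)  # chunk-b should lead, since it appears in both result lists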

View file

@ -13,7 +13,7 @@ def available_providers() -> list[ProviderSpec]:
InlineProviderSpec( InlineProviderSpec(
api=Api.batches, api=Api.batches,
provider_type="inline::reference", provider_type="inline::reference",
pip_packages=["openai"], pip_packages=[],
module="llama_stack.providers.inline.batches.reference", module="llama_stack.providers.inline.batches.reference",
config_class="llama_stack.providers.inline.batches.reference.config.ReferenceBatchesImplConfig", config_class="llama_stack.providers.inline.batches.reference.config.ReferenceBatchesImplConfig",
api_dependencies=[ api_dependencies=[

View file

@ -30,7 +30,7 @@ def available_providers() -> list[ProviderSpec]:
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="huggingface", adapter_type="huggingface",
pip_packages=[ pip_packages=[
"datasets", "datasets>=4.0.0",
], ],
module="llama_stack.providers.remote.datasetio.huggingface", module="llama_stack.providers.remote.datasetio.huggingface",
config_class="llama_stack.providers.remote.datasetio.huggingface.HuggingfaceDatasetIOConfig", config_class="llama_stack.providers.remote.datasetio.huggingface.HuggingfaceDatasetIOConfig",
@ -42,7 +42,7 @@ def available_providers() -> list[ProviderSpec]:
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="nvidia", adapter_type="nvidia",
pip_packages=[ pip_packages=[
"datasets", "datasets>=4.0.0",
], ],
module="llama_stack.providers.remote.datasetio.nvidia", module="llama_stack.providers.remote.datasetio.nvidia",
config_class="llama_stack.providers.remote.datasetio.nvidia.NvidiaDatasetIOConfig", config_class="llama_stack.providers.remote.datasetio.nvidia.NvidiaDatasetIOConfig",

View file

@ -75,7 +75,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="vllm", adapter_type="vllm",
pip_packages=["openai"], pip_packages=[],
module="llama_stack.providers.remote.inference.vllm", module="llama_stack.providers.remote.inference.vllm",
config_class="llama_stack.providers.remote.inference.vllm.VLLMInferenceAdapterConfig", config_class="llama_stack.providers.remote.inference.vllm.VLLMInferenceAdapterConfig",
description="Remote vLLM inference provider for connecting to vLLM servers.", description="Remote vLLM inference provider for connecting to vLLM servers.",
@ -116,7 +116,7 @@ def available_providers() -> list[ProviderSpec]:
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="fireworks", adapter_type="fireworks",
pip_packages=[ pip_packages=[
"fireworks-ai<=0.18.0", "fireworks-ai<=0.17.16",
], ],
module="llama_stack.providers.remote.inference.fireworks", module="llama_stack.providers.remote.inference.fireworks",
config_class="llama_stack.providers.remote.inference.fireworks.FireworksImplConfig", config_class="llama_stack.providers.remote.inference.fireworks.FireworksImplConfig",
@ -151,9 +151,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="databricks", adapter_type="databricks",
pip_packages=[ pip_packages=[],
"openai",
],
module="llama_stack.providers.remote.inference.databricks", module="llama_stack.providers.remote.inference.databricks",
config_class="llama_stack.providers.remote.inference.databricks.DatabricksImplConfig", config_class="llama_stack.providers.remote.inference.databricks.DatabricksImplConfig",
description="Databricks inference provider for running models on Databricks' unified analytics platform.", description="Databricks inference provider for running models on Databricks' unified analytics platform.",
@ -163,9 +161,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="nvidia", adapter_type="nvidia",
pip_packages=[ pip_packages=[],
"openai",
],
module="llama_stack.providers.remote.inference.nvidia", module="llama_stack.providers.remote.inference.nvidia",
config_class="llama_stack.providers.remote.inference.nvidia.NVIDIAConfig", config_class="llama_stack.providers.remote.inference.nvidia.NVIDIAConfig",
description="NVIDIA inference provider for accessing NVIDIA NIM models and AI services.", description="NVIDIA inference provider for accessing NVIDIA NIM models and AI services.",
@ -175,7 +171,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="runpod", adapter_type="runpod",
pip_packages=["openai"], pip_packages=[],
module="llama_stack.providers.remote.inference.runpod", module="llama_stack.providers.remote.inference.runpod",
config_class="llama_stack.providers.remote.inference.runpod.RunpodImplConfig", config_class="llama_stack.providers.remote.inference.runpod.RunpodImplConfig",
description="RunPod inference provider for running models on RunPod's cloud GPU platform.", description="RunPod inference provider for running models on RunPod's cloud GPU platform.",
@ -292,7 +288,7 @@ Available Models:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="watsonx", adapter_type="watsonx",
pip_packages=["ibm_watson_machine_learning"], pip_packages=["ibm_watsonx_ai"],
module="llama_stack.providers.remote.inference.watsonx", module="llama_stack.providers.remote.inference.watsonx",
config_class="llama_stack.providers.remote.inference.watsonx.WatsonXConfig", config_class="llama_stack.providers.remote.inference.watsonx.WatsonXConfig",
provider_data_validator="llama_stack.providers.remote.inference.watsonx.WatsonXProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.watsonx.WatsonXProviderDataValidator",

View file

@ -48,7 +48,7 @@ def available_providers() -> list[ProviderSpec]:
InlineProviderSpec( InlineProviderSpec(
api=Api.post_training, api=Api.post_training,
provider_type="inline::huggingface-gpu", provider_type="inline::huggingface-gpu",
pip_packages=["trl", "transformers", "peft", "datasets", "torch"], pip_packages=["trl", "transformers", "peft", "datasets>=4.0.0", "torch"],
module="llama_stack.providers.inline.post_training.huggingface", module="llama_stack.providers.inline.post_training.huggingface",
config_class="llama_stack.providers.inline.post_training.huggingface.HuggingFacePostTrainingConfig", config_class="llama_stack.providers.inline.post_training.huggingface.HuggingFacePostTrainingConfig",
api_dependencies=[ api_dependencies=[

View file

@ -38,7 +38,7 @@ def available_providers() -> list[ProviderSpec]:
InlineProviderSpec( InlineProviderSpec(
api=Api.scoring, api=Api.scoring,
provider_type="inline::braintrust", provider_type="inline::braintrust",
pip_packages=["autoevals", "openai"], pip_packages=["autoevals"],
module="llama_stack.providers.inline.scoring.braintrust", module="llama_stack.providers.inline.scoring.braintrust",
config_class="llama_stack.providers.inline.scoring.braintrust.BraintrustScoringConfig", config_class="llama_stack.providers.inline.scoring.braintrust.BraintrustScoringConfig",
api_dependencies=[ api_dependencies=[

View file

@ -32,7 +32,7 @@ def available_providers() -> list[ProviderSpec]:
], ],
module="llama_stack.providers.inline.tool_runtime.rag", module="llama_stack.providers.inline.tool_runtime.rag",
config_class="llama_stack.providers.inline.tool_runtime.rag.config.RagToolRuntimeConfig", config_class="llama_stack.providers.inline.tool_runtime.rag.config.RagToolRuntimeConfig",
api_dependencies=[Api.vector_io, Api.inference], api_dependencies=[Api.vector_io, Api.inference, Api.files],
description="RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search.", description="RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search.",
), ),
remote_provider_spec( remote_provider_spec(

View file

@ -5,12 +5,13 @@
# the root directory of this source tree. # the root directory of this source tree.
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import AnthropicConfig from .config import AnthropicConfig
from .models import MODEL_ENTRIES from .models import MODEL_ENTRIES
class AnthropicInferenceAdapter(LiteLLMOpenAIMixin): class AnthropicInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
def __init__(self, config: AnthropicConfig) -> None: def __init__(self, config: AnthropicConfig) -> None:
LiteLLMOpenAIMixin.__init__( LiteLLMOpenAIMixin.__init__(
self, self,
@ -26,3 +27,8 @@ class AnthropicInferenceAdapter(LiteLLMOpenAIMixin):
async def shutdown(self) -> None: async def shutdown(self) -> None:
await super().shutdown() await super().shutdown()
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self):
return "https://api.anthropic.com/v1"

View file

@ -5,12 +5,13 @@
# the root directory of this source tree. # the root directory of this source tree.
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import GeminiConfig from .config import GeminiConfig
from .models import MODEL_ENTRIES from .models import MODEL_ENTRIES
class GeminiInferenceAdapter(LiteLLMOpenAIMixin): class GeminiInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
def __init__(self, config: GeminiConfig) -> None: def __init__(self, config: GeminiConfig) -> None:
LiteLLMOpenAIMixin.__init__( LiteLLMOpenAIMixin.__init__(
self, self,
@ -21,6 +22,11 @@ class GeminiInferenceAdapter(LiteLLMOpenAIMixin):
) )
self.config = config self.config = config
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self):
return "https://generativelanguage.googleapis.com/v1beta/openai/"
async def initialize(self) -> None: async def initialize(self) -> None:
await super().initialize() await super().initialize()

View file

@ -4,30 +4,15 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from collections.abc import AsyncIterator
from typing import Any
from openai import AsyncOpenAI
from llama_stack.apis.inference import (
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAIChoiceDelta,
OpenAIChunkChoice,
OpenAIMessageParam,
OpenAIResponseFormatParam,
OpenAISystemMessageParam,
)
from llama_stack.providers.remote.inference.groq.config import GroqConfig from llama_stack.providers.remote.inference.groq.config import GroqConfig
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_compat import ( from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
prepare_openai_completion_params,
)
from .models import MODEL_ENTRIES from .models import MODEL_ENTRIES
class GroqInferenceAdapter(LiteLLMOpenAIMixin): class GroqInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
_config: GroqConfig _config: GroqConfig
def __init__(self, config: GroqConfig): def __init__(self, config: GroqConfig):
@ -40,122 +25,14 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
) )
self.config = config self.config = config
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self) -> str:
return f"{self.config.url}/openai/v1"
async def initialize(self): async def initialize(self):
await super().initialize() await super().initialize()
async def shutdown(self): async def shutdown(self):
await super().shutdown() await super().shutdown()
def _get_openai_client(self) -> AsyncOpenAI:
return AsyncOpenAI(
base_url=f"{self.config.url}/openai/v1",
api_key=self.get_api_key(),
)
async def openai_chat_completion(
self,
model: str,
messages: list[OpenAIMessageParam],
frequency_penalty: float | None = None,
function_call: str | dict[str, Any] | None = None,
functions: list[dict[str, Any]] | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_completion_tokens: int | None = None,
max_tokens: int | None = None,
n: int | None = None,
parallel_tool_calls: bool | None = None,
presence_penalty: float | None = None,
response_format: OpenAIResponseFormatParam | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
tool_choice: str | dict[str, Any] | None = None,
tools: list[dict[str, Any]] | None = None,
top_logprobs: int | None = None,
top_p: float | None = None,
user: str | None = None,
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
model_obj = await self.model_store.get_model(model)
# Groq does not support json_schema response format, so we need to convert it to json_object
if response_format and response_format.type == "json_schema":
response_format.type = "json_object"
schema = response_format.json_schema.get("schema", {})
response_format.json_schema = None
json_instructions = f"\nYour response should be a JSON object that matches the following schema: {schema}"
if messages and messages[0].role == "system":
messages[0].content = messages[0].content + json_instructions
else:
messages.insert(0, OpenAISystemMessageParam(content=json_instructions))
# Groq returns a 400 error if tools are provided but none are called
# So, set tool_choice to "required" to attempt to force a call
if tools and (not tool_choice or tool_choice == "auto"):
tool_choice = "required"
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
)
# Groq does not support streaming requests that set response_format
fake_stream = False
if stream and response_format:
params["stream"] = False
fake_stream = True
response = await self._get_openai_client().chat.completions.create(**params)
if fake_stream:
chunk_choices = []
for choice in response.choices:
delta = OpenAIChoiceDelta(
content=choice.message.content,
role=choice.message.role,
tool_calls=choice.message.tool_calls,
)
chunk_choice = OpenAIChunkChoice(
delta=delta,
finish_reason=choice.finish_reason,
index=choice.index,
logprobs=None,
)
chunk_choices.append(chunk_choice)
chunk = OpenAIChatCompletionChunk(
id=response.id,
choices=chunk_choices,
object="chat.completion.chunk",
created=response.created,
model=response.model,
)
async def _fake_stream_generator():
yield chunk
return _fake_stream_generator()
else:
return response

View file

@ -118,10 +118,10 @@ class OllamaInferenceAdapter(
async def initialize(self) -> None: async def initialize(self) -> None:
logger.info(f"checking connectivity to Ollama at `{self.config.url}`...") logger.info(f"checking connectivity to Ollama at `{self.config.url}`...")
health_response = await self.health() r = await self.health()
if health_response["status"] == HealthStatus.ERROR: if r["status"] == HealthStatus.ERROR:
logger.warning( logger.warning(
"Ollama Server is not running, make sure to start it using `ollama serve` in a separate terminal" f"Ollama Server is not running (message: {r['message']}). Make sure to start it using `ollama serve` in a separate terminal"
) )
async def should_refresh_models(self) -> bool: async def should_refresh_models(self) -> bool:
@ -156,7 +156,7 @@ class OllamaInferenceAdapter(
), ),
Model( Model(
identifier="nomic-embed-text", identifier="nomic-embed-text",
provider_resource_id="nomic-embed-text", provider_resource_id="nomic-embed-text:latest",
provider_id=provider_id, provider_id=provider_id,
metadata={ metadata={
"embedding_dimension": 768, "embedding_dimension": 768,

View file

@ -4,13 +4,26 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import SambaNovaImplConfig from .config import SambaNovaImplConfig
from .models import MODEL_ENTRIES from .models import MODEL_ENTRIES
class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin): class SambaNovaInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
"""
SambaNova Inference Adapter for Llama Stack.
Note: The inheritance order is important here. OpenAIMixin must come before
LiteLLMOpenAIMixin to ensure that OpenAIMixin.check_model_availability()
is used instead of LiteLLMOpenAIMixin.check_model_availability().
- OpenAIMixin.check_model_availability() queries the /v1/models to check if a model exists
- LiteLLMOpenAIMixin.check_model_availability() checks the static registry within LiteLLM
"""
def __init__(self, config: SambaNovaImplConfig): def __init__(self, config: SambaNovaImplConfig):
self.config = config self.config = config
self.environment_available_models = [] self.environment_available_models = []
@ -24,3 +37,14 @@ class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
download_images=True, # SambaNova requires base64 image encoding download_images=True, # SambaNova requires base64 image encoding
json_schema_strict=False, # SambaNova doesn't support strict=True yet json_schema_strict=False, # SambaNova doesn't support strict=True yet
) )
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self) -> str:
"""
Get the base URL for OpenAI mixin.
:return: The SambaNova base URL
"""
return self.config.url
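
The inheritance-order note above boils down to Python's method resolution order: the first listed base class wins when both define the same method. A minimal sketch with hypothetical mixins (not the real Llama Stack classes) illustrating the behavior:

    class RemoteCatalogMixin:
        async def check_model_availability(self, model: str) -> bool:
            # stands in for querying a live /v1/models endpoint
            return model in {"hosted-model-a", "hosted-model-b"}

    class StaticRegistryMixin:
        async def check_model_availability(self, model: str) -> bool:
            # stands in for consulting a static, bundled registry
            return model == "hosted-model-a"

    class Adapter(RemoteCatalogMixin, StaticRegistryMixin):
        pass

    # Adapter.__mro__ lists RemoteCatalogMixin first, so its method is the one resolved:
    assert Adapter.check_model_availability is RemoteCatalogMixin.check_model_availability
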

View file

@ -6,16 +6,20 @@
from typing import Any from typing import Any
import google.auth.transport.requests
from google.auth import default
from llama_stack.apis.inference import ChatCompletionRequest from llama_stack.apis.inference import ChatCompletionRequest
from llama_stack.providers.utils.inference.litellm_openai_mixin import ( from llama_stack.providers.utils.inference.litellm_openai_mixin import (
LiteLLMOpenAIMixin, LiteLLMOpenAIMixin,
) )
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import VertexAIConfig from .config import VertexAIConfig
from .models import MODEL_ENTRIES from .models import MODEL_ENTRIES
class VertexAIInferenceAdapter(LiteLLMOpenAIMixin): class VertexAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
def __init__(self, config: VertexAIConfig) -> None: def __init__(self, config: VertexAIConfig) -> None:
LiteLLMOpenAIMixin.__init__( LiteLLMOpenAIMixin.__init__(
self, self,
@ -27,9 +31,30 @@ class VertexAIInferenceAdapter(LiteLLMOpenAIMixin):
self.config = config self.config = config
def get_api_key(self) -> str: def get_api_key(self) -> str:
# Vertex AI doesn't use API keys, it uses Application Default Credentials """
# Return empty string to let litellm handle authentication via ADC Get an access token for Vertex AI using Application Default Credentials.
return ""
Vertex AI uses ADC instead of API keys. This method obtains an access token
from the default credentials and returns it for use with the OpenAI-compatible client.
"""
try:
# Get default credentials - will read from GOOGLE_APPLICATION_CREDENTIALS
credentials, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
credentials.refresh(google.auth.transport.requests.Request())
return str(credentials.token)
except Exception:
# If we can't get credentials, return empty string to let LiteLLM handle it
# This allows the LiteLLM mixin to work with ADC directly
return ""
def get_base_url(self) -> str:
"""
Get the Vertex AI OpenAI-compatible API base URL.
Returns the Vertex AI OpenAI-compatible endpoint URL.
Source: https://cloud.google.com/vertex-ai/generative-ai/docs/start/openai
"""
return f"https://{self.config.location}-aiplatform.googleapis.com/v1/projects/{self.config.project}/locations/{self.config.location}/endpoints/openapi"
async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]: async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]:
# Get base parameters from parent # Get base parameters from parent

View file

@ -7,8 +7,8 @@
from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import AsyncGenerator, AsyncIterator
from typing import Any from typing import Any
from ibm_watson_machine_learning.foundation_models import Model from ibm_watsonx_ai.foundation_models import Model
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from openai import AsyncOpenAI from openai import AsyncOpenAI
from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem

View file

@ -4,53 +4,55 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import os
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
class BedrockBaseConfig(BaseModel): class BedrockBaseConfig(BaseModel):
aws_access_key_id: str | None = Field( aws_access_key_id: str | None = Field(
default=None, default_factory=lambda: os.getenv("AWS_ACCESS_KEY_ID"),
description="The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID", description="The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID",
) )
aws_secret_access_key: str | None = Field( aws_secret_access_key: str | None = Field(
default=None, default_factory=lambda: os.getenv("AWS_SECRET_ACCESS_KEY"),
description="The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY", description="The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY",
) )
aws_session_token: str | None = Field( aws_session_token: str | None = Field(
default=None, default_factory=lambda: os.getenv("AWS_SESSION_TOKEN"),
description="The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN", description="The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN",
) )
region_name: str | None = Field( region_name: str | None = Field(
default=None, default_factory=lambda: os.getenv("AWS_DEFAULT_REGION"),
description="The default AWS Region to use, for example, us-west-1 or us-west-2." description="The default AWS Region to use, for example, us-west-1 or us-west-2."
"Default use environment variable: AWS_DEFAULT_REGION", "Default use environment variable: AWS_DEFAULT_REGION",
) )
profile_name: str | None = Field( profile_name: str | None = Field(
default=None, default_factory=lambda: os.getenv("AWS_PROFILE"),
description="The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE", description="The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE",
) )
total_max_attempts: int | None = Field( total_max_attempts: int | None = Field(
default=None, default_factory=lambda: int(val) if (val := os.getenv("AWS_MAX_ATTEMPTS")) else None,
description="An integer representing the maximum number of attempts that will be made for a single request, " description="An integer representing the maximum number of attempts that will be made for a single request, "
"including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS", "including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS",
) )
retry_mode: str | None = Field( retry_mode: str | None = Field(
default=None, default_factory=lambda: os.getenv("AWS_RETRY_MODE"),
description="A string representing the type of retries Boto3 will perform." description="A string representing the type of retries Boto3 will perform."
"Default use environment variable: AWS_RETRY_MODE", "Default use environment variable: AWS_RETRY_MODE",
) )
connect_timeout: float | None = Field( connect_timeout: float | None = Field(
default=60, default_factory=lambda: float(os.getenv("AWS_CONNECT_TIMEOUT", "60")),
description="The time in seconds till a timeout exception is thrown when attempting to make a connection. " description="The time in seconds till a timeout exception is thrown when attempting to make a connection. "
"The default is 60 seconds.", "The default is 60 seconds.",
) )
read_timeout: float | None = Field( read_timeout: float | None = Field(
default=60, default_factory=lambda: float(os.getenv("AWS_READ_TIMEOUT", "60")),
description="The time in seconds till a timeout exception is thrown when attempting to read from a connection." description="The time in seconds till a timeout exception is thrown when attempting to read from a connection."
"The default is 60 seconds.", "The default is 60 seconds.",
) )
session_ttl: int | None = Field( session_ttl: int | None = Field(
default=3600, default_factory=lambda: int(os.getenv("AWS_SESSION_TTL", "3600")),
description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).", description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).",
) )
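
The switch from static defaults to default_factory means the environment is consulted when the config object is instantiated rather than at import time, which keeps tests free to monkeypatch os.environ first. A minimal sketch of the same pattern (field names chosen to mirror the class above, values illustrative):

    import os
    from pydantic import BaseModel, Field

    class EnvBackedConfig(BaseModel):
        region_name: str | None = Field(
            default_factory=lambda: os.getenv("AWS_DEFAULT_REGION"),
        )
        connect_timeout: float | None = Field(
            default_factory=lambda: float(os.getenv("AWS_CONNECT_TIMEOUT", "60")),
        )

    os.environ["AWS_DEFAULT_REGION"] = "us-west-2"
    cfg = EnvBackedConfig()
    assert cfg.region_name == "us-west-2" and cfg.connect_timeout == 60.0
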

View file

@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import asyncio
import base64 import base64
import struct import struct
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
@ -43,9 +44,11 @@ class SentenceTransformerEmbeddingMixin:
task_type: EmbeddingTaskType | None = None, task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse: ) -> EmbeddingsResponse:
model = await self.model_store.get_model(model_id) model = await self.model_store.get_model(model_id)
embedding_model = self._load_sentence_transformer_model(model.provider_resource_id) embedding_model = await self._load_sentence_transformer_model(model.provider_resource_id)
embeddings = embedding_model.encode( embeddings = await asyncio.to_thread(
[interleaved_content_as_str(content) for content in contents], show_progress_bar=False embedding_model.encode,
[interleaved_content_as_str(content) for content in contents],
show_progress_bar=False,
) )
return EmbeddingsResponse(embeddings=embeddings) return EmbeddingsResponse(embeddings=embeddings)
@ -64,8 +67,8 @@ class SentenceTransformerEmbeddingMixin:
# Get the model and generate embeddings # Get the model and generate embeddings
model_obj = await self.model_store.get_model(model) model_obj = await self.model_store.get_model(model)
embedding_model = self._load_sentence_transformer_model(model_obj.provider_resource_id) embedding_model = await self._load_sentence_transformer_model(model_obj.provider_resource_id)
embeddings = embedding_model.encode(input_list, show_progress_bar=False) embeddings = await asyncio.to_thread(embedding_model.encode, input_list, show_progress_bar=False)
# Convert embeddings to the requested format # Convert embeddings to the requested format
data = [] data = []
@ -93,7 +96,7 @@ class SentenceTransformerEmbeddingMixin:
usage=usage, usage=usage,
) )
def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer": async def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer":
global EMBEDDING_MODELS global EMBEDDING_MODELS
loaded_model = EMBEDDING_MODELS.get(model) loaded_model = EMBEDDING_MODELS.get(model)
@ -101,8 +104,12 @@ class SentenceTransformerEmbeddingMixin:
return loaded_model return loaded_model
log.info(f"Loading sentence transformer for {model}...") log.info(f"Loading sentence transformer for {model}...")
from sentence_transformers import SentenceTransformer
loaded_model = SentenceTransformer(model) def _load_model():
from sentence_transformers import SentenceTransformer
return SentenceTransformer(model)
loaded_model = await asyncio.to_thread(_load_model)
EMBEDDING_MODELS[model] = loaded_model EMBEDDING_MODELS[model] = loaded_model
return loaded_model return loaded_model
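
The change above moves both model loading and encoding off the event loop with asyncio.to_thread. A self-contained sketch of that pattern, using time.sleep as a stand-in for the blocking SentenceTransformer work:

    import asyncio
    import time

    def blocking_encode(texts: list[str]) -> list[int]:
        time.sleep(0.1)  # stands in for CPU-heavy encode()
        return [len(t) for t in texts]

    async def main() -> None:
        # Runs in a worker thread; the event loop stays free for other requests.
        embeddings = await asyncio.to_thread(blocking_encode, ["hello", "world"])
        print(embeddings)

    asyncio.run(main())
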

View file

@ -3,6 +3,11 @@
# #
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import asyncio
from typing import Any
from sqlalchemy.exc import IntegrityError
from llama_stack.apis.inference import ( from llama_stack.apis.inference import (
ListOpenAIChatCompletionResponse, ListOpenAIChatCompletionResponse,
OpenAIChatCompletion, OpenAIChatCompletion,
@ -10,24 +15,43 @@ from llama_stack.apis.inference import (
OpenAIMessageParam, OpenAIMessageParam,
Order, Order,
) )
from llama_stack.core.datatypes import AccessRule from llama_stack.core.datatypes import AccessRule, InferenceStoreConfig
from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR from llama_stack.log import get_logger
from ..sqlstore.api import ColumnDefinition, ColumnType from ..sqlstore.api import ColumnDefinition, ColumnType
from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl from ..sqlstore.sqlstore import SqlStoreConfig, SqlStoreType, sqlstore_impl
logger = get_logger(name=__name__, category="inference_store")
class InferenceStore: class InferenceStore:
def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]): def __init__(
if not sql_store_config: self,
sql_store_config = SqliteSqlStoreConfig( config: InferenceStoreConfig | SqlStoreConfig,
db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(), policy: list[AccessRule],
):
# Handle backward compatibility
if not isinstance(config, InferenceStoreConfig):
# Legacy: SqlStoreConfig passed directly as config
config = InferenceStoreConfig(
sql_store_config=config,
) )
self.sql_store_config = sql_store_config
self.config = config
self.sql_store_config = config.sql_store_config
self.sql_store = None self.sql_store = None
self.policy = policy self.policy = policy
# Disable write queue for SQLite to avoid concurrency issues
self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite
# Async write queue and worker control
self._queue: asyncio.Queue[tuple[OpenAIChatCompletion, list[OpenAIMessageParam]]] | None = None
self._worker_tasks: list[asyncio.Task[Any]] = []
self._max_write_queue_size: int = config.max_write_queue_size
self._num_writers: int = max(1, config.num_writers)
async def initialize(self): async def initialize(self):
"""Create the necessary tables if they don't exist.""" """Create the necessary tables if they don't exist."""
self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config)) self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config))
@ -42,23 +66,109 @@ class InferenceStore:
}, },
) )
if self.enable_write_queue:
self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
for _ in range(self._num_writers):
self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
else:
logger.info("Write queue disabled for SQLite to avoid concurrency issues")
async def shutdown(self) -> None:
if not self._worker_tasks:
return
if self._queue is not None:
await self._queue.join()
for t in self._worker_tasks:
if not t.done():
t.cancel()
for t in self._worker_tasks:
try:
await t
except asyncio.CancelledError:
pass
self._worker_tasks.clear()
async def flush(self) -> None:
"""Wait for all queued writes to complete. Useful for testing."""
if self.enable_write_queue and self._queue is not None:
await self._queue.join()
async def store_chat_completion( async def store_chat_completion(
self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam] self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam]
) -> None: ) -> None:
if not self.sql_store: if self.enable_write_queue:
if self._queue is None:
raise ValueError("Inference store is not initialized")
try:
self._queue.put_nowait((chat_completion, input_messages))
except asyncio.QueueFull:
logger.warning(
f"Write queue full; adding chat completion id={getattr(chat_completion, 'id', '<unknown>')}"
)
await self._queue.put((chat_completion, input_messages))
else:
await self._write_chat_completion(chat_completion, input_messages)
async def _worker_loop(self) -> None:
assert self._queue is not None
while True:
try:
item = await self._queue.get()
except asyncio.CancelledError:
break
chat_completion, input_messages = item
try:
await self._write_chat_completion(chat_completion, input_messages)
except Exception as e: # noqa: BLE001
logger.error(f"Error writing chat completion: {e}")
finally:
self._queue.task_done()
async def _write_chat_completion(
self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam]
) -> None:
if self.sql_store is None:
raise ValueError("Inference store is not initialized") raise ValueError("Inference store is not initialized")
data = chat_completion.model_dump() data = chat_completion.model_dump()
record_data = {
"id": data["id"],
"created": data["created"],
"model": data["model"],
"choices": data["choices"],
"input_messages": [message.model_dump() for message in input_messages],
}
await self.sql_store.insert( try:
table="chat_completions", await self.sql_store.insert(
data={ table="chat_completions",
"id": data["id"], data=record_data,
"created": data["created"], )
"model": data["model"], except IntegrityError as e:
"choices": data["choices"], # Duplicate chat completion IDs can be generated during tests especially if they are replaying
"input_messages": [message.model_dump() for message in input_messages], # recorded responses across different tests. No need to warn or error under those circumstances.
}, # In the wild, this is not likely to happen at all (no evidence) so we aren't really hiding any problem.
# Check if it's a unique constraint violation
error_message = str(e.orig) if e.orig else str(e)
if self._is_unique_constraint_error(error_message):
# Update the existing record instead
await self.sql_store.update(table="chat_completions", data=record_data, where={"id": data["id"]})
else:
# Re-raise if it's not a unique constraint error
raise
def _is_unique_constraint_error(self, error_message: str) -> bool:
"""Check if the error is specifically a unique constraint violation."""
error_lower = error_message.lower()
return any(
indicator in error_lower
for indicator in [
"unique constraint failed", # SQLite
"duplicate key", # PostgreSQL
"unique violation", # PostgreSQL alternative
"duplicate entry", # MySQL
]
) )
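
A self-contained sketch of the enqueue/worker/join pattern InferenceStore now relies on; the queue size, worker count, and item type below are illustrative, and the list append stands in for the SQL insert:

    import asyncio

    async def demo() -> None:
        queue: asyncio.Queue[str] = asyncio.Queue(maxsize=100)
        written: list[str] = []

        async def worker() -> None:
            while True:
                item = await queue.get()
                try:
                    written.append(item)   # stands in for _write_chat_completion()
                finally:
                    queue.task_done()      # lets join() account for the item

        workers = [asyncio.create_task(worker()) for _ in range(2)]
        for i in range(5):
            queue.put_nowait(f"chat-completion-{i}")
        await queue.join()                 # same idea as InferenceStore.flush()
        for task in workers:
            task.cancel()
        await asyncio.gather(*workers, return_exceptions=True)
        assert len(written) == 5

    asyncio.run(demo())
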
async def list_chat_completions( async def list_chat_completions(

View file

@ -172,6 +172,20 @@ class AuthorizedSqlStore:
return results.data[0] if results.data else None return results.data[0] if results.data else None
async def update(self, table: str, data: Mapping[str, Any], where: Mapping[str, Any]) -> None:
"""Update rows with automatic access control attribute capture."""
enhanced_data = dict(data)
current_user = get_authenticated_user()
if current_user:
enhanced_data["owner_principal"] = current_user.principal
enhanced_data["access_attributes"] = current_user.attributes
else:
enhanced_data["owner_principal"] = None
enhanced_data["access_attributes"] = None
await self.sql_store.update(table, enhanced_data, where)
async def delete(self, table: str, where: Mapping[str, Any]) -> None: async def delete(self, table: str, where: Mapping[str, Any]) -> None:
"""Delete rows with automatic access control filtering.""" """Delete rows with automatic access control filtering."""
await self.sql_store.delete(table, where) await self.sql_store.delete(table, where)

View file

@ -18,6 +18,7 @@ from functools import wraps
from typing import Any from typing import Any
from llama_stack.apis.telemetry import ( from llama_stack.apis.telemetry import (
Event,
LogSeverity, LogSeverity,
Span, Span,
SpanEndPayload, SpanEndPayload,
@ -98,7 +99,7 @@ class BackgroundLogger:
def __init__(self, api: Telemetry, capacity: int = 100000): def __init__(self, api: Telemetry, capacity: int = 100000):
self.api = api self.api = api
self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity) self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity)
self.worker_thread = threading.Thread(target=self._process_logs, daemon=True) self.worker_thread = threading.Thread(target=self._worker, daemon=True)
self.worker_thread.start() self.worker_thread.start()
self._last_queue_full_log_time: float = 0.0 self._last_queue_full_log_time: float = 0.0
self._dropped_since_last_notice: int = 0 self._dropped_since_last_notice: int = 0
@ -118,12 +119,16 @@ class BackgroundLogger:
self._last_queue_full_log_time = current_time self._last_queue_full_log_time = current_time
self._dropped_since_last_notice = 0 self._dropped_since_last_notice = 0
def _process_logs(self): def _worker(self):
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(self._process_logs())
async def _process_logs(self):
while True: while True:
try: try:
event = self.log_queue.get() event = self.log_queue.get()
# figure out how to use a thread's native loop await self.api.log_event(event)
asyncio.run(self.api.log_event(event))
except Exception: except Exception:
import traceback import traceback
@ -136,6 +141,19 @@ class BackgroundLogger:
self.log_queue.join() self.log_queue.join()
def enqueue_event(event: Event) -> None:
"""Enqueue a telemetry event to the background logger if available.
This provides a non-blocking path for routers and other hot paths to
submit telemetry without awaiting the Telemetry API, reducing contention
with the main event loop.
"""
global BACKGROUND_LOGGER
if BACKGROUND_LOGGER is None:
raise RuntimeError("Telemetry API not initialized")
BACKGROUND_LOGGER.log_event(event)
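
The worker above gives the background thread its own event loop so callers never await the telemetry sink. A minimal sketch of that thread-plus-loop arrangement, with a print standing in for Telemetry.log_event:

    import asyncio
    import queue
    import threading

    log_queue: "queue.Queue[str]" = queue.Queue(maxsize=1000)

    async def sink(event: str) -> None:
        await asyncio.sleep(0)             # stands in for api.log_event(event)
        print("logged:", event)

    def _worker() -> None:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        async def drain() -> None:
            while True:
                event = log_queue.get()    # blocking get is fine on this dedicated thread
                await sink(event)
                log_queue.task_done()

        loop.run_until_complete(drain())

    threading.Thread(target=_worker, daemon=True).start()
    log_queue.put_nowait("span-started")   # non-blocking for the caller
    log_queue.join()                       # wait until the event has been processed
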
class TraceContext: class TraceContext:
spans: list[Span] = [] spans: list[Span] = []
@ -256,11 +274,7 @@ class TelemetryHandler(logging.Handler):
if record.module in ("asyncio", "selector_events"): if record.module in ("asyncio", "selector_events"):
return return
global CURRENT_TRACE_CONTEXT, BACKGROUND_LOGGER global CURRENT_TRACE_CONTEXT
if BACKGROUND_LOGGER is None:
raise RuntimeError("Telemetry API not initialized")
context = CURRENT_TRACE_CONTEXT.get() context = CURRENT_TRACE_CONTEXT.get()
if context is None: if context is None:
return return
@ -269,7 +283,7 @@ class TelemetryHandler(logging.Handler):
if span is None: if span is None:
return return
BACKGROUND_LOGGER.log_event( enqueue_event(
UnstructuredLogEvent( UnstructuredLogEvent(
trace_id=span.trace_id, trace_id=span.trace_id,
span_id=span.span_id, span_id=span.span_id,

View file

@ -67,6 +67,38 @@ async def client_wrapper(endpoint: str, headers: dict[str, str]) -> AsyncGenerat
raise AuthenticationRequiredError(exc) from exc raise AuthenticationRequiredError(exc) from exc
if i == len(connection_strategies) - 1: if i == len(connection_strategies) - 1:
raise raise
except* httpx.ConnectError as eg:
# Connection refused, server down, network unreachable
if i == len(connection_strategies) - 1:
error_msg = f"Failed to connect to MCP server at {endpoint}: Connection refused"
logger.error(f"MCP connection error: {error_msg}")
raise ConnectionError(error_msg) from eg
else:
logger.warning(
f"failed to connect to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
)
except* httpx.TimeoutException as eg:
# Request timeout, server too slow
if i == len(connection_strategies) - 1:
error_msg = f"MCP server at {endpoint} timed out"
logger.error(f"MCP timeout error: {error_msg}")
raise TimeoutError(error_msg) from eg
else:
logger.warning(
f"MCP server at {endpoint} timed out via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
)
except* httpx.RequestError as eg:
# DNS resolution failures, network errors, invalid URLs
if i == len(connection_strategies) - 1:
# Get the first exception's message for the error string
exc_msg = str(eg.exceptions[0]) if eg.exceptions else "Unknown error"
error_msg = f"Network error connecting to MCP server at {endpoint}: {exc_msg}"
logger.error(f"MCP network error: {error_msg}")
raise ConnectionError(error_msg) from eg
else:
logger.warning(
f"network error connecting to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
)
except* McpError: except* McpError:
if i < len(connection_strategies) - 1: if i < len(connection_strategies) - 1:
logger.warning( logger.warning(

View file
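
An aside on the except* clauses added above: the MCP client runs inside anyio task groups, which typically surface failures as an ExceptionGroup, and except* (Python 3.11+) matches subgroups by exception type. A minimal illustration with plain built-in exceptions, no httpx or MCP involved:

    def connect() -> None:
        raise ExceptionGroup(
            "connection attempts failed",
            [ConnectionRefusedError("refused"), TimeoutError("timed out")],
        )

    try:
        connect()
    except* ConnectionRefusedError as eg:
        print("connection errors:", [str(e) for e in eg.exceptions])
    except* TimeoutError as eg:
        print("timeouts:", [str(e) for e in eg.exceptions])
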

@ -30,6 +30,9 @@ from openai.types.completion_choice import CompletionChoice
CompletionChoice.model_fields["finish_reason"].annotation = Literal["stop", "length", "content_filter"] | None CompletionChoice.model_fields["finish_reason"].annotation = Literal["stop", "length", "content_filter"] | None
CompletionChoice.model_rebuild() CompletionChoice.model_rebuild()
REPO_ROOT = Path(__file__).parent.parent.parent
DEFAULT_STORAGE_DIR = REPO_ROOT / "tests/integration/recordings"
class InferenceMode(StrEnum): class InferenceMode(StrEnum):
LIVE = "live" LIVE = "live"
@ -51,7 +54,7 @@ def normalize_request(method: str, url: str, headers: dict[str, Any], body: dict
def get_inference_mode() -> InferenceMode: def get_inference_mode() -> InferenceMode:
return InferenceMode(os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "live").lower()) return InferenceMode(os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "replay").lower())
def setup_inference_recording(): def setup_inference_recording():
@ -60,28 +63,18 @@ def setup_inference_recording():
to increase their reliability and reduce reliance on expensive, external services. to increase their reliability and reduce reliance on expensive, external services.
Currently, this is only supported for OpenAI and Ollama clients. These should cover the vast majority of use cases. Currently, this is only supported for OpenAI and Ollama clients. These should cover the vast majority of use cases.
Calls to the /models endpoint are not currently trapped. We probably need to add support for this.
Two environment variables are required: Two environment variables are supported:
- LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', or 'replay'. - LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', or 'replay'. Default is 'replay'.
- LLAMA_STACK_TEST_RECORDING_DIR: The directory to store the recordings in. - LLAMA_STACK_TEST_RECORDING_DIR: The directory to store the recordings in. Default is 'tests/integration/recordings'.
The recordings are stored in a SQLite database and a JSON file for each request. The SQLite database is used to The recordings are stored as JSON files.
quickly find the correct recording for a given request. The JSON files are used to store the request and response
bodies.
""" """
mode = get_inference_mode() mode = get_inference_mode()
if mode not in InferenceMode:
raise ValueError(f"Invalid LLAMA_STACK_TEST_INFERENCE_MODE: {mode}. Must be 'live', 'record', or 'replay'")
if mode == InferenceMode.LIVE: if mode == InferenceMode.LIVE:
return None return None
if "LLAMA_STACK_TEST_RECORDING_DIR" not in os.environ: storage_dir = os.environ.get("LLAMA_STACK_TEST_RECORDING_DIR", DEFAULT_STORAGE_DIR)
raise ValueError("LLAMA_STACK_TEST_RECORDING_DIR must be set for recording or replaying")
storage_dir = os.environ["LLAMA_STACK_TEST_RECORDING_DIR"]
return inference_recording(mode=mode, storage_dir=storage_dir) return inference_recording(mode=mode, storage_dir=storage_dir)
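
A hedged usage sketch for the environment variables documented above; the variable names and default paths come from this file, while the values shown are examples only:

    import os

    os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = "record"   # default is "replay"
    os.environ["LLAMA_STACK_TEST_RECORDING_DIR"] = "tests/integration/recordings"  # the default

    recording_context = setup_inference_recording()
    if recording_context is not None:      # None is returned only in "live" mode
        with recording_context:
            ...  # issue the client calls to be recorded (or replayed)
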
@ -134,8 +127,8 @@ class ResponseStorage:
def store_recording(self, request_hash: str, request: dict[str, Any], response: dict[str, Any]): def store_recording(self, request_hash: str, request: dict[str, Any], response: dict[str, Any]):
"""Store a request/response pair.""" """Store a request/response pair."""
# Generate unique response filename # Generate unique response filename
response_file = f"{request_hash[:12]}.json" short_hash = request_hash[:12]
response_path = self.responses_dir / response_file response_file = f"{short_hash}.json"
# Serialize response body if needed # Serialize response body if needed
serialized_response = dict(response) serialized_response = dict(response)
@ -147,6 +140,14 @@ class ResponseStorage:
# Handle single response # Handle single response
serialized_response["body"] = _serialize_response(serialized_response["body"]) serialized_response["body"] = _serialize_response(serialized_response["body"])
# If this is an Ollama /api/tags recording, include models digest in filename to distinguish variants
endpoint = request.get("endpoint")
if endpoint in ("/api/tags", "/v1/models"):
digest = _model_identifiers_digest(endpoint, response)
response_file = f"models-{short_hash}-{digest}.json"
response_path = self.responses_dir / response_file
# Save response to JSON file # Save response to JSON file
with open(response_path, "w") as f: with open(response_path, "w") as f:
json.dump({"request": request, "response": serialized_response}, f, indent=2) json.dump({"request": request, "response": serialized_response}, f, indent=2)
@ -161,19 +162,85 @@ class ResponseStorage:
if not response_path.exists(): if not response_path.exists():
return None return None
with open(response_path) as f: return _recording_from_file(response_path)
data = json.load(f)
# Deserialize response body if needed def _model_list_responses(self, short_hash: str) -> list[dict[str, Any]]:
if "response" in data and "body" in data["response"]: results: list[dict[str, Any]] = []
if isinstance(data["response"]["body"], list): for path in self.responses_dir.glob(f"models-{short_hash}-*.json"):
# Handle streaming responses data = _recording_from_file(path)
data["response"]["body"] = [_deserialize_response(chunk) for chunk in data["response"]["body"]] results.append(data)
return results
def _recording_from_file(response_path) -> dict[str, Any]:
with open(response_path) as f:
data = json.load(f)
# Deserialize response body if needed
if "response" in data and "body" in data["response"]:
if isinstance(data["response"]["body"], list):
# Handle streaming responses
data["response"]["body"] = [_deserialize_response(chunk) for chunk in data["response"]["body"]]
else:
# Handle single response
data["response"]["body"] = _deserialize_response(data["response"]["body"])
return cast(dict[str, Any], data)
def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str:
def _extract_model_identifiers():
"""Extract a stable set of identifiers for model-list endpoints.
Supported endpoints:
- '/api/tags' (Ollama): response body has 'models': [ { name/model/digest/id/... }, ... ]
- '/v1/models' (OpenAI): response body has 'data': [ { id: ... }, ... ]
Returns a sorted list of unique identifiers for the models in the response.
"""
body = response["body"]
if endpoint == "/api/tags":
items = body.get("models")
idents = [m.model for m in items]
else:
items = body.get("data")
idents = [m.id for m in items]
return sorted(set(idents))
identifiers = _extract_model_identifiers()
return hashlib.sha1(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8]
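
A short worked example of the digest above: identifiers are de-duplicated, sorted, joined with "|", and SHA-1 hashed, so any recording that lists the same set of models ends up with the same filename suffix (model names here are examples):

    import hashlib

    idents = sorted({"llama3.2:3b", "nomic-embed-text:latest"})
    digest = hashlib.sha1("|".join(idents).encode("utf-8")).hexdigest()[:8]
    print(digest)  # becomes the <digest> part of models-<short_hash>-<digest>.json
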
def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None:
"""Return a single, unioned recording for supported model-list endpoints."""
seen: dict[str, dict[str, Any]] = {}
for rec in records:
body = rec["response"]["body"]
if endpoint == "/api/tags":
items = body.models
elif endpoint == "/v1/models":
items = body.data
else:
items = []
for m in items:
if endpoint == "/v1/models":
key = m.id
else: else:
# Handle single response key = m.model
data["response"]["body"] = _deserialize_response(data["response"]["body"]) seen[key] = m
return cast(dict[str, Any], data) ordered = [seen[k] for k in sorted(seen.keys())]
canonical = records[0]
canonical_req = canonical.get("request", {})
if isinstance(canonical_req, dict):
canonical_req["endpoint"] = endpoint
if endpoint == "/v1/models":
body = {"data": ordered, "object": "list"}
else:
from ollama import ListResponse
body = ListResponse(models=ordered)
return {"request": canonical_req, "response": {"body": body, "is_streaming": False}}
async def _patched_inference_method(original_method, self, client_type, endpoint, *args, **kwargs): async def _patched_inference_method(original_method, self, client_type, endpoint, *args, **kwargs):
@ -195,8 +262,6 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
raise ValueError(f"Unknown client type: {client_type}") raise ValueError(f"Unknown client type: {client_type}")
url = base_url.rstrip("/") + endpoint url = base_url.rstrip("/") + endpoint
# Normalize request for matching
method = "POST" method = "POST"
headers = {} headers = {}
body = kwargs body = kwargs
@ -204,7 +269,12 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
request_hash = normalize_request(method, url, headers, body) request_hash = normalize_request(method, url, headers, body)
if _current_mode == InferenceMode.REPLAY: if _current_mode == InferenceMode.REPLAY:
recording = _current_storage.find_recording(request_hash) # Special handling for model-list endpoints: return union of all responses
if endpoint in ("/api/tags", "/v1/models"):
records = _current_storage._model_list_responses(request_hash[:12])
recording = _combine_model_list_responses(endpoint, records)
else:
recording = _current_storage.find_recording(request_hash)
if recording: if recording:
response_body = recording["response"]["body"] response_body = recording["response"]["body"]
@ -222,7 +292,7 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
f"No recorded response found for request hash: {request_hash}\n" f"No recorded response found for request hash: {request_hash}\n"
f"Request: {method} {url} {body}\n" f"Request: {method} {url} {body}\n"
f"Model: {body.get('model', 'unknown')}\n" f"Model: {body.get('model', 'unknown')}\n"
f"To record this response, run with LLAMA_STACK_INFERENCE_MODE=record" f"To record this response, run with LLAMA_STACK_TEST_INFERENCE_MODE=record"
) )
elif _current_mode == InferenceMode.RECORD: elif _current_mode == InferenceMode.RECORD:
@ -274,12 +344,14 @@ def patch_inference_clients():
from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
from openai.resources.completions import AsyncCompletions from openai.resources.completions import AsyncCompletions
from openai.resources.embeddings import AsyncEmbeddings from openai.resources.embeddings import AsyncEmbeddings
from openai.resources.models import AsyncModels
# Store original methods for both OpenAI and Ollama clients # Store original methods for both OpenAI and Ollama clients
_original_methods = { _original_methods = {
"chat_completions_create": AsyncChatCompletions.create, "chat_completions_create": AsyncChatCompletions.create,
"completions_create": AsyncCompletions.create, "completions_create": AsyncCompletions.create,
"embeddings_create": AsyncEmbeddings.create, "embeddings_create": AsyncEmbeddings.create,
"models_list": AsyncModels.list,
"ollama_generate": OllamaAsyncClient.generate, "ollama_generate": OllamaAsyncClient.generate,
"ollama_chat": OllamaAsyncClient.chat, "ollama_chat": OllamaAsyncClient.chat,
"ollama_embed": OllamaAsyncClient.embed, "ollama_embed": OllamaAsyncClient.embed,
@ -304,10 +376,16 @@ def patch_inference_clients():
_original_methods["embeddings_create"], self, "openai", "/v1/embeddings", *args, **kwargs _original_methods["embeddings_create"], self, "openai", "/v1/embeddings", *args, **kwargs
) )
async def patched_models_list(self, *args, **kwargs):
return await _patched_inference_method(
_original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs
)
# Apply OpenAI patches # Apply OpenAI patches
AsyncChatCompletions.create = patched_chat_completions_create AsyncChatCompletions.create = patched_chat_completions_create
AsyncCompletions.create = patched_completions_create AsyncCompletions.create = patched_completions_create
AsyncEmbeddings.create = patched_embeddings_create AsyncEmbeddings.create = patched_embeddings_create
AsyncModels.list = patched_models_list
# Create patched methods for Ollama client # Create patched methods for Ollama client
async def patched_ollama_generate(self, *args, **kwargs): async def patched_ollama_generate(self, *args, **kwargs):
@ -361,11 +439,13 @@ def unpatch_inference_clients():
from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
from openai.resources.completions import AsyncCompletions from openai.resources.completions import AsyncCompletions
from openai.resources.embeddings import AsyncEmbeddings from openai.resources.embeddings import AsyncEmbeddings
from openai.resources.models import AsyncModels
# Restore OpenAI client methods # Restore OpenAI client methods
AsyncChatCompletions.create = _original_methods["chat_completions_create"] AsyncChatCompletions.create = _original_methods["chat_completions_create"]
AsyncCompletions.create = _original_methods["completions_create"] AsyncCompletions.create = _original_methods["completions_create"]
AsyncEmbeddings.create = _original_methods["embeddings_create"] AsyncEmbeddings.create = _original_methods["embeddings_create"]
AsyncModels.list = _original_methods["models_list"]
# Restore Ollama client methods if they were patched # Restore Ollama client methods if they were patched
OllamaAsyncClient.generate = _original_methods["ollama_generate"] OllamaAsyncClient.generate = _original_methods["ollama_generate"]
@ -379,16 +459,10 @@ def unpatch_inference_clients():
@contextmanager @contextmanager
def inference_recording(mode: str = "live", storage_dir: str | Path | None = None) -> Generator[None, None, None]: def inference_recording(mode: str, storage_dir: str | Path | None = None) -> Generator[None, None, None]:
"""Context manager for inference recording/replaying.""" """Context manager for inference recording/replaying."""
global _current_mode, _current_storage global _current_mode, _current_storage
# Set defaults
if storage_dir is None:
storage_dir_path = Path.home() / ".llama" / "recordings"
else:
storage_dir_path = Path(storage_dir)
# Store previous state # Store previous state
prev_mode = _current_mode prev_mode = _current_mode
prev_storage = _current_storage prev_storage = _current_storage
@ -397,7 +471,9 @@ def inference_recording(mode: str = "live", storage_dir: str | Path | None = Non
_current_mode = mode _current_mode = mode
if mode in ["record", "replay"]: if mode in ["record", "replay"]:
_current_storage = ResponseStorage(storage_dir_path) if storage_dir is None:
raise ValueError("storage_dir is required for record and replay modes")
_current_storage = ResponseStorage(Path(storage_dir))
patch_inference_clients() patch_inference_clients()
yield yield

View file

@ -10,7 +10,7 @@
"dependencies": { "dependencies": {
"@radix-ui/react-collapsible": "^1.1.12", "@radix-ui/react-collapsible": "^1.1.12",
"@radix-ui/react-dialog": "^1.1.13", "@radix-ui/react-dialog": "^1.1.13",
"@radix-ui/react-dropdown-menu": "^2.1.14", "@radix-ui/react-dropdown-menu": "^2.1.16",
"@radix-ui/react-select": "^2.2.5", "@radix-ui/react-select": "^2.2.5",
"@radix-ui/react-separator": "^1.1.7", "@radix-ui/react-separator": "^1.1.7",
"@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-slot": "^1.2.3",
@ -18,18 +18,18 @@
"class-variance-authority": "^0.7.1", "class-variance-authority": "^0.7.1",
"clsx": "^2.1.1", "clsx": "^2.1.1",
"framer-motion": "^12.23.12", "framer-motion": "^12.23.12",
"llama-stack-client": "^0.2.20", "llama-stack-client": "^0.2.21",
"lucide-react": "^0.510.0", "lucide-react": "^0.542.0",
"next": "15.3.3", "next": "15.3.3",
"next-auth": "^4.24.11", "next-auth": "^4.24.11",
"next-themes": "^0.4.6", "next-themes": "^0.4.6",
"react": "^19.0.0", "react": "^19.0.0",
"react-dom": "^19.0.0", "react-dom": "^19.1.1",
"react-markdown": "^10.1.0", "react-markdown": "^10.1.0",
"remark-gfm": "^4.0.1", "remark-gfm": "^4.0.1",
"remeda": "^2.30.0", "remeda": "^2.30.0",
"shiki": "^1.29.2", "shiki": "^1.29.2",
"sonner": "^2.0.6", "sonner": "^2.0.7",
"tailwind-merge": "^3.3.1" "tailwind-merge": "^3.3.1"
}, },
"devDependencies": { "devDependencies": {
@ -2066,12 +2066,35 @@
"license": "MIT" "license": "MIT"
}, },
"node_modules/@radix-ui/react-arrow": { "node_modules/@radix-ui/react-arrow": {
"version": "1.1.6", "version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.6.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.7.tgz",
"integrity": "sha512-2JMfHJf/eVnwq+2dewT3C0acmCWD3XiVA1Da+jTDqo342UlU13WvXtqHhG+yJw5JeQmu4ue2eMy6gcEArLBlcw==", "integrity": "sha512-F+M1tLhO+mlQaOWspE8Wstg+z6PwxwRd8oQ8IXceWz92kfAmalTRf0EjrouQeo7QssEPfCn05B4Ihs1K9WQ/7w==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/react-primitive": "2.1.2" "@radix-ui/react-primitive": "2.1.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-arrow/node_modules/@radix-ui/react-primitive": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
"integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-slot": "1.2.3"
}, },
"peerDependencies": { "peerDependencies": {
"@types/react": "*", "@types/react": "*",
@ -2172,15 +2195,15 @@
} }
}, },
"node_modules/@radix-ui/react-collection": { "node_modules/@radix-ui/react-collection": {
"version": "1.1.6", "version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.6.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.7.tgz",
"integrity": "sha512-PbhRFK4lIEw9ADonj48tiYWzkllz81TM7KVYyyMMw2cwHO7D5h4XKEblL8NlaRisTK3QTe6tBEhDccFUryxHBQ==", "integrity": "sha512-Fh9rGN0MoI4ZFUNyfFVNU4y9LUz93u9/0K+yLgA2bwRojxM8JU1DyvvMBabnZPBgMWREAJvU2jjVzq+LrFUglw==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2", "@radix-ui/react-context": "1.1.2",
"@radix-ui/react-primitive": "2.1.2", "@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-slot": "1.2.2" "@radix-ui/react-slot": "1.2.3"
}, },
"peerDependencies": { "peerDependencies": {
"@types/react": "*", "@types/react": "*",
@ -2197,21 +2220,26 @@
} }
} }
}, },
"node_modules/@radix-ui/react-collection/node_modules/@radix-ui/react-slot": { "node_modules/@radix-ui/react-collection/node_modules/@radix-ui/react-primitive": {
"version": "1.2.2", "version": "2.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.2.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
"integrity": "sha512-y7TBO4xN4Y94FvcWIOIh18fM4R1A8S4q1jhoz4PNzOoHsFcN8pogcFmZrTYAm4F9VRUrWP/Mw7xSKybIeRI+CQ==", "integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/react-compose-refs": "1.1.2" "@radix-ui/react-slot": "1.2.3"
}, },
"peerDependencies": { "peerDependencies": {
"@types/react": "*", "@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" "@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
}, },
"peerDependenciesMeta": { "peerDependenciesMeta": {
"@types/react": { "@types/react": {
"optional": true "optional": true
},
"@types/react-dom": {
"optional": true
} }
} }
}, },
@ -2342,17 +2370,17 @@
} }
}, },
"node_modules/@radix-ui/react-dropdown-menu": { "node_modules/@radix-ui/react-dropdown-menu": {
"version": "2.1.14", "version": "2.1.16",
"resolved": "https://registry.npmjs.org/@radix-ui/react-dropdown-menu/-/react-dropdown-menu-2.1.14.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-dropdown-menu/-/react-dropdown-menu-2.1.16.tgz",
"integrity": "sha512-lzuyNjoWOoaMFE/VC5FnAAYM16JmQA8ZmucOXtlhm2kKR5TSU95YLAueQ4JYuRmUJmBvSqXaVFGIfuukybwZJQ==", "integrity": "sha512-1PLGQEynI/3OX/ftV54COn+3Sud/Mn8vALg2rWnBLnRaGtJDduNW/22XjlGgPdpcIbiQxjKtb7BkcjP00nqfJw==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/primitive": "1.1.2", "@radix-ui/primitive": "1.1.3",
"@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2", "@radix-ui/react-context": "1.1.2",
"@radix-ui/react-id": "1.1.1", "@radix-ui/react-id": "1.1.1",
"@radix-ui/react-menu": "2.1.14", "@radix-ui/react-menu": "2.1.16",
"@radix-ui/react-primitive": "2.1.2", "@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-controllable-state": "1.2.2" "@radix-ui/react-use-controllable-state": "1.2.2"
}, },
"peerDependencies": { "peerDependencies": {
@ -2370,6 +2398,35 @@
} }
} }
}, },
"node_modules/@radix-ui/react-dropdown-menu/node_modules/@radix-ui/primitive": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz",
"integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
"license": "MIT"
},
"node_modules/@radix-ui/react-dropdown-menu/node_modules/@radix-ui/react-primitive": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
"integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-slot": "1.2.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-focus-guards": { "node_modules/@radix-ui/react-focus-guards": {
"version": "1.1.2", "version": "1.1.2",
"resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.2.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.2.tgz",
@ -2429,26 +2486,26 @@
} }
}, },
"node_modules/@radix-ui/react-menu": { "node_modules/@radix-ui/react-menu": {
"version": "2.1.14", "version": "2.1.16",
"resolved": "https://registry.npmjs.org/@radix-ui/react-menu/-/react-menu-2.1.14.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-menu/-/react-menu-2.1.16.tgz",
"integrity": "sha512-0zSiBAIFq9GSKoSH5PdEaQeRB3RnEGxC+H2P0egtnKoKKLNBH8VBHyVO6/jskhjAezhOIplyRUj7U2lds9A+Yg==", "integrity": "sha512-72F2T+PLlphrqLcAotYPp0uJMr5SjP5SL01wfEspJbru5Zs5vQaSHb4VB3ZMJPimgHHCHG7gMOeOB9H3Hdmtxg==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/primitive": "1.1.2", "@radix-ui/primitive": "1.1.3",
"@radix-ui/react-collection": "1.1.6", "@radix-ui/react-collection": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2", "@radix-ui/react-context": "1.1.2",
"@radix-ui/react-direction": "1.1.1", "@radix-ui/react-direction": "1.1.1",
"@radix-ui/react-dismissable-layer": "1.1.9", "@radix-ui/react-dismissable-layer": "1.1.11",
"@radix-ui/react-focus-guards": "1.1.2", "@radix-ui/react-focus-guards": "1.1.3",
"@radix-ui/react-focus-scope": "1.1.6", "@radix-ui/react-focus-scope": "1.1.7",
"@radix-ui/react-id": "1.1.1", "@radix-ui/react-id": "1.1.1",
"@radix-ui/react-popper": "1.2.6", "@radix-ui/react-popper": "1.2.8",
"@radix-ui/react-portal": "1.1.8", "@radix-ui/react-portal": "1.1.9",
"@radix-ui/react-presence": "1.1.4", "@radix-ui/react-presence": "1.1.5",
"@radix-ui/react-primitive": "2.1.2", "@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-roving-focus": "1.1.9", "@radix-ui/react-roving-focus": "1.1.11",
"@radix-ui/react-slot": "1.2.2", "@radix-ui/react-slot": "1.2.3",
"@radix-ui/react-use-callback-ref": "1.1.1", "@radix-ui/react-use-callback-ref": "1.1.1",
"aria-hidden": "^1.2.4", "aria-hidden": "^1.2.4",
"react-remove-scroll": "^2.6.3" "react-remove-scroll": "^2.6.3"
@ -2468,14 +2525,44 @@
} }
} }
}, },
"node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-slot": { "node_modules/@radix-ui/react-menu/node_modules/@radix-ui/primitive": {
"version": "1.2.2", "version": "1.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.2.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz",
"integrity": "sha512-y7TBO4xN4Y94FvcWIOIh18fM4R1A8S4q1jhoz4PNzOoHsFcN8pogcFmZrTYAm4F9VRUrWP/Mw7xSKybIeRI+CQ==", "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
"license": "MIT"
},
"node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-dismissable-layer": {
"version": "1.1.11",
"resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz",
"integrity": "sha512-Nqcp+t5cTB8BinFkZgXiMJniQH0PsUt2k51FUhbdfeKvc4ACcG2uQniY/8+h1Yv6Kza4Q7lD7PQV0z0oicE0Mg==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/react-compose-refs": "1.1.2" "@radix-ui/primitive": "1.1.3",
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1",
"@radix-ui/react-use-escape-keydown": "1.1.1"
}, },
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-focus-guards": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.3.tgz",
"integrity": "sha512-0rFg/Rj2Q62NCm62jZw0QX7a3sz6QCQU0LpZdNrJX8byRGaGVTqbrW9jAoIAHyMQqsNpeZ81YgSizOt5WXq0Pw==",
"license": "MIT",
"peerDependencies": { "peerDependencies": {
"@types/react": "*", "@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
@ -2486,17 +2573,113 @@
} }
} }
}, },
"node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-focus-scope": {
"version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-focus-scope/-/react-focus-scope-1.1.7.tgz",
"integrity": "sha512-t2ODlkXBQyn7jkl6TNaw/MtVEVvIGelJDCG41Okq/KwUsJBwQ4XVZsHAVUkK4mBv3ewiAS3PGuUWuY2BoK4ZUw==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-portal": {
"version": "1.1.9",
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz",
"integrity": "sha512-bpIxvq03if6UNwXZ+HTK71JLh4APvnXntDc6XOX8UVq4XQOVl7lwok0AvIl+b8zgCw3fSaVTZMpAPPagXbKmHQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-layout-effect": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-presence": {
"version": "1.1.5",
"resolved": "https://registry.npmjs.org/@radix-ui/react-presence/-/react-presence-1.1.5.tgz",
"integrity": "sha512-/jfEwNDdQVBCNvjkGit4h6pMOzq8bHkopq458dPt2lMjx+eBQUohZNG9A7DtO/O5ukSbxuaNGXMjHicgwy6rQQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-use-layout-effect": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-primitive": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
"integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-slot": "1.2.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-popper": { "node_modules/@radix-ui/react-popper": {
"version": "1.2.6", "version": "1.2.8",
"resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.6.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.8.tgz",
"integrity": "sha512-7iqXaOWIjDBfIG7aq8CUEeCSsQMLFdn7VEE8TaFz704DtEzpPHR7w/uuzRflvKgltqSAImgcmxQ7fFX3X7wasg==", "integrity": "sha512-0NJQ4LFFUuWkE7Oxf0htBKS6zLkkjBH+hM1uk7Ng705ReR8m/uelduy1DBo0PyBXPKVnBA6YBlU94MBGXrSBCw==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@floating-ui/react-dom": "^2.0.0", "@floating-ui/react-dom": "^2.0.0",
"@radix-ui/react-arrow": "1.1.6", "@radix-ui/react-arrow": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2", "@radix-ui/react-context": "1.1.2",
"@radix-ui/react-primitive": "2.1.2", "@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1", "@radix-ui/react-use-callback-ref": "1.1.1",
"@radix-ui/react-use-layout-effect": "1.1.1", "@radix-ui/react-use-layout-effect": "1.1.1",
"@radix-ui/react-use-rect": "1.1.1", "@radix-ui/react-use-rect": "1.1.1",
@ -2518,6 +2701,29 @@
} }
} }
}, },
"node_modules/@radix-ui/react-popper/node_modules/@radix-ui/react-primitive": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
"integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-slot": "1.2.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-portal": { "node_modules/@radix-ui/react-portal": {
"version": "1.1.8", "version": "1.1.8",
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.8.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.8.tgz",
@ -2608,18 +2814,18 @@
} }
}, },
"node_modules/@radix-ui/react-roving-focus": { "node_modules/@radix-ui/react-roving-focus": {
"version": "1.1.9", "version": "1.1.11",
"resolved": "https://registry.npmjs.org/@radix-ui/react-roving-focus/-/react-roving-focus-1.1.9.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-roving-focus/-/react-roving-focus-1.1.11.tgz",
"integrity": "sha512-ZzrIFnMYHHCNqSNCsuN6l7wlewBEq0O0BCSBkabJMFXVO51LRUTq71gLP1UxFvmrXElqmPjA5VX7IqC9VpazAQ==", "integrity": "sha512-7A6S9jSgm/S+7MdtNDSb+IU859vQqJ/QAtcYQcfFC6W8RS4IxIZDldLR0xqCFZ6DCyrQLjLPsxtTNch5jVA4lA==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/primitive": "1.1.2", "@radix-ui/primitive": "1.1.3",
"@radix-ui/react-collection": "1.1.6", "@radix-ui/react-collection": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2", "@radix-ui/react-context": "1.1.2",
"@radix-ui/react-direction": "1.1.1", "@radix-ui/react-direction": "1.1.1",
"@radix-ui/react-id": "1.1.1", "@radix-ui/react-id": "1.1.1",
"@radix-ui/react-primitive": "2.1.2", "@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1", "@radix-ui/react-use-callback-ref": "1.1.1",
"@radix-ui/react-use-controllable-state": "1.2.2" "@radix-ui/react-use-controllable-state": "1.2.2"
}, },
@ -2638,6 +2844,35 @@
} }
} }
}, },
"node_modules/@radix-ui/react-roving-focus/node_modules/@radix-ui/primitive": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz",
"integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
"license": "MIT"
},
"node_modules/@radix-ui/react-roving-focus/node_modules/@radix-ui/react-primitive": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
"integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-slot": "1.2.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-select": { "node_modules/@radix-ui/react-select": {
"version": "2.2.5", "version": "2.2.5",
"resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.5.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.5.tgz",
@ -2681,55 +2916,6 @@
} }
} }
}, },
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-arrow": {
"version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.7.tgz",
"integrity": "sha512-F+M1tLhO+mlQaOWspE8Wstg+z6PwxwRd8oQ8IXceWz92kfAmalTRf0EjrouQeo7QssEPfCn05B4Ihs1K9WQ/7w==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-primitive": "2.1.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-collection": {
"version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.7.tgz",
"integrity": "sha512-Fh9rGN0MoI4ZFUNyfFVNU4y9LUz93u9/0K+yLgA2bwRojxM8JU1DyvvMBabnZPBgMWREAJvU2jjVzq+LrFUglw==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-slot": "1.2.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-dismissable-layer": { "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-dismissable-layer": {
"version": "1.1.10", "version": "1.1.10",
"resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.10.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.10.tgz",
@ -2965,29 +3151,6 @@
"integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==", "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-arrow": {
"version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.7.tgz",
"integrity": "sha512-F+M1tLhO+mlQaOWspE8Wstg+z6PwxwRd8oQ8IXceWz92kfAmalTRf0EjrouQeo7QssEPfCn05B4Ihs1K9WQ/7w==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-primitive": "2.1.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-dismissable-layer": { "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-dismissable-layer": {
"version": "1.1.11", "version": "1.1.11",
"resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz",
@ -3015,38 +3178,6 @@
} }
} }
}, },
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-popper": {
"version": "1.2.8",
"resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.8.tgz",
"integrity": "sha512-0NJQ4LFFUuWkE7Oxf0htBKS6zLkkjBH+hM1uk7Ng705ReR8m/uelduy1DBo0PyBXPKVnBA6YBlU94MBGXrSBCw==",
"license": "MIT",
"dependencies": {
"@floating-ui/react-dom": "^2.0.0",
"@radix-ui/react-arrow": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1",
"@radix-ui/react-use-layout-effect": "1.1.1",
"@radix-ui/react-use-rect": "1.1.1",
"@radix-ui/react-use-size": "1.1.1",
"@radix-ui/rect": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-portal": { "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-portal": {
"version": "1.1.9", "version": "1.1.9",
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz",
@ -3447,6 +3578,13 @@
"tailwindcss": "4.1.6" "tailwindcss": "4.1.6"
} }
}, },
"node_modules/@tailwindcss/node/node_modules/tailwindcss": {
"version": "4.1.6",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz",
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==",
"dev": true,
"license": "MIT"
},
"node_modules/@tailwindcss/oxide": { "node_modules/@tailwindcss/oxide": {
"version": "4.1.6", "version": "4.1.6",
"resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.6.tgz", "resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.6.tgz",
@ -3707,6 +3845,13 @@
"tailwindcss": "4.1.6" "tailwindcss": "4.1.6"
} }
}, },
"node_modules/@tailwindcss/postcss/node_modules/tailwindcss": {
"version": "4.1.6",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz",
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==",
"dev": true,
"license": "MIT"
},
"node_modules/@testing-library/dom": { "node_modules/@testing-library/dom": {
"version": "10.4.1", "version": "10.4.1",
"resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz", "resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz",
@ -4079,9 +4224,9 @@
} }
}, },
"node_modules/@types/react-dom": { "node_modules/@types/react-dom": {
"version": "19.1.5", "version": "19.1.9",
"resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.1.5.tgz", "resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.1.9.tgz",
"integrity": "sha512-CMCjrWucUBZvohgZxkjd6S9h0nZxXjzus6yDfUb+xLxYM7VvjKNH1tQrE9GWLql1XoOP4/Ds3bwFqShHUYraGg==", "integrity": "sha512-qXRuZaOsAdXKFyOhRBg6Lqqc0yay13vN7KrIg4L7N4aaHN68ma9OK3NE1BoDFgFOTfM7zg+3/8+2n8rLUH3OKQ==",
"devOptional": true, "devOptional": true,
"license": "MIT", "license": "MIT",
"peerDependencies": { "peerDependencies": {
@ -10147,9 +10292,9 @@
"license": "MIT" "license": "MIT"
}, },
"node_modules/llama-stack-client": { "node_modules/llama-stack-client": {
"version": "0.2.20", "version": "0.2.21",
"resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.20.tgz", "resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.21.tgz",
"integrity": "sha512-1vD5nizTX5JEW8TADxKgy/P1W8YZoPSpdnmfxbdYbWgpQ3BWtbvLS6jmDk7VwVA5fRC4895VfHsRDfS1liHarw==", "integrity": "sha512-rjU2Vx5xStxDYavU8K1An/SYXiQQjroLcK98B+p0Paz/a7OgRao2S0YwvThJjPUyChY4fO03UIXP9LpmHqlXWQ==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@types/node": "^18.11.18", "@types/node": "^18.11.18",
@ -10240,9 +10385,9 @@
"license": "ISC" "license": "ISC"
}, },
"node_modules/lucide-react": { "node_modules/lucide-react": {
"version": "0.510.0", "version": "0.542.0",
"resolved": "https://registry.npmjs.org/lucide-react/-/lucide-react-0.510.0.tgz", "resolved": "https://registry.npmjs.org/lucide-react/-/lucide-react-0.542.0.tgz",
"integrity": "sha512-p8SQRAMVh7NhsAIETokSqDrc5CHnDLbV29mMnzaXx+Vc/hnqQzwI2r0FMWCcoTXnbw2KEjy48xwpGdEL+ck06Q==", "integrity": "sha512-w3hD8/SQB7+lzU2r4VdFyzzOzKnUjTZIF/MQJGSSvni7Llewni4vuViRppfRAa2guOsY5k4jZyxw/i9DQHv+dw==",
"license": "ISC", "license": "ISC",
"peerDependencies": { "peerDependencies": {
"react": "^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0" "react": "^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0"
@ -12448,24 +12593,24 @@
} }
}, },
"node_modules/react": { "node_modules/react": {
"version": "19.1.0", "version": "19.1.1",
"resolved": "https://registry.npmjs.org/react/-/react-19.1.0.tgz", "resolved": "https://registry.npmjs.org/react/-/react-19.1.1.tgz",
"integrity": "sha512-FS+XFBNvn3GTAWq26joslQgWNoFu08F4kl0J4CgdNKADkdSGXQyTCnKteIAJy96Br6YbpEU1LSzV5dYtjMkMDg==", "integrity": "sha512-w8nqGImo45dmMIfljjMwOGtbmC/mk4CMYhWIicdSflH91J9TyCyczcPFXJzrZ/ZXcgGRFeP6BU0BEJTw6tZdfQ==",
"license": "MIT", "license": "MIT",
"engines": { "engines": {
"node": ">=0.10.0" "node": ">=0.10.0"
} }
}, },
"node_modules/react-dom": { "node_modules/react-dom": {
"version": "19.1.0", "version": "19.1.1",
"resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.1.0.tgz", "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.1.1.tgz",
"integrity": "sha512-Xs1hdnE+DyKgeHJeJznQmYMIBG3TKIHJJT95Q58nHLSrElKlGQqDTR2HQ9fx5CN/Gk6Vh/kupBTDLU11/nDk/g==", "integrity": "sha512-Dlq/5LAZgF0Gaz6yiqZCf6VCcZs1ghAJyrsu84Q/GT0gV+mCxbfmKNoGRKBYMJ8IEdGPqu49YWXD02GCknEDkw==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"scheduler": "^0.26.0" "scheduler": "^0.26.0"
}, },
"peerDependencies": { "peerDependencies": {
"react": "^19.1.0" "react": "^19.1.1"
} }
}, },
"node_modules/react-is": { "node_modules/react-is": {
@ -13285,9 +13430,9 @@
} }
}, },
"node_modules/sonner": { "node_modules/sonner": {
"version": "2.0.6", "version": "2.0.7",
"resolved": "https://registry.npmjs.org/sonner/-/sonner-2.0.6.tgz", "resolved": "https://registry.npmjs.org/sonner/-/sonner-2.0.7.tgz",
"integrity": "sha512-yHFhk8T/DK3YxjFQXIrcHT1rGEeTLliVzWbO0xN8GberVun2RiBnxAjXAYpZrqwEVHBG9asI/Li8TAAhN9m59Q==", "integrity": "sha512-W6ZN4p58k8aDKA4XPcx2hpIQXBRAgyiWVkYhT7CvK6D3iAu7xjvVyhQHg2/iaKJZ1XVJ4r7XuwGL+WGEK37i9w==",
"license": "MIT", "license": "MIT",
"peerDependencies": { "peerDependencies": {
"react": "^18.0.0 || ^19.0.0 || ^19.0.0-rc", "react": "^18.0.0 || ^19.0.0 || ^19.0.0-rc",
@ -13712,9 +13857,9 @@
} }
}, },
"node_modules/tailwindcss": { "node_modules/tailwindcss": {
"version": "4.1.6", "version": "4.1.13",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz", "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.13.tgz",
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==", "integrity": "sha512-i+zidfmTqtwquj4hMEwdjshYYgMbOrPzb9a0M3ZgNa0JMoZeFC6bxZvO8yr8ozS6ix2SDz0+mvryPeBs2TFE+w==",
"dev": true, "dev": true,
"license": "MIT" "license": "MIT"
}, },

View file

@ -15,7 +15,7 @@
"dependencies": { "dependencies": {
"@radix-ui/react-collapsible": "^1.1.12", "@radix-ui/react-collapsible": "^1.1.12",
"@radix-ui/react-dialog": "^1.1.13", "@radix-ui/react-dialog": "^1.1.13",
"@radix-ui/react-dropdown-menu": "^2.1.14", "@radix-ui/react-dropdown-menu": "^2.1.16",
"@radix-ui/react-select": "^2.2.5", "@radix-ui/react-select": "^2.2.5",
"@radix-ui/react-separator": "^1.1.7", "@radix-ui/react-separator": "^1.1.7",
"@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-slot": "^1.2.3",
@ -23,18 +23,18 @@
"class-variance-authority": "^0.7.1", "class-variance-authority": "^0.7.1",
"clsx": "^2.1.1", "clsx": "^2.1.1",
"framer-motion": "^12.23.12", "framer-motion": "^12.23.12",
"llama-stack-client": "^0.2.20", "llama-stack-client": "^0.2.21",
"lucide-react": "^0.510.0", "lucide-react": "^0.542.0",
"next": "15.3.3", "next": "15.3.3",
"next-auth": "^4.24.11", "next-auth": "^4.24.11",
"next-themes": "^0.4.6", "next-themes": "^0.4.6",
"react": "^19.0.0", "react": "^19.0.0",
"react-dom": "^19.0.0", "react-dom": "^19.1.1",
"react-markdown": "^10.1.0", "react-markdown": "^10.1.0",
"remark-gfm": "^4.0.1", "remark-gfm": "^4.0.1",
"remeda": "^2.30.0", "remeda": "^2.30.0",
"shiki": "^1.29.2", "shiki": "^1.29.2",
"sonner": "^2.0.6", "sonner": "^2.0.7",
"tailwind-merge": "^3.3.1" "tailwind-merge": "^3.3.1"
}, },
"devDependencies": { "devDependencies": {

View file

@ -7,7 +7,7 @@ required-version = ">=0.7.0"
[project] [project]
name = "llama_stack" name = "llama_stack"
version = "0.2.20" version = "0.2.21"
authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }] authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }]
description = "Llama Stack" description = "Llama Stack"
readme = "README.md" readme = "README.md"
@ -31,9 +31,8 @@ dependencies = [
"huggingface-hub>=0.34.0,<1.0", "huggingface-hub>=0.34.0,<1.0",
"jinja2>=3.1.6", "jinja2>=3.1.6",
"jsonschema", "jsonschema",
"llama-stack-client>=0.2.20", "llama-stack-client>=0.2.21",
"llama-api-client>=0.1.2", "openai>=1.100.0", # for expires_after support
"openai>=1.99.6",
"prompt-toolkit", "prompt-toolkit",
"python-dotenv", "python-dotenv",
"python-jose[cryptography]", "python-jose[cryptography]",
@ -56,7 +55,7 @@ dependencies = [
ui = [ ui = [
"streamlit", "streamlit",
"pandas", "pandas",
"llama-stack-client>=0.2.20", "llama-stack-client>=0.2.21",
"streamlit-option-menu", "streamlit-option-menu",
] ]
@ -81,7 +80,6 @@ dev = [
unit = [ unit = [
"sqlite-vec", "sqlite-vec",
"ollama", "ollama",
"openai",
"aiosqlite", "aiosqlite",
"aiohttp", "aiohttp",
"psycopg2-binary>=2.9.0", "psycopg2-binary>=2.9.0",
@ -93,7 +91,7 @@ unit = [
"sqlalchemy[asyncio]>=2.0.41", "sqlalchemy[asyncio]>=2.0.41",
"blobfile", "blobfile",
"faiss-cpu", "faiss-cpu",
"pymilvus>=2.5.12", "pymilvus>=2.6.1",
"milvus-lite>=2.5.0", "milvus-lite>=2.5.0",
"litellm", "litellm",
"together", "together",
@ -106,7 +104,6 @@ unit = [
# separately. If you are using "uv" to execute your tests, you can use the "--group" flag to specify extra # separately. If you are using "uv" to execute your tests, you can use the "--group" flag to specify extra
# dependencies. # dependencies.
test = [ test = [
"openai>=1.100.0", # for expires_after support
"aiosqlite", "aiosqlite",
"aiohttp", "aiohttp",
"torch>=2.6.0", "torch>=2.6.0",
@ -115,13 +112,13 @@ test = [
"psycopg2-binary>=2.9.0", "psycopg2-binary>=2.9.0",
"pypdf", "pypdf",
"mcp", "mcp",
"datasets", "datasets>=4.0.0",
"autoevals", "autoevals",
"transformers", "transformers",
"sqlalchemy", "sqlalchemy",
"sqlalchemy[asyncio]>=2.0.41", "sqlalchemy[asyncio]>=2.0.41",
"requests", "requests",
"pymilvus>=2.5.12", "pymilvus>=2.6.1",
"milvus-lite>=2.5.0", "milvus-lite>=2.5.0",
"weaviate-client>=4.16.4", "weaviate-client>=4.16.4",
] ]
@ -146,7 +143,7 @@ docs = [
] ]
codegen = ["rich", "pydantic", "jinja2>=3.1.6"] codegen = ["rich", "pydantic", "jinja2>=3.1.6"]
benchmark = [ benchmark = [
"locust>=2.37.14", "locust>=2.39.1",
] ]
[project.urls] [project.urls]

71
scripts/get_setup_env.py Executable file
View file

@ -0,0 +1,71 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Small helper script to extract environment variables from a test setup.
Used by integration-tests.sh to set environment variables before starting the server.
"""
import argparse
import sys
from tests.integration.suites import SETUP_DEFINITIONS, SUITE_DEFINITIONS
def get_setup_env_vars(setup_name, suite_name=None):
"""
Get environment variables for a setup, with optional suite default fallback.
Args:
setup_name: Name of the setup (e.g., 'ollama', 'gpt')
suite_name: Optional suite name to get default setup if setup_name is None
Returns:
Dictionary of environment variables
"""
# If no setup specified, try to get default from suite
if not setup_name and suite_name:
suite = SUITE_DEFINITIONS.get(suite_name)
if suite and suite.default_setup:
setup_name = suite.default_setup
if not setup_name:
return {}
setup = SETUP_DEFINITIONS.get(setup_name)
if not setup:
print(
f"Error: Unknown setup '{setup_name}'. Available: {', '.join(sorted(SETUP_DEFINITIONS.keys()))}",
file=sys.stderr,
)
sys.exit(1)
return setup.env
def main():
parser = argparse.ArgumentParser(description="Extract environment variables from a test setup")
parser.add_argument("--setup", help="Setup name (e.g., ollama, gpt)")
parser.add_argument("--suite", help="Suite name to get default setup from if --setup not provided")
parser.add_argument("--format", choices=["bash", "json"], default="bash", help="Output format (default: bash)")
args = parser.parse_args()
env_vars = get_setup_env_vars(args.setup, args.suite)
if args.format == "bash":
# Output as bash export statements
for key, value in env_vars.items():
print(f"export {key}='{value}'")
elif args.format == "json":
import json
print(json.dumps(env_vars))
if __name__ == "__main__":
main()
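A usage sketch (not part of the committed file): the helper is intended to be driven the same way `integration-tests.sh` drives it, run with `PYTHONPATH` pointing at the repo root so `tests.integration.suites` is importable, and its bash output applied with `eval`. The setup/suite names and the echoed variable below are illustrative.
```bash
# Illustrative invocation, mirroring the call in scripts/integration-tests.sh.
# The "base"/"ollama" names and the OLLAMA_URL value are examples only.
SETUP_ENV=$(PYTHONPATH=. python scripts/get_setup_env.py --suite base --setup ollama --format bash)
echo "$SETUP_ENV"   # e.g. export OLLAMA_URL='http://0.0.0.0:11434'
eval "$SETUP_ENV"   # apply the exports to the current shell
```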

View file

@ -14,8 +14,8 @@ set -euo pipefail
# Default values # Default values
BRANCH="" BRANCH=""
TEST_SUBDIRS="" TEST_SUBDIRS=""
TEST_PROVIDER="ollama" TEST_SETUP="ollama"
RUN_VISION_TESTS=false TEST_SUITE="base"
TEST_PATTERN="" TEST_PATTERN=""
# Help function # Help function
@ -27,24 +27,24 @@ Trigger the integration test recording workflow remotely. This way you do not ne
OPTIONS: OPTIONS:
-b, --branch BRANCH Branch to run the workflow on (defaults to current branch) -b, --branch BRANCH Branch to run the workflow on (defaults to current branch)
-s, --test-subdirs DIRS Comma-separated list of test subdirectories to run (REQUIRED) -t, --suite SUITE Test suite to use: base, responses, vision, etc. (default: base)
-p, --test-provider PROVIDER Test provider to use: vllm or ollama (default: ollama) -p, --setup SETUP Test setup to use: vllm, ollama, gpt, etc. (default: ollama)
-v, --run-vision-tests Include vision tests in the recording -s, --subdirs DIRS Comma-separated list of test subdirectories to run (overrides suite)
-k, --test-pattern PATTERN Regex pattern to pass to pytest -k -k, --pattern PATTERN Regex pattern to pass to pytest -k
-h, --help Show this help message -h, --help Show this help message
EXAMPLES: EXAMPLES:
# Record tests for current branch with agents subdirectory # Record tests for current branch with agents subdirectory
$0 --test-subdirs "agents" $0 --subdirs "agents"
# Record tests for specific branch with vision tests # Record tests for specific branch with vision tests
$0 -b my-feature-branch --test-subdirs "inference" --run-vision-tests $0 -b my-feature-branch --suite vision
# Record multiple test subdirectories with specific provider # Record multiple test subdirectories with specific setup
$0 --test-subdirs "agents,inference" --test-provider vllm $0 --subdirs "agents,inference" --setup vllm
# Record tests matching a specific pattern # Record tests matching a specific pattern
$0 --test-subdirs "inference" --test-pattern "test_streaming" $0 --subdirs "inference" --pattern "test_streaming"
EOF EOF
} }
@ -63,19 +63,19 @@ while [[ $# -gt 0 ]]; do
BRANCH="$2" BRANCH="$2"
shift 2 shift 2
;; ;;
-s|--test-subdirs) -s|--subdirs)
TEST_SUBDIRS="$2" TEST_SUBDIRS="$2"
shift 2 shift 2
;; ;;
-p|--test-provider) -p|--setup)
TEST_PROVIDER="$2" TEST_SETUP="$2"
shift 2 shift 2
;; ;;
-v|--run-vision-tests) -t|--suite)
RUN_VISION_TESTS=true TEST_SUITE="$2"
shift shift 2
;; ;;
-k|--test-pattern) -k|--pattern)
TEST_PATTERN="$2" TEST_PATTERN="$2"
shift 2 shift 2
;; ;;
@ -92,22 +92,17 @@ while [[ $# -gt 0 ]]; do
done done
# Validate required parameters # Validate required parameters
if [[ -z "$TEST_SUBDIRS" ]]; then if [[ -z "$TEST_SUBDIRS" && -z "$TEST_SUITE" ]]; then
echo "Error: --test-subdirs is required" echo "Error: --subdirs or --suite is required"
echo "Please specify which test subdirectories to run, e.g.:" echo "Please specify which test subdirectories to run or test suite to use, e.g.:"
echo " $0 --test-subdirs \"agents,inference\"" echo " $0 --subdirs \"agents,inference\""
echo " $0 --test-subdirs \"inference\" --run-vision-tests" echo " $0 --suite vision"
echo "" echo ""
exit 1 exit 1
fi fi
# Validate test provider # Validate test setup (optional - setups are validated by the workflow itself)
if [[ "$TEST_PROVIDER" != "vllm" && "$TEST_PROVIDER" != "ollama" ]]; then # Common setups: ollama, vllm, gpt, etc.
echo "❌ Error: Invalid test provider '$TEST_PROVIDER'"
echo " Supported providers: vllm, ollama"
echo " Example: $0 --test-subdirs \"agents\" --test-provider vllm"
exit 1
fi
# Check if required tools are installed # Check if required tools are installed
if ! command -v gh &> /dev/null; then if ! command -v gh &> /dev/null; then
@ -237,22 +232,25 @@ fi
# Build the workflow dispatch command # Build the workflow dispatch command
echo "Triggering integration test recording workflow..." echo "Triggering integration test recording workflow..."
echo "Branch: $BRANCH" echo "Branch: $BRANCH"
echo "Test provider: $TEST_PROVIDER" echo "Test setup: $TEST_SETUP"
echo "Test subdirs: $TEST_SUBDIRS" echo "Test subdirs: $TEST_SUBDIRS"
echo "Run vision tests: $RUN_VISION_TESTS" echo "Test suite: $TEST_SUITE"
echo "Test pattern: ${TEST_PATTERN:-"(none)"}" echo "Test pattern: ${TEST_PATTERN:-"(none)"}"
echo "" echo ""
# Prepare inputs for gh workflow run # Prepare inputs for gh workflow run
INPUTS="-f test-subdirs='$TEST_SUBDIRS'" INPUTS=
if [[ -n "$TEST_PROVIDER" ]]; then if [[ -n "$TEST_SUBDIRS" ]]; then
INPUTS="$INPUTS -f test-provider='$TEST_PROVIDER'" INPUTS="$INPUTS -f subdirs='$TEST_SUBDIRS'"
fi fi
if [[ "$RUN_VISION_TESTS" == "true" ]]; then if [[ -n "$TEST_SETUP" ]]; then
INPUTS="$INPUTS -f run-vision-tests=true" INPUTS="$INPUTS -f test-setup='$TEST_SETUP'"
fi
if [[ -n "$TEST_SUITE" ]]; then
INPUTS="$INPUTS -f suite='$TEST_SUITE'"
fi fi
if [[ -n "$TEST_PATTERN" ]]; then if [[ -n "$TEST_PATTERN" ]]; then
INPUTS="$INPUTS -f test-pattern='$TEST_PATTERN'" INPUTS="$INPUTS -f pattern='$TEST_PATTERN'"
fi fi
# Run the workflow # Run the workflow

View file

@ -13,10 +13,10 @@ set -euo pipefail
# Default values # Default values
STACK_CONFIG="" STACK_CONFIG=""
PROVIDER="" TEST_SUITE="base"
TEST_SETUP=""
TEST_SUBDIRS="" TEST_SUBDIRS=""
TEST_PATTERN="" TEST_PATTERN=""
RUN_VISION_TESTS="false"
INFERENCE_MODE="replay" INFERENCE_MODE="replay"
EXTRA_PARAMS="" EXTRA_PARAMS=""
@ -27,25 +27,30 @@ Usage: $0 [OPTIONS]
Options: Options:
--stack-config STRING Stack configuration to use (required) --stack-config STRING Stack configuration to use (required)
--provider STRING Provider to use (ollama, vllm, etc.) (required) --suite STRING Test suite to run (default: 'base')
--test-subdirs STRING Comma-separated list of test subdirectories to run (default: 'inference') --setup STRING Test setup (models, env) to use (e.g., 'ollama', 'ollama-vision', 'gpt', 'vllm')
--run-vision-tests Run vision tests instead of regular tests
--inference-mode STRING Inference mode: record or replay (default: replay) --inference-mode STRING Inference mode: record or replay (default: replay)
--test-pattern STRING Regex pattern to pass to pytest -k --subdirs STRING Comma-separated list of test subdirectories to run (overrides suite)
--pattern STRING Regex pattern to pass to pytest -k
--help Show this help message --help Show this help message
Suites are defined in tests/integration/suites.py and define which tests to run.
Setups are defined in tests/integration/suites.py and provide global configuration (models, env).
You can also specify subdirectories (of tests/integration) to select tests from, which will override the suite.
Examples: Examples:
# Basic inference tests with ollama # Basic inference tests with ollama
$0 --stack-config server:ci-tests --provider ollama $0 --stack-config server:ci-tests --suite base --setup ollama
# Multiple test directories with vllm # Multiple test directories with vllm
$0 --stack-config server:ci-tests --provider vllm --test-subdirs 'inference,agents' $0 --stack-config server:ci-tests --subdirs 'inference,agents' --setup vllm
# Vision tests with ollama # Vision tests with ollama
$0 --stack-config server:ci-tests --provider ollama --run-vision-tests $0 --stack-config server:ci-tests --suite vision # default setup for this suite is ollama-vision
# Record mode for updating test recordings # Record mode for updating test recordings
$0 --stack-config server:ci-tests --provider ollama --inference-mode record $0 --stack-config server:ci-tests --suite base --inference-mode record
EOF EOF
} }
@ -56,23 +61,23 @@ while [[ $# -gt 0 ]]; do
STACK_CONFIG="$2" STACK_CONFIG="$2"
shift 2 shift 2
;; ;;
--provider) --setup)
PROVIDER="$2" TEST_SETUP="$2"
shift 2 shift 2
;; ;;
--test-subdirs) --subdirs)
TEST_SUBDIRS="$2" TEST_SUBDIRS="$2"
shift 2 shift 2
;; ;;
--run-vision-tests) --suite)
RUN_VISION_TESTS="true" TEST_SUITE="$2"
shift shift 2
;; ;;
--inference-mode) --inference-mode)
INFERENCE_MODE="$2" INFERENCE_MODE="$2"
shift 2 shift 2
;; ;;
--test-pattern) --pattern)
TEST_PATTERN="$2" TEST_PATTERN="$2"
shift 2 shift 2
;; ;;
@ -96,18 +101,23 @@ if [[ -z "$STACK_CONFIG" ]]; then
exit 1 exit 1
fi fi
if [[ -z "$PROVIDER" ]]; then if [[ -z "$TEST_SETUP" && -n "$TEST_SUBDIRS" ]]; then
echo "Error: --provider is required" echo "Error: --test-setup is required when --test-subdirs is provided"
usage usage
exit 1 exit 1
fi fi
if [[ -z "$TEST_SUITE" && -z "$TEST_SUBDIRS" ]]; then
echo "Error: --test-suite or --test-subdirs is required"
exit 1
fi
echo "=== Llama Stack Integration Test Runner ===" echo "=== Llama Stack Integration Test Runner ==="
echo "Stack Config: $STACK_CONFIG" echo "Stack Config: $STACK_CONFIG"
echo "Provider: $PROVIDER" echo "Setup: $TEST_SETUP"
echo "Test Subdirs: $TEST_SUBDIRS"
echo "Vision Tests: $RUN_VISION_TESTS"
echo "Inference Mode: $INFERENCE_MODE" echo "Inference Mode: $INFERENCE_MODE"
echo "Test Suite: $TEST_SUITE"
echo "Test Subdirs: $TEST_SUBDIRS"
echo "Test Pattern: $TEST_PATTERN" echo "Test Pattern: $TEST_PATTERN"
echo "" echo ""
@ -122,31 +132,28 @@ echo ""
# Set environment variables # Set environment variables
export LLAMA_STACK_CLIENT_TIMEOUT=300 export LLAMA_STACK_CLIENT_TIMEOUT=300
export LLAMA_STACK_TEST_INFERENCE_MODE="$INFERENCE_MODE"
# Configure provider-specific settings
if [[ "$PROVIDER" == "ollama" ]]; then
export OLLAMA_URL="http://0.0.0.0:11434"
export TEXT_MODEL="ollama/llama3.2:3b-instruct-fp16"
export SAFETY_MODEL="ollama/llama-guard3:1b"
EXTRA_PARAMS="--safety-shield=llama-guard"
else
export VLLM_URL="http://localhost:8000/v1"
export TEXT_MODEL="vllm/meta-llama/Llama-3.2-1B-Instruct"
EXTRA_PARAMS=""
fi
THIS_DIR=$(dirname "$0") THIS_DIR=$(dirname "$0")
if [[ -n "$TEST_SETUP" ]]; then
EXTRA_PARAMS="--setup=$TEST_SETUP"
fi
# Apply setup-specific environment variables (needed for server startup and tests)
echo "=== Applying Setup Environment Variables ==="
# the server needs this
export LLAMA_STACK_TEST_INFERENCE_MODE="$INFERENCE_MODE"
SETUP_ENV=$(PYTHONPATH=$THIS_DIR/.. python "$THIS_DIR/get_setup_env.py" --suite "$TEST_SUITE" --setup "$TEST_SETUP" --format bash)
echo "Setting up environment variables:"
echo "$SETUP_ENV"
eval "$SETUP_ENV"
echo ""
ROOT_DIR="$THIS_DIR/.." ROOT_DIR="$THIS_DIR/.."
cd $ROOT_DIR cd $ROOT_DIR
# Set recording directory
if [[ "$RUN_VISION_TESTS" == "true" ]]; then
export LLAMA_STACK_TEST_RECORDING_DIR="tests/integration/recordings/vision"
else
export LLAMA_STACK_TEST_RECORDING_DIR="tests/integration/recordings"
fi
# check if "llama" and "pytest" are available. this script does not use `uv run` given # check if "llama" and "pytest" are available. this script does not use `uv run` given
# it can be used in a pre-release environment where we have not been able to tell # it can be used in a pre-release environment where we have not been able to tell
# uv about pre-release dependencies properly (yet). # uv about pre-release dependencies properly (yet).
@ -162,6 +169,18 @@ fi
# Start Llama Stack Server if needed # Start Llama Stack Server if needed
if [[ "$STACK_CONFIG" == *"server:"* ]]; then if [[ "$STACK_CONFIG" == *"server:"* ]]; then
stop_server() {
echo "Stopping Llama Stack Server..."
pids=$(lsof -i :8321 | awk 'NR>1 {print $2}')
if [[ -n "$pids" ]]; then
echo "Killing Llama Stack Server processes: $pids"
kill -9 $pids
else
echo "No Llama Stack Server processes found ?!"
fi
echo "Llama Stack Server stopped"
}
# check if server is already running # check if server is already running
if curl -s http://localhost:8321/v1/health 2>/dev/null | grep -q "OK"; then if curl -s http://localhost:8321/v1/health 2>/dev/null | grep -q "OK"; then
echo "Llama Stack Server is already running, skipping start" echo "Llama Stack Server is already running, skipping start"
@ -185,14 +204,16 @@ if [[ "$STACK_CONFIG" == *"server:"* ]]; then
done done
echo "" echo ""
fi fi
trap stop_server EXIT ERR INT TERM
fi fi
# Run tests # Run tests
echo "=== Running Integration Tests ===" echo "=== Running Integration Tests ==="
EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag" EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag"
# Additional exclusions for vllm provider # Additional exclusions for vllm setup
if [[ "$PROVIDER" == "vllm" ]]; then if [[ "$TEST_SETUP" == "vllm" ]]; then
EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls" EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls"
fi fi
@ -201,86 +222,50 @@ if [[ -n "$TEST_PATTERN" ]]; then
PYTEST_PATTERN="${PYTEST_PATTERN} and $TEST_PATTERN" PYTEST_PATTERN="${PYTEST_PATTERN} and $TEST_PATTERN"
fi fi
# Run vision tests if specified
if [[ "$RUN_VISION_TESTS" == "true" ]]; then
echo "Running vision tests..."
set +e
pytest -s -v tests/integration/inference/test_vision_inference.py \
--stack-config="$STACK_CONFIG" \
-k "$PYTEST_PATTERN" \
--vision-model=ollama/llama3.2-vision:11b \
--embedding-model=sentence-transformers/all-MiniLM-L6-v2 \
--color=yes $EXTRA_PARAMS \
--capture=tee-sys
exit_code=$?
set -e
if [ $exit_code -eq 0 ]; then
echo "✅ Vision tests completed successfully"
elif [ $exit_code -eq 5 ]; then
echo "⚠️ No vision tests collected (pattern matched no tests)"
else
echo "❌ Vision tests failed"
exit 1
fi
exit 0
fi
# Run regular tests
if [[ -z "$TEST_SUBDIRS" ]]; then
TEST_SUBDIRS=$(find tests/integration -maxdepth 1 -mindepth 1 -type d |
sed 's|tests/integration/||' |
grep -Ev "^(__pycache__|fixtures|test_cases|recordings|non_ci|post_training)$" |
sort)
fi
echo "Test subdirs to run: $TEST_SUBDIRS" echo "Test subdirs to run: $TEST_SUBDIRS"
# Collect all test files for the specified test types if [[ -n "$TEST_SUBDIRS" ]]; then
TEST_FILES="" # Collect all test files for the specified test types
for test_subdir in $(echo "$TEST_SUBDIRS" | tr ',' '\n'); do TEST_FILES=""
# Skip certain test types for vllm provider for test_subdir in $(echo "$TEST_SUBDIRS" | tr ',' '\n'); do
if [[ "$PROVIDER" == "vllm" ]]; then if [[ -d "tests/integration/$test_subdir" ]]; then
if [[ "$test_subdir" == "safety" ]] || [[ "$test_subdir" == "post_training" ]] || [[ "$test_subdir" == "tool_runtime" ]]; then # Find all Python test files in this directory
echo "Skipping $test_subdir for vllm provider" test_files=$(find tests/integration/$test_subdir -name "test_*.py" -o -name "*_test.py")
continue if [[ -n "$test_files" ]]; then
TEST_FILES="$TEST_FILES $test_files"
echo "Added test files from $test_subdir: $(echo $test_files | wc -w) files"
fi
else
echo "Warning: Directory tests/integration/$test_subdir does not exist"
fi fi
done
if [[ -z "$TEST_FILES" ]]; then
echo "No test files found for the specified test types"
exit 1
fi fi
if [[ "$STACK_CONFIG" != *"server:"* ]] && [[ "$test_subdir" == "batches" ]]; then echo ""
echo "Skipping $test_subdir for library client until types are supported" echo "=== Running all collected tests in a single pytest command ==="
continue echo "Total test files: $(echo $TEST_FILES | wc -w)"
fi
if [[ -d "tests/integration/$test_subdir" ]]; then PYTEST_TARGET="$TEST_FILES"
# Find all Python test files in this directory else
test_files=$(find tests/integration/$test_subdir -name "test_*.py" -o -name "*_test.py") PYTEST_TARGET="tests/integration/"
if [[ -n "$test_files" ]]; then EXTRA_PARAMS="$EXTRA_PARAMS --suite=$TEST_SUITE"
TEST_FILES="$TEST_FILES $test_files"
echo "Added test files from $test_subdir: $(echo $test_files | wc -w) files"
fi
else
echo "Warning: Directory tests/integration/$test_subdir does not exist"
fi
done
if [[ -z "$TEST_FILES" ]]; then
echo "No test files found for the specified test types"
exit 1
fi fi
echo ""
echo "=== Running all collected tests in a single pytest command ==="
echo "Total test files: $(echo $TEST_FILES | wc -w)"
set +e set +e
pytest -s -v $TEST_FILES \ set -x
pytest -s -v $PYTEST_TARGET \
--stack-config="$STACK_CONFIG" \ --stack-config="$STACK_CONFIG" \
--inference-mode="$INFERENCE_MODE" \
-k "$PYTEST_PATTERN" \ -k "$PYTEST_PATTERN" \
--text-model="$TEXT_MODEL" \ $EXTRA_PARAMS \
--embedding-model=sentence-transformers/all-MiniLM-L6-v2 \ --color=yes \
--color=yes $EXTRA_PARAMS \
--capture=tee-sys --capture=tee-sys
exit_code=$? exit_code=$?
set +x
set -e set -e
if [ $exit_code -eq 0 ]; then if [ $exit_code -eq 0 ]; then

View file

@ -38,26 +38,15 @@ For running integration tests, you must provide a few things:
- a distribution name (e.g., `starter`) or a path to a `run.yaml` file - a distribution name (e.g., `starter`) or a path to a `run.yaml` file
- a comma-separated list of api=provider pairs, e.g. `inference=fireworks,safety=llama-guard,agents=meta-reference`. This is most useful for testing a single API surface. - a comma-separated list of api=provider pairs, e.g. `inference=fireworks,safety=llama-guard,agents=meta-reference`. This is most useful for testing a single API surface.
- Whether you are using replay or live mode for inference. This is specified with the LLAMA_STACK_TEST_INFERENCE_MODE environment variable. The default mode currently is "live" -- that is certainly surprising, but we will fix this soon.
- Any API keys you need to use should be set in the environment, or can be passed in with the --env option. - Any API keys you need to use should be set in the environment, or can be passed in with the --env option.
You can run the integration tests in replay mode with: You can run the integration tests in replay mode with:
```bash ```bash
# Run all tests with existing recordings # Run all tests with existing recordings
LLAMA_STACK_TEST_INFERENCE_MODE=replay \
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
uv run --group test \ uv run --group test \
pytest -sv tests/integration/ --stack-config=starter pytest -sv tests/integration/ --stack-config=starter
``` ```
If you don't specify LLAMA_STACK_TEST_INFERENCE_MODE, by default it will be in "live" mode -- that is, it will make real API calls.
```bash
# Test against live APIs
FIREWORKS_API_KEY=your_key pytest -sv tests/integration/inference --stack-config=starter
```
### Re-recording tests ### Re-recording tests
#### Local Re-recording (Manual Setup Required) #### Local Re-recording (Manual Setup Required)
@ -66,7 +55,6 @@ If you want to re-record tests locally, you can do so with:
```bash ```bash
LLAMA_STACK_TEST_INFERENCE_MODE=record \ LLAMA_STACK_TEST_INFERENCE_MODE=record \
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
uv run --group test \ uv run --group test \
pytest -sv tests/integration/ --stack-config=starter -k "<appropriate test name>" pytest -sv tests/integration/ --stack-config=starter -k "<appropriate test name>"
``` ```
@ -89,7 +77,7 @@ You must be careful when re-recording. CI workflows assume a specific setup for
./scripts/github/schedule-record-workflow.sh --test-subdirs "agents,inference" ./scripts/github/schedule-record-workflow.sh --test-subdirs "agents,inference"
# Record with vision tests enabled # Record with vision tests enabled
./scripts/github/schedule-record-workflow.sh --test-subdirs "inference" --run-vision-tests ./scripts/github/schedule-record-workflow.sh --test-suite vision
# Record with specific provider # Record with specific provider
./scripts/github/schedule-record-workflow.sh --test-subdirs "agents" --test-provider vllm ./scripts/github/schedule-record-workflow.sh --test-subdirs "agents" --test-provider vllm

View file

@ -6,9 +6,7 @@ Integration tests verify complete workflows across different providers using Lla
```bash ```bash
# Run all integration tests with existing recordings # Run all integration tests with existing recordings
LLAMA_STACK_TEST_INFERENCE_MODE=replay \ uv run --group test \
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
uv run --group test \
pytest -sv tests/integration/ --stack-config=starter pytest -sv tests/integration/ --stack-config=starter
``` ```
@ -42,6 +40,37 @@ Model parameters can be influenced by the following options:
Each of these is a comma-separated list and can be used to generate multiple parameter combinations. Note that tests will be skipped Each of these is a comma-separated list and can be used to generate multiple parameter combinations. Note that tests will be skipped
if no model is specified. if no model is specified.
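For instance (the second model ID below is hypothetical and only illustrates the comma syntax), a run like this parameterizes the inference tests over two text models:
```bash
# Hypothetical example of comma-separated model options: each value becomes its
# own pytest parameterization, and tests are skipped when no model is supplied.
pytest -s -v tests/integration/inference --stack-config=starter \
  --text-model=ollama/llama3.2:3b-instruct-fp16,ollama/llama3.1:8b \
  --embedding-model=sentence-transformers/all-MiniLM-L6-v2
```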
### Suites and Setups
- `--suite`: single named suite that narrows which tests are collected.
- Available suites:
- `base`: collects most tests (excludes responses and post_training)
- `responses`: collects tests under `tests/integration/responses` (needs strong tool-calling models)
- `vision`: collects only `tests/integration/inference/test_vision_inference.py`
- `--setup`: global configuration that can be used with any suite. Setups prefill model/env defaults; explicit CLI flags always win.
- Available setups:
- `ollama`: Local Ollama provider with lightweight models (sets OLLAMA_URL, uses llama3.2:3b-instruct-fp16)
- `vllm`: VLLM provider for efficient local inference (sets VLLM_URL, uses Llama-3.2-1B-Instruct)
- `gpt`: OpenAI GPT models for high-quality responses (uses gpt-4o)
- `claude`: Anthropic Claude models for high-quality responses (uses claude-3-5-sonnet)
Examples
```bash
# Fast responses run with a strong tool-calling model
pytest -s -v tests/integration --stack-config=server:starter --suite=responses --setup=gpt
# Fast single-file vision run with Ollama defaults
pytest -s -v tests/integration --stack-config=server:starter --suite=vision --setup=ollama
# Base suite with VLLM for performance
pytest -s -v tests/integration --stack-config=server:starter --suite=base --setup=vllm
# Override a default from setup
pytest -s -v tests/integration --stack-config=server:starter \
--suite=responses --setup=gpt --embedding-model=text-embedding-3-small
```
## Examples ## Examples
### Testing against a Server ### Testing against a Server
@ -98,29 +127,24 @@ pytest -s -v tests/integration/vector_io/ \
The testing system supports three modes controlled by environment variables: The testing system supports three modes controlled by environment variables:
### LIVE Mode (Default) ### REPLAY Mode (Default)
Tests make real API calls: Uses cached responses instead of making API calls:
```bash ```bash
LLAMA_STACK_TEST_INFERENCE_MODE=live pytest tests/integration/ pytest tests/integration/
``` ```
### RECORD Mode ### RECORD Mode
Captures API interactions for later replay: Captures API interactions for later replay:
```bash ```bash
LLAMA_STACK_TEST_INFERENCE_MODE=record \ pytest tests/integration/inference/test_new_feature.py --inference-mode=record
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
pytest tests/integration/inference/test_new_feature.py
``` ```
### REPLAY Mode ### LIVE Mode
Uses cached responses instead of making API calls: Tests make real API calls (but not recorded):
```bash ```bash
LLAMA_STACK_TEST_INFERENCE_MODE=replay \ pytest tests/integration/ --inference-mode=live
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
pytest tests/integration/
``` ```
Note that right now you must specify the recording directory. This is because different tests use different recording directories and we don't (yet) have a fool-proof way to map a test to a recording directory. We are working on this. By default, the recording directory is `tests/integration/recordings`. You can override this by setting the `LLAMA_STACK_TEST_RECORDING_DIR` environment variable.
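For example (the path below is illustrative), a one-off recording pass can point the recorder at a scratch directory and later replay from it by exporting the same variable:
```bash
# Illustrative: record into a non-default directory.
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings/scratch \
  pytest -s -v tests/integration/inference --stack-config=server:starter --inference-mode=record
```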
## Managing Recordings ## Managing Recordings
@ -138,16 +162,14 @@ cat recordings/responses/abc123.json | jq '.'
#### Remote Re-recording (Recommended) #### Remote Re-recording (Recommended)
Use the automated workflow script for easier re-recording: Use the automated workflow script for easier re-recording:
```bash ```bash
./scripts/github/schedule-record-workflow.sh --test-subdirs "inference,agents" ./scripts/github/schedule-record-workflow.sh --subdirs "inference,agents"
``` ```
See the [main testing guide](../README.md#remote-re-recording-recommended) for full details. See the [main testing guide](../README.md#remote-re-recording-recommended) for full details.
#### Local Re-recording #### Local Re-recording
```bash ```bash
# Re-record specific tests # Re-record specific tests
LLAMA_STACK_TEST_INFERENCE_MODE=record \ pytest -s -v --stack-config=server:starter tests/integration/inference/test_modified.py --inference-mode=record
LLAMA_STACK_TEST_RECORDING_DIR=tests/integration/recordings \
pytest -s -v --stack-config=server:starter tests/integration/inference/test_modified.py
``` ```
Note that when re-recording tests, you must use a Stack pointing to a server (i.e., `server:starter`). This subtlety exists because the set of tests run in server are a superset of the set of tests run in the library client. Note that when re-recording tests, you must use a Stack pointing to a server (i.e., `server:starter`). This subtlety exists because the set of tests run in server are a superset of the set of tests run in the library client.

View file

@ -268,3 +268,58 @@ class TestBatchesIntegration:
deleted_error_file = openai_client.files.delete(final_batch.error_file_id) deleted_error_file = openai_client.files.delete(final_batch.error_file_id)
assert deleted_error_file.deleted, f"Error file {final_batch.error_file_id} was not deleted successfully" assert deleted_error_file.deleted, f"Error file {final_batch.error_file_id} was not deleted successfully"
def test_batch_e2e_completions(self, openai_client, batch_helper, text_model_id):
"""Run an end-to-end batch with a single successful text completion request."""
request_body = {"model": text_model_id, "prompt": "Say completions", "max_tokens": 20}
batch_requests = [
{
"custom_id": "success-1",
"method": "POST",
"url": "/v1/completions",
"body": request_body,
}
]
with batch_helper.create_file(batch_requests) as uploaded_file:
batch = openai_client.batches.create(
input_file_id=uploaded_file.id,
endpoint="/v1/completions",
completion_window="24h",
metadata={"test": "e2e_completions_success"},
)
final_batch = batch_helper.wait_for(
batch.id,
max_wait_time=3 * 60,
expected_statuses={"completed"},
timeout_action="skip",
)
assert final_batch.status == "completed"
assert final_batch.request_counts is not None
assert final_batch.request_counts.total == 1
assert final_batch.request_counts.completed == 1
assert final_batch.output_file_id is not None
output_content = openai_client.files.content(final_batch.output_file_id)
if isinstance(output_content, str):
output_text = output_content
else:
output_text = output_content.content.decode("utf-8")
output_lines = output_text.strip().split("\n")
assert len(output_lines) == 1
result = json.loads(output_lines[0])
assert result["custom_id"] == "success-1"
assert "response" in result
assert result["response"]["status_code"] == 200
deleted_output_file = openai_client.files.delete(final_batch.output_file_id)
assert deleted_output_file.deleted
if final_batch.error_file_id is not None:
deleted_error_file = openai_client.files.delete(final_batch.error_file_id)
assert deleted_error_file.deleted

View file

@ -6,15 +6,17 @@
import inspect import inspect
import itertools import itertools
import os import os
import platform
import textwrap import textwrap
import time import time
from pathlib import Path
import pytest import pytest
from dotenv import load_dotenv from dotenv import load_dotenv
from llama_stack.log import get_logger from llama_stack.log import get_logger
from .suites import SETUP_DEFINITIONS, SUITE_DEFINITIONS
logger = get_logger(__name__, category="tests") logger = get_logger(__name__, category="tests")
@ -30,6 +32,8 @@ def pytest_runtest_makereport(item, call):
def pytest_sessionstart(session): def pytest_sessionstart(session):
# stop macOS from complaining about duplicate OpenMP libraries # stop macOS from complaining about duplicate OpenMP libraries
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
if "LLAMA_STACK_TEST_INFERENCE_MODE" not in os.environ:
os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = "replay"
def pytest_runtest_teardown(item): def pytest_runtest_teardown(item):
@ -59,9 +63,36 @@ def pytest_configure(config):
key, value = env_var.split("=", 1) key, value = env_var.split("=", 1)
os.environ[key] = value os.environ[key] = value
if platform.system() == "Darwin": # Darwin is the system name for macOS inference_mode = config.getoption("--inference-mode")
os.environ["DISABLE_CODE_SANDBOX"] = "1" os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = inference_mode
logger.info("Setting DISABLE_CODE_SANDBOX=1 for macOS")
suite = config.getoption("--suite")
if suite:
if suite not in SUITE_DEFINITIONS:
raise pytest.UsageError(f"Unknown suite: {suite}. Available: {', '.join(sorted(SUITE_DEFINITIONS.keys()))}")
# Apply setups (global parameterizations): env + defaults
setup = config.getoption("--setup")
if suite and not setup:
setup = SUITE_DEFINITIONS[suite].default_setup
if setup:
if setup not in SETUP_DEFINITIONS:
raise pytest.UsageError(
f"Unknown setup '{setup}'. Available: {', '.join(sorted(SETUP_DEFINITIONS.keys()))}"
)
setup_obj = SETUP_DEFINITIONS[setup]
logger.info(f"Applying setup '{setup}'{' for suite ' + suite if suite else ''}")
# Apply env first
for k, v in setup_obj.env.items():
if k not in os.environ:
os.environ[k] = str(v)
# Apply defaults if not provided explicitly
for dest, value in setup_obj.defaults.items():
current = getattr(config.option, dest, None)
if not current:
setattr(config.option, dest, value)
def pytest_addoption(parser): def pytest_addoption(parser):
@ -103,16 +134,32 @@ def pytest_addoption(parser):
default=384, default=384,
help="Output dimensionality of the embedding model to use for testing. Default: 384", help="Output dimensionality of the embedding model to use for testing. Default: 384",
) )
parser.addoption( parser.addoption(
"--record-responses", "--inference-mode",
action="store_true", help="Inference mode: { record, replay, live } (default: replay)",
help="Record new API responses instead of using cached ones.", choices=["record", "replay", "live"],
default="replay",
) )
parser.addoption( parser.addoption(
"--report", "--report",
help="Path where the test report should be written, e.g. --report=/path/to/report.md", help="Path where the test report should be written, e.g. --report=/path/to/report.md",
) )
available_suites = ", ".join(sorted(SUITE_DEFINITIONS.keys()))
suite_help = (
f"Single test suite to run (narrows collection). Available: {available_suites}. Example: --suite=responses"
)
parser.addoption("--suite", help=suite_help)
# Global setups for any suite
available_setups = ", ".join(sorted(SETUP_DEFINITIONS.keys()))
setup_help = (
f"Global test setup configuration. Available: {available_setups}. "
"Can be used with any suite. Example: --setup=ollama"
)
parser.addoption("--setup", help=setup_help)
MODEL_SHORT_IDS = { MODEL_SHORT_IDS = {
"meta-llama/Llama-3.2-3B-Instruct": "3B", "meta-llama/Llama-3.2-3B-Instruct": "3B",
@ -195,3 +242,36 @@ def pytest_generate_tests(metafunc):
pytest_plugins = ["tests.integration.fixtures.common"] pytest_plugins = ["tests.integration.fixtures.common"]
def pytest_ignore_collect(path: str, config: pytest.Config) -> bool:
"""Skip collecting paths outside the selected suite roots for speed."""
suite = config.getoption("--suite")
if not suite:
return False
sobj = SUITE_DEFINITIONS.get(suite)
roots: list[str] = sobj.get("roots", []) if isinstance(sobj, dict) else getattr(sobj, "roots", [])
if not roots:
return False
p = Path(str(path)).resolve()
# Only constrain within tests/integration to avoid ignoring unrelated tests
integration_root = (Path(str(config.rootpath)) / "tests" / "integration").resolve()
if not p.is_relative_to(integration_root):
return False
for r in roots:
rp = (Path(str(config.rootpath)) / r).resolve()
if rp.is_file():
# Allow the exact file and any ancestor directories so pytest can walk into it.
if p == rp:
return False
if p.is_dir() and rp.is_relative_to(p):
return False
else:
# Allow anything inside an allowed directory
if p.is_relative_to(rp):
return False
return True

View file

@ -5,6 +5,8 @@
# the root directory of this source tree. # the root directory of this source tree.
import time
import pytest import pytest
from ..test_cases.test_case import TestCase from ..test_cases.test_case import TestCase
@ -35,6 +37,11 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
"remote::sambanova", "remote::sambanova",
"remote::tgi", "remote::tgi",
"remote::vertexai", "remote::vertexai",
# {"error":{"message":"Unknown request URL: GET /openai/v1/completions. Please check the URL for typos,
# or see the docs at https://console.groq.com/docs/","type":"invalid_request_error","code":"unknown_url"}}
"remote::groq",
"remote::gemini", # https://generativelanguage.googleapis.com/v1beta/openai/completions -> 404
"remote::anthropic", # at least claude-3-{5,7}-{haiku,sonnet}-* / claude-{sonnet,opus}-4-* are not supported
): ):
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.") pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI completions.")
@ -56,6 +63,26 @@ def skip_if_model_doesnt_support_suffix(client_with_models, model_id):
pytest.skip(f"Provider {provider.provider_type} doesn't support suffix.") pytest.skip(f"Provider {provider.provider_type} doesn't support suffix.")
def skip_if_doesnt_support_n(client_with_models, model_id):
    provider = provider_from_model(client_with_models, model_id)
    if provider.provider_type in (
        "remote::sambanova",
        "remote::ollama",
        # https://console.groq.com/docs/openai#currently-unsupported-openai-features
        # -> Error code: 400 - {'error': {'message': "'n' : number must be at most 1", 'type': 'invalid_request_error'}}
        "remote::groq",
        # Error code: 400 - [{'error': {'code': 400, 'message': 'Only one candidate can be specified in the
        # current model', 'status': 'INVALID_ARGUMENT'}}]
        "remote::gemini",
        # https://docs.anthropic.com/en/api/openai-sdk#simple-fields
        "remote::anthropic",
        "remote::vertexai",
        # Error code: 400 - [{'error': {'code': 400, 'message': 'Unable to submit request because candidateCount must be 1 but
        # the entered value was 2. Update the candidateCount value and try again.', 'status': 'INVALID_ARGUMENT'}
    ):
        pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support n param.")
def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id): def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, model_id):
provider = provider_from_model(client_with_models, model_id) provider = provider_from_model(client_with_models, model_id)
if provider.provider_type in ( if provider.provider_type in (
@ -260,10 +287,7 @@ def test_openai_chat_completion_streaming(compat_client, client_with_models, tex
) )
def test_openai_chat_completion_streaming_with_n(compat_client, client_with_models, text_model_id, test_case): def test_openai_chat_completion_streaming_with_n(compat_client, client_with_models, text_model_id, test_case):
skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id) skip_if_model_doesnt_support_openai_chat_completion(client_with_models, text_model_id)
skip_if_doesnt_support_n(client_with_models, text_model_id)
provider = provider_from_model(client_with_models, text_model_id)
if provider.provider_type == "remote::ollama":
pytest.skip(f"Model {text_model_id} hosted by {provider.provider_type} doesn't support n > 1.")
tc = TestCase(test_case) tc = TestCase(test_case)
question = tc["question"] question = tc["question"]
@ -323,8 +347,15 @@ def test_inference_store(compat_client, client_with_models, text_model_id, strea
response_id = response.id response_id = response.id
content = response.choices[0].message.content content = response.choices[0].message.content
    tries = 0
    while tries < 10:
        responses = client.chat.completions.list(limit=1000)
        if response_id in [r.id for r in responses.data]:
            break
        else:
            tries += 1
            time.sleep(0.1)
    assert tries < 10, f"Response {response_id} not found after 1 second"
retrieved_response = client.chat.completions.retrieve(response_id) retrieved_response = client.chat.completions.retrieve(response_id)
assert retrieved_response.id == response_id assert retrieved_response.id == response_id
@ -388,6 +419,18 @@ def test_inference_store_tool_calls(compat_client, client_with_models, text_mode
response_id = response.id response_id = response.id
content = response.choices[0].message.content content = response.choices[0].message.content
    # wait for the response to be stored
    tries = 0
    while tries < 10:
        responses = client.chat.completions.list(limit=1000)
        if response_id in [r.id for r in responses.data]:
            break
        else:
            tries += 1
            time.sleep(0.1)
    assert tries < 10, f"Response {response_id} not found after 1 second"
responses = client.chat.completions.list(limit=1000) responses = client.chat.completions.list(limit=1000)
assert response_id in [r.id for r in responses.data] assert response_id in [r.id for r in responses.data]
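Since the same polling pattern appears in both tests above, a small helper along these lines could factor it out; the function name and placement are hypothetical sketches, not part of this change.

def wait_for_stored_response(client, response_id, attempts=10, delay=0.1):
    """Poll the inference store until the response id shows up or attempts run out."""
    for _ in range(attempts):
        responses = client.chat.completions.list(limit=1000)
        if response_id in [r.id for r in responses.data]:
            return True
        time.sleep(delay)
    return False

# Usage inside a test (sketch):
# assert wait_for_stored_response(client, response_id), f"Response {response_id} not found after 1 second"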

View file

@ -20,15 +20,15 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama-guard3:1b", "model": "llama-guard3:1b",
"created_at": "2025-08-01T23:12:53.860911Z", "created_at": "2025-09-03T17:37:35.23084Z",
"done": true, "done": true,
"done_reason": "stop", "done_reason": "stop",
"total_duration": 249137667, "total_duration": 195981375,
"load_duration": 152509542, "load_duration": 110522917,
"prompt_eval_count": 216, "prompt_eval_count": 216,
"prompt_eval_duration": 71000000, "prompt_eval_duration": 72393958,
"eval_count": 2, "eval_count": 2,
"eval_duration": 24000000, "eval_duration": 11843000,
"response": "safe", "response": "safe",
"thinking": null, "thinking": null,
"context": null "context": null

View file

@ -21,7 +21,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:18.033900164Z", "created_at": "2025-09-03T17:41:43.950283Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -39,7 +39,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:18.213371151Z", "created_at": "2025-09-03T17:41:43.991122Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -57,7 +57,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:18.387513976Z", "created_at": "2025-09-03T17:41:44.031378Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -75,7 +75,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:18.564344287Z", "created_at": "2025-09-03T17:41:44.073098Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -93,7 +93,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:18.746579415Z", "created_at": "2025-09-03T17:41:44.115961Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -111,7 +111,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:18.923276047Z", "created_at": "2025-09-03T17:41:44.156517Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -129,7 +129,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:19.099961963Z", "created_at": "2025-09-03T17:41:44.197079Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -147,7 +147,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:19.275621884Z", "created_at": "2025-09-03T17:41:44.237565Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -165,7 +165,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:19.452204196Z", "created_at": "2025-09-03T17:41:44.277755Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -183,7 +183,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:19.626937514Z", "created_at": "2025-09-03T17:41:44.318476Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -201,7 +201,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:19.805566767Z", "created_at": "2025-09-03T17:41:44.358628Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -219,7 +219,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:19.985987477Z", "created_at": "2025-09-03T17:41:44.398984Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -237,7 +237,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:20.166458601Z", "created_at": "2025-09-03T17:41:44.439232Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -255,7 +255,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:20.343346795Z", "created_at": "2025-09-03T17:41:44.479478Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -273,7 +273,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:20.525008091Z", "created_at": "2025-09-03T17:41:44.520202Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -291,7 +291,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:20.709087695Z", "created_at": "2025-09-03T17:41:44.560517Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -309,7 +309,7 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:20.887074305Z", "created_at": "2025-09-03T17:41:44.601592Z",
"done": false, "done": false,
"done_reason": null, "done_reason": null,
"total_duration": null, "total_duration": null,
@ -327,15 +327,15 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"created_at": "2025-07-31T17:59:21.065244925Z", "created_at": "2025-09-03T17:41:44.642064Z",
"done": true, "done": true,
"done_reason": "stop", "done_reason": "stop",
"total_duration": 4373531496, "total_duration": 887142667,
"load_duration": 44438132, "load_duration": 119331417,
"prompt_eval_count": 56, "prompt_eval_count": 56,
"prompt_eval_duration": 1296273199, "prompt_eval_duration": 74294709,
"eval_count": 18, "eval_count": 18,
"eval_duration": 3032321735, "eval_duration": 692842791,
"response": "", "response": "",
"thinking": null, "thinking": null,
"context": null "context": null

View file

@ -20,15 +20,15 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama-guard3:1b", "model": "llama-guard3:1b",
"created_at": "2025-08-01T23:13:57.556416Z", "created_at": "2025-09-03T17:37:47.461886Z",
"done": true, "done": true,
"done_reason": "stop", "done_reason": "stop",
"total_duration": 432363250, "total_duration": 338927833,
"load_duration": 159296417, "load_duration": 100895125,
"prompt_eval_count": 223, "prompt_eval_count": 223,
"prompt_eval_duration": 257000000, "prompt_eval_duration": 221583042,
"eval_count": 2, "eval_count": 2,
"eval_duration": 14000000, "eval_duration": 12341416,
"response": "safe", "response": "safe",
"thinking": null, "thinking": null,
"context": null "context": null

View file

@ -24,7 +24,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-29", "id": "chatcmpl-414",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -39,7 +39,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1754090031, "created": 1756921333,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -50,7 +50,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-29", "id": "chatcmpl-414",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -65,7 +65,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1754090031, "created": 1756921333,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -76,7 +76,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-29", "id": "chatcmpl-414",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -91,7 +91,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1754090031, "created": 1756921333,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -102,7 +102,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-29", "id": "chatcmpl-414",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -117,7 +117,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1754090031, "created": 1756921333,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -128,7 +128,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-29", "id": "chatcmpl-414",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -143,7 +143,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1754090031, "created": 1756921334,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -154,7 +154,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-29", "id": "chatcmpl-414",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -169,7 +169,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1754090031, "created": 1756921334,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -180,7 +180,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-29", "id": "chatcmpl-414",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -195,7 +195,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1754090031, "created": 1756921334,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -206,7 +206,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-29", "id": "chatcmpl-414",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -221,7 +221,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1754090031, "created": 1756921334,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,

View file

@ -1,7 +1,7 @@
{ {
"request": { "request": {
"method": "POST", "method": "POST",
"url": "http://localhost:11434/v1/v1/chat/completions", "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {}, "headers": {},
"body": { "body": {
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
@ -20,14 +20,14 @@
"body": { "body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion", "__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": { "__data__": {
"id": "chatcmpl-368", "id": "chatcmpl-161",
"choices": [ "choices": [
{ {
"finish_reason": "stop", "finish_reason": "stop",
"index": 0, "index": 0,
"logprobs": null, "logprobs": null,
"message": { "message": {
"content": "Saturn is known for its extensive ring system.", "content": "The answer is Saturn.",
"refusal": null, "refusal": null,
"role": "assistant", "role": "assistant",
"annotations": null, "annotations": null,
@ -37,15 +37,15 @@
} }
} }
], ],
"created": 1754081853, "created": 1756921364,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion", "object": "chat.completion",
"service_tier": null, "service_tier": null,
"system_fingerprint": "fp_ollama", "system_fingerprint": "fp_ollama",
"usage": { "usage": {
"completion_tokens": 11, "completion_tokens": 6,
"prompt_tokens": 39, "prompt_tokens": 39,
"total_tokens": 50, "total_tokens": 45,
"completion_tokens_details": null, "completion_tokens_details": null,
"prompt_tokens_details": null "prompt_tokens_details": null
} }

View file

@ -20,15 +20,15 @@
"__type__": "ollama._types.GenerateResponse", "__type__": "ollama._types.GenerateResponse",
"__data__": { "__data__": {
"model": "llama-guard3:1b", "model": "llama-guard3:1b",
"created_at": "2025-08-01T23:12:51.682357Z", "created_at": "2025-09-03T17:37:33.473237Z",
"done": true, "done": true,
"done_reason": "stop", "done_reason": "stop",
"total_duration": 238161000, "total_duration": 279025042,
"load_duration": 72494750, "load_duration": 162673250,
"prompt_eval_count": 212, "prompt_eval_count": 212,
"prompt_eval_duration": 87000000, "prompt_eval_duration": 73595834,
"eval_count": 5, "eval_count": 5,
"eval_duration": 74000000, "eval_duration": 41950291,
"response": "unsafe\nS8", "response": "unsafe\nS8",
"thinking": null, "thinking": null,
"context": null "context": null

Some files were not shown because too many files have changed in this diff.