Merge branch 'main' into crewai

Kai Wu, 2025-09-11 09:27:27 -07:00, committed by GitHub
commit 4375764074
272 changed files with 28701 additions and 17713 deletions


@@ -2,26 +2,28 @@ name: 'Run and Record Tests'
 description: 'Run integration tests and handle recording/artifact upload'
 inputs:
-  test-subdirs:
-    description: 'Comma-separated list of test subdirectories to run'
-    required: true
-  test-pattern:
-    description: 'Regex pattern to pass to pytest -k'
-    required: false
-    default: ''
   stack-config:
     description: 'Stack configuration to use'
     required: true
-  provider:
-    description: 'Provider to use for tests'
-    required: true
+  setup:
+    description: 'Setup to use for tests (e.g., ollama, gpt, vllm)'
+    required: false
+    default: ''
   inference-mode:
     description: 'Inference mode (record or replay)'
     required: true
-  run-vision-tests:
-    description: 'Whether to run vision tests'
-    required: false
-    default: 'false'
+  suite:
+    description: 'Test suite to use: base, responses, vision, etc.'
+    required: false
+    default: ''
+  subdirs:
+    description: 'Comma-separated list of test subdirectories to run; overrides suite'
+    required: false
+    default: ''
+  pattern:
+    description: 'Regex pattern to pass to pytest -k'
+    required: false
+    default: ''

 runs:
   using: 'composite'

@@ -36,14 +38,23 @@ runs:
     - name: Run Integration Tests
       shell: bash
       run: |
-        uv run --no-sync ./scripts/integration-tests.sh \
-          --stack-config '${{ inputs.stack-config }}' \
-          --provider '${{ inputs.provider }}' \
-          --test-subdirs '${{ inputs.test-subdirs }}' \
-          --test-pattern '${{ inputs.test-pattern }}' \
-          --inference-mode '${{ inputs.inference-mode }}' \
-          ${{ inputs.run-vision-tests == 'true' && '--run-vision-tests' || '' }} \
-          | tee pytest-${{ inputs.inference-mode }}.log
+        SCRIPT_ARGS="--stack-config ${{ inputs.stack-config }} --inference-mode ${{ inputs.inference-mode }}"
+
+        # Add optional arguments only if they are provided
+        if [ -n '${{ inputs.setup }}' ]; then
+          SCRIPT_ARGS="$SCRIPT_ARGS --setup ${{ inputs.setup }}"
+        fi
+        if [ -n '${{ inputs.suite }}' ]; then
+          SCRIPT_ARGS="$SCRIPT_ARGS --suite ${{ inputs.suite }}"
+        fi
+        if [ -n '${{ inputs.subdirs }}' ]; then
+          SCRIPT_ARGS="$SCRIPT_ARGS --subdirs ${{ inputs.subdirs }}"
+        fi
+        if [ -n '${{ inputs.pattern }}' ]; then
+          SCRIPT_ARGS="$SCRIPT_ARGS --pattern ${{ inputs.pattern }}"
+        fi
+
+        uv run --no-sync ./scripts/integration-tests.sh $SCRIPT_ARGS | tee pytest-${{ inputs.inference-mode }}.log

     - name: Commit and push recordings

@@ -57,12 +68,7 @@ runs:
           echo "New recordings detected, committing and pushing"
           git add tests/integration/recordings/

-          if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
-            git commit -m "Recordings update from CI (vision)"
-          else
-            git commit -m "Recordings update from CI"
-          fi
+          git commit -m "Recordings update from CI (suite: ${{ inputs.suite }})"

           git fetch origin ${{ github.ref_name }}
           git rebase origin/${{ github.ref_name }}
           echo "Rebased successfully"


@ -1,17 +1,17 @@
name: Setup Ollama name: Setup Ollama
description: Start Ollama description: Start Ollama
inputs: inputs:
run-vision-tests: suite:
description: 'Run vision tests: "true" or "false"' description: 'Test suite to use: base, responses, vision, etc.'
required: false required: false
default: 'false' default: ''
runs: runs:
using: "composite" using: "composite"
steps: steps:
- name: Start Ollama - name: Start Ollama
shell: bash shell: bash
run: | run: |
if [ "${{ inputs.run-vision-tests }}" == "true" ]; then if [ "${{ inputs.suite }}" == "vision" ]; then
image="ollama-with-vision-model" image="ollama-with-vision-model"
else else
image="ollama-with-models" image="ollama-with-models"


@@ -8,14 +8,14 @@ inputs:
   client-version:
     description: 'Client version (latest or published)'
     required: true
-  provider:
-    description: 'Provider to setup (ollama or vllm)'
-    required: true
-    default: 'ollama'
-  run-vision-tests:
-    description: 'Whether to setup provider for vision tests'
+  setup:
+    description: 'Setup to configure (ollama, vllm, gpt, etc.)'
     required: false
-    default: 'false'
+    default: 'ollama'
+  suite:
+    description: 'Test suite to use: base, responses, vision, etc.'
+    required: false
+    default: ''
   inference-mode:
     description: 'Inference mode (record or replay)'
     required: true

@@ -30,13 +30,13 @@ runs:
         client-version: ${{ inputs.client-version }}

     - name: Setup ollama
-      if: ${{ inputs.provider == 'ollama' && inputs.inference-mode == 'record' }}
+      if: ${{ (inputs.setup == 'ollama' || inputs.setup == 'ollama-vision') && inputs.inference-mode == 'record' }}
       uses: ./.github/actions/setup-ollama
       with:
-        run-vision-tests: ${{ inputs.run-vision-tests }}
+        suite: ${{ inputs.suite }}

     - name: Setup vllm
-      if: ${{ inputs.provider == 'vllm' && inputs.inference-mode == 'record' }}
+      if: ${{ inputs.setup == 'vllm' && inputs.inference-mode == 'record' }}
       uses: ./.github/actions/setup-vllm

     - name: Build Llama Stack


@@ -5,10 +5,11 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a table
 | Name | File | Purpose |
 | ---- | ---- | ------- |
 | Update Changelog | [changelog.yml](changelog.yml) | Creates PR for updating the CHANGELOG.md |
+| API Conformance Tests | [conformance.yml](conformance.yml) | Run the API Conformance test suite on the changes. |
 | Installer CI | [install-script-ci.yml](install-script-ci.yml) | Test the installation script |
 | Integration Auth Tests | [integration-auth-tests.yml](integration-auth-tests.yml) | Run the integration test suite with Kubernetes authentication |
 | SqlStore Integration Tests | [integration-sql-store-tests.yml](integration-sql-store-tests.yml) | Run the integration test suite with SqlStore |
-| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suite from tests/integration in replay mode |
+| Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suites from tests/integration in replay mode |
 | Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
 | Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
 | Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |

.github/workflows/conformance.yml (new file)

@@ -0,0 +1,57 @@
# API Conformance Tests
# This workflow ensures that API changes maintain backward compatibility and don't break existing integrations
# It runs schema validation and OpenAPI diff checks to catch breaking changes early

name: API Conformance Tests

run-name: Run the API Conformance test suite on the changes.

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    types: [opened, synchronize, reopened]
    paths:
      - 'llama_stack/**'
      - '!llama_stack/ui/**'
      - 'tests/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - '.github/workflows/conformance.yml' # This workflow itself

concurrency:
  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
  # Cancel in-progress runs when new commits are pushed to avoid wasting CI resources
  cancel-in-progress: true

jobs:
  # Job to check if API schema changes maintain backward compatibility
  check-schema-compatibility:
    runs-on: ubuntu-latest
    steps:
      # Using specific version 4.1.7 because 5.0.0 fails when trying to run this locally using `act`
      # This ensures consistent behavior between local testing and CI
      - name: Checkout PR Code
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

      # Checkout the base branch to compare against (usually main)
      # This allows us to diff the current changes against the previous state
      - name: Checkout Base Branch
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
        with:
          ref: ${{ github.event.pull_request.base.ref }}
          path: 'base'

      # Install oasdiff: https://github.com/oasdiff/oasdiff, a tool for detecting breaking changes in OpenAPI specs.
      - name: Install oasdiff
        run: |
          curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh

      # Run oasdiff to detect breaking changes in the API specification
      # This step will fail if incompatible changes are detected, preventing breaking changes from being merged
      - name: Run OpenAPI Breaking Change Diff
        run: |
          oasdiff breaking --fail-on ERR base/docs/_static/llama-stack-spec.yaml docs/_static/llama-stack-spec.yaml --match-path '^/v1/openai/v1' \
            --match-path '^/v1/vector-io' \
            --match-path '^/v1/vector-dbs'
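The same check can be reproduced locally. A minimal sketch, assuming oasdiff is already on PATH (e.g., via the install script above) and the base branch's spec is checked out under base/ as the workflow does:

```bash
# Compare the base spec against the working-tree spec; exits non-zero on
# breaking changes at ERR level, mirroring the CI step above.
oasdiff breaking --fail-on ERR \
  base/docs/_static/llama-stack-spec.yaml \
  docs/_static/llama-stack-spec.yaml \
  --match-path '^/v1/openai/v1'
```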


@@ -1,6 +1,6 @@
 name: Integration Tests (Replay)

-run-name: Run the integration test suite from tests/integration in replay mode
+run-name: Run the integration test suites from tests/integration in replay mode

 on:
   push:

@@ -28,18 +28,10 @@ on:
         description: 'Test against both the latest and published versions'
         type: boolean
         default: false
-      test-provider:
-        description: 'Test against a specific provider'
+      test-setup:
+        description: 'Test against a specific setup'
         type: string
         default: 'ollama'
-      test-subdirs:
-        description: 'Comma-separated list of test subdirectories to run'
-        type: string
-        default: ''
-      test-pattern:
-        description: 'Regex pattern to pass to pytest -k'
-        type: string
-        default: ''

 concurrency:
   # Skip concurrency for pushes to main - each commit should be tested independently

@@ -50,18 +42,18 @@ jobs:
   run-replay-mode-tests:
     runs-on: ubuntu-latest
-    name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}
+    name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.setup, matrix.python-version, matrix.client-version, matrix.suite) }}

     strategy:
       fail-fast: false
       matrix:
         client-type: [library, server]
-        # Use vllm on weekly schedule, otherwise use test-provider input (defaults to ollama)
-        provider: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-provider || 'ollama')) }}
+        # Use vllm on weekly schedule, otherwise use test-setup input (defaults to ollama)
+        setup: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-setup || 'ollama')) }}
         # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
         python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
         client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
-        run-vision-tests: [true, false]
+        suite: [base, vision]

     steps:
       - name: Checkout repository

@@ -72,16 +64,14 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
           client-version: ${{ matrix.client-version }}
-          provider: ${{ matrix.provider }}
-          run-vision-tests: ${{ matrix.run-vision-tests }}
+          setup: ${{ matrix.setup }}
+          suite: ${{ matrix.suite }}
           inference-mode: 'replay'

       - name: Run tests
         uses: ./.github/actions/run-and-record-tests
         with:
-          test-subdirs: ${{ inputs.test-subdirs }}
-          test-pattern: ${{ inputs.test-pattern }}
           stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
-          provider: ${{ matrix.provider }}
+          setup: ${{ matrix.setup }}
           inference-mode: 'replay'
-          run-vision-tests: ${{ matrix.run-vision-tests }}
+          suite: ${{ matrix.suite }}


@@ -28,7 +28,7 @@ jobs:
           fetch-depth: ${{ github.actor == 'dependabot[bot]' && 0 || 1 }}

       - name: Set up Python
-        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
         with:
           python-version: '3.12'
           cache: pip

@@ -37,7 +37,7 @@ jobs:
             .pre-commit-config.yaml

       - name: Set up Node.js
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+        uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
         with:
           node-version: '20'
           cache: 'npm'

@@ -48,7 +48,6 @@ jobs:
         working-directory: llama_stack/ui

       - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
-        continue-on-error: true
         env:
           SKIP: no-commit-to-branch
           RUFF_OUTPUT_FORMAT: github


@@ -24,7 +24,7 @@ jobs:
         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Install uv
-        uses: astral-sh/setup-uv@4959332f0f014c5280e7eac8b70c90cb574c9f9b # v6.6.0
+        uses: astral-sh/setup-uv@557e51de59eb14aaaba2ed9621916900a91d50c6 # v6.6.1
         with:
           python-version: ${{ matrix.python-version }}
           activate-environment: true


@@ -10,19 +10,19 @@ run-name: Run the integration test suite from tests/integration
 on:
   workflow_dispatch:
     inputs:
-      test-subdirs:
-        description: 'Comma-separated list of test subdirectories to run'
-        type: string
-        default: ''
-      test-provider:
-        description: 'Test against a specific provider'
+      test-setup:
+        description: 'Test against a specific setup'
         type: string
         default: 'ollama'
-      run-vision-tests:
-        description: 'Whether to run vision tests'
-        type: boolean
-        default: false
-      test-pattern:
+      suite:
+        description: 'Test suite to use: base, responses, vision, etc.'
+        type: string
+        default: ''
+      subdirs:
+        description: 'Comma-separated list of test subdirectories to run; overrides suite'
+        type: string
+        default: ''
+      pattern:
         description: 'Regex pattern to pass to pytest -k'
         type: string
         default: ''

@@ -38,11 +38,11 @@ jobs:
       - name: Echo workflow inputs
         run: |
           echo "::group::Workflow Inputs"
-          echo "test-subdirs: ${{ inputs.test-subdirs }}"
-          echo "test-provider: ${{ inputs.test-provider }}"
-          echo "run-vision-tests: ${{ inputs.run-vision-tests }}"
-          echo "test-pattern: ${{ inputs.test-pattern }}"
           echo "branch: ${{ github.ref_name }}"
+          echo "test-setup: ${{ inputs.test-setup }}"
+          echo "suite: ${{ inputs.suite }}"
+          echo "subdirs: ${{ inputs.subdirs }}"
+          echo "pattern: ${{ inputs.pattern }}"
           echo "::endgroup::"

       - name: Checkout repository

@@ -55,16 +55,16 @@ jobs:
         with:
           python-version: "3.12"  # Use single Python version for recording
           client-version: "latest"
-          provider: ${{ inputs.test-provider || 'ollama' }}
-          run-vision-tests: ${{ inputs.run-vision-tests }}
+          setup: ${{ inputs.test-setup || 'ollama' }}
+          suite: ${{ inputs.suite }}
           inference-mode: 'record'

       - name: Run and record tests
         uses: ./.github/actions/run-and-record-tests
         with:
-          test-pattern: ${{ inputs.test-pattern }}
-          test-subdirs: ${{ inputs.test-subdirs }}
           stack-config: 'server:ci-tests'  # recording must be done with server since more tests are run
-          provider: ${{ inputs.test-provider || 'ollama' }}
+          setup: ${{ inputs.test-setup || 'ollama' }}
           inference-mode: 'record'
-          run-vision-tests: ${{ inputs.run-vision-tests }}
+          suite: ${{ inputs.suite }}
+          subdirs: ${{ inputs.subdirs }}
+          pattern: ${{ inputs.pattern }}
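Since these are workflow_dispatch inputs, a recording run can be kicked off from the CLI. A minimal sketch with the GitHub CLI; the workflow file name is assumed (it is not shown in this diff), while the input names come from the inputs block above:

```bash
# Trigger a recording run with the new-style inputs (workflow file name assumed):
gh workflow run record-integration-tests.yml \
  -f test-setup=ollama \
  -f suite=vision \
  -f pattern='not test_safety'
```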


@@ -24,7 +24,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Stale Action
-        uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
+        uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
         with:
           stale-issue-label: 'stale'
           stale-issue-message: >


@@ -29,7 +29,7 @@ jobs:
         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0

       - name: Setup Node.js
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
+        uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
         with:
           node-version: ${{ matrix.node-version }}
           cache: 'npm'

.gitignore

@@ -26,5 +26,7 @@ venv/
 pytest-report.xml
 .coverage
 .python-version
+AGENTS.md
+server.log
 CLAUDE.md
 .claude/


@@ -86,7 +86,7 @@ repos:
         language: python
         pass_filenames: false
         require_serial: true
-        files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
+        files: ^llama_stack/distributions/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
       - id: provider-codegen
         name: Provider Codegen
         additional_dependencies:


@@ -1,5 +1,103 @@
# Changelog
# v0.2.20
Published on: 2025-08-29T22:25:32Z
Here are some key changes that are coming as part of this release.
### Build and Environment
- Environment improvements: fixed env var replacement to preserve types.
- Docker stability: fixed container startup failures for Fireworks AI provider.
- Removed absolute paths in build for better portability.
### Features
- UI Enhancements: Implemented file upload and VectorDB creation/configuration directly in UI.
- Vector Store Improvements: Added keyword, vector, and hybrid search inside vector store.
- Added S3 authorization support for file providers.
- SQL Store: Added inequality support to where clause.
### Documentation
- Fixed post-training docs.
- Added Contributor Guidelines for creating Internal vs. External providers.
### Fixes
- Removed unsupported bfcl scoring function.
- Multiple reliability and configuration fixes for providers and environment handling.
### Engineering / Chores
- Cleaner internal development setup with consistent paths.
- Incremental improvements to provider integration and vector store behavior.
### New Contributors
- @omertuc made their first contribution in #3270
- @r3v5 made their first contribution in vector store hybrid search
---
# v0.2.19
Published on: 2025-08-26T22:06:55Z
## Highlights
* feat: Add CORS configuration support for server by @skamenan7 in https://github.com/llamastack/llama-stack/pull/3201
* feat(api): introduce /rerank by @ehhuang in https://github.com/llamastack/llama-stack/pull/2940
* feat: Add S3 Files Provider by @mattf in https://github.com/llamastack/llama-stack/pull/3202
---
# v0.2.18
Published on: 2025-08-20T01:09:27Z
## Highlights
* Add moderations create API
* Hybrid search in Milvus
* Numerous Responses API improvements
* Documentation updates
---
# v0.2.17
Published on: 2025-08-05T01:51:14Z
## Highlights
* feat(tests): introduce inference record/replay to increase test reliability by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2941
* fix(library_client): improve initialization error handling and prevent AttributeError by @mattf in https://github.com/meta-llama/llama-stack/pull/2944
* fix: use OLLAMA_URL to activate Ollama provider in starter by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2963
* feat(UI): adding MVP playground UI by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/2828
* Standardization of errors (@nathan-weinberg)
* feat: Enable DPO training with HuggingFace inline provider by @Nehanth in https://github.com/meta-llama/llama-stack/pull/2825
* chore: rename templates to distributions by @ashwinb in https://github.com/meta-llama/llama-stack/pull/3035
---
# v0.2.16
Published on: 2025-07-28T23:35:23Z
## Highlights
* Automatic model registration for self-hosted providers (ollama and vllm currently). No need for `INFERENCE_MODEL` environment variables which need to be updated, etc.
* Much simplified starter distribution. Most `ENABLE_` env variables are now gone. When you set `VLLM_URL`, the `vllm` provider is auto-enabled. Similar for `MILVUS_URL`, `PGVECTOR_DB`, etc. Check the [run.yaml](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/templates/starter/run.yaml) for more details.
* All tests migrated to pytest now (thanks @Elbehery)
* DPO implementation in the post-training provider (thanks @Nehanth)
* (Huge!) Support for external APIs and providers thereof (thanks @leseb, @cdoern and others). This is a really big deal -- you can now add more APIs completely out of tree and experiment with them before (optionally) wanting to contribute back.
* `inline::vllm` provider is gone thank you very much
* several improvements to OpenAI inference implementations and LiteLLM backend (thanks @mattf)
* Chroma now supports Vector Store API (thanks @franciscojavierarceo).
* Authorization improvements: Vector Store/File APIs now supports access control (thanks @franciscojavierarceo); Telemetry read APIs are gated according to logged-in user's roles.
---
# v0.2.15
Published on: 2025-07-16T03:30:01Z


@@ -34,13 +34,12 @@ This data enables data-driven architectural decisions and performance optimization
 **1. Deploy base k8s infrastructure:**
 ```bash
-cd ../k8s
+cd ../../docs/source/distributions/k8s
 ./apply.sh
 ```

 **2. Deploy benchmark components:**
 ```bash
-cd ../k8s-benchmark
 ./apply.sh
 ```

@@ -56,7 +55,6 @@ kubectl get pods
 **Benchmark Llama Stack (default):**
 ```bash
-cd docs/source/distributions/k8s-benchmark/
 ./run-benchmark.sh
 ```


@@ -14,7 +14,7 @@ import os
 import random
 import statistics
 import time
-from typing import Tuple

 import aiohttp

@@ -55,10 +55,50 @@ class BenchmarkStats:
         total_time = self.end_time - self.start_time
         success_rate = (self.success_count / self.total_requests) * 100

-        print(f"\n{'='*60}")
-        print(f"BENCHMARK RESULTS")
-        print(f"{'='*60}")
+        print(f"\n{'=' * 60}")
+        print("BENCHMARK RESULTS")
+
+        print("\nResponse Time Statistics:")
+        print(f"  Mean: {statistics.mean(self.response_times):.3f}s")
+        print(f"  Median: {statistics.median(self.response_times):.3f}s")
+        print(f"  Min: {min(self.response_times):.3f}s")
+        print(f"  Max: {max(self.response_times):.3f}s")
+
+        if len(self.response_times) > 1:
+            print(f"  Std Dev: {statistics.stdev(self.response_times):.3f}s")
+
+        percentiles = [50, 90, 95, 99]
+        sorted_times = sorted(self.response_times)
+        print("\nPercentiles:")
+        for p in percentiles:
+            idx = int(len(sorted_times) * p / 100) - 1
+            idx = max(0, min(idx, len(sorted_times) - 1))
+            print(f"  P{p}: {sorted_times[idx]:.3f}s")
+
+        if self.ttft_times:
+            print("\nTime to First Token (TTFT) Statistics:")
+            print(f"  Mean: {statistics.mean(self.ttft_times):.3f}s")
+            print(f"  Median: {statistics.median(self.ttft_times):.3f}s")
+            print(f"  Min: {min(self.ttft_times):.3f}s")
+            print(f"  Max: {max(self.ttft_times):.3f}s")
+
+            if len(self.ttft_times) > 1:
+                print(f"  Std Dev: {statistics.stdev(self.ttft_times):.3f}s")
+
+            sorted_ttft = sorted(self.ttft_times)
+            print("\nTTFT Percentiles:")
+            for p in percentiles:
+                idx = int(len(sorted_ttft) * p / 100) - 1
+                idx = max(0, min(idx, len(sorted_ttft) - 1))
+                print(f"  P{p}: {sorted_ttft[idx]:.3f}s")
+
+        if self.chunks_received:
+            print("\nStreaming Statistics:")
+            print(f"  Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
+            print(f"  Total chunks received: {sum(self.chunks_received)}")
+
+        print(f"{'=' * 60}")
         print(f"Total time: {total_time:.2f}s")
         print(f"Concurrent users: {self.concurrent_users}")
         print(f"Total requests: {self.total_requests}")

@@ -66,55 +106,16 @@ class BenchmarkStats:
         print(f"Failed requests: {len(self.errors)}")
         print(f"Success rate: {success_rate:.1f}%")
         print(f"Requests per second: {self.success_count / total_time:.2f}")
-        print(f"\nResponse Time Statistics:")
-        print(f"  Mean: {statistics.mean(self.response_times):.3f}s")
-        print(f"  Median: {statistics.median(self.response_times):.3f}s")
-        print(f"  Min: {min(self.response_times):.3f}s")
-        print(f"  Max: {max(self.response_times):.3f}s")
-        if len(self.response_times) > 1:
-            print(f"  Std Dev: {statistics.stdev(self.response_times):.3f}s")
-        percentiles = [50, 90, 95, 99]
-        sorted_times = sorted(self.response_times)
-        print(f"\nPercentiles:")
-        for p in percentiles:
-            idx = int(len(sorted_times) * p / 100) - 1
-            idx = max(0, min(idx, len(sorted_times) - 1))
-            print(f"  P{p}: {sorted_times[idx]:.3f}s")
-        if self.ttft_times:
-            print(f"\nTime to First Token (TTFT) Statistics:")
-            print(f"  Mean: {statistics.mean(self.ttft_times):.3f}s")
-            print(f"  Median: {statistics.median(self.ttft_times):.3f}s")
-            print(f"  Min: {min(self.ttft_times):.3f}s")
-            print(f"  Max: {max(self.ttft_times):.3f}s")
-            if len(self.ttft_times) > 1:
-                print(f"  Std Dev: {statistics.stdev(self.ttft_times):.3f}s")
-            sorted_ttft = sorted(self.ttft_times)
-            print(f"\nTTFT Percentiles:")
-            for p in percentiles:
-                idx = int(len(sorted_ttft) * p / 100) - 1
-                idx = max(0, min(idx, len(sorted_ttft) - 1))
-                print(f"  P{p}: {sorted_ttft[idx]:.3f}s")
-        if self.chunks_received:
-            print(f"\nStreaming Statistics:")
-            print(f"  Mean chunks per response: {statistics.mean(self.chunks_received):.1f}")
-            print(f"  Total chunks received: {sum(self.chunks_received)}")

         if self.errors:
-            print(f"\nErrors (showing first 5):")
+            print("\nErrors (showing first 5):")
             for error in self.errors[:5]:
                 print(f"  {error}")


 class LlamaStackBenchmark:
     def __init__(self, base_url: str, model_id: str):
-        self.base_url = base_url.rstrip('/')
+        self.base_url = base_url.rstrip("/")
         self.model_id = model_id
         self.headers = {"Content-Type": "application/json"}
         self.test_messages = [

@@ -125,74 +126,67 @@ class LlamaStackBenchmark:
             [
                 {"role": "user", "content": "What is machine learning?"},
                 {"role": "assistant", "content": "Machine learning is a subset of AI..."},
-                {"role": "user", "content": "Can you give me a practical example?"}
-            ]
+                {"role": "user", "content": "Can you give me a practical example?"},
+            ],
         ]

-    async def make_async_streaming_request(self) -> Tuple[float, int, float | None, str | None]:
+    async def make_async_streaming_request(self) -> tuple[float, int, float | None, str | None]:
         """Make a single async streaming chat completion request."""
         messages = random.choice(self.test_messages)
-        payload = {
-            "model": self.model_id,
-            "messages": messages,
-            "stream": True,
-            "max_tokens": 100
-        }
+        payload = {"model": self.model_id, "messages": messages, "stream": True, "max_tokens": 100}

         start_time = time.time()
         chunks_received = 0
         ttft = None
         error = None

         session = aiohttp.ClientSession()
         try:
             async with session.post(
                 f"{self.base_url}/chat/completions",
                 headers=self.headers,
                 json=payload,
-                timeout=aiohttp.ClientTimeout(total=30)
+                timeout=aiohttp.ClientTimeout(total=30),
             ) as response:
                 if response.status == 200:
                     async for line in response.content:
                         if line:
-                            line_str = line.decode('utf-8').strip()
-                            if line_str.startswith('data: '):
+                            line_str = line.decode("utf-8").strip()
+                            if line_str.startswith("data: "):
                                 chunks_received += 1
                                 if ttft is None:
                                     ttft = time.time() - start_time
-                                if line_str == 'data: [DONE]':
+                                if line_str == "data: [DONE]":
                                     break

                     if chunks_received == 0:
                         error = "No streaming chunks received"
                 else:
                     text = await response.text()
                     error = f"HTTP {response.status}: {text[:100]}"

         except Exception as e:
             error = f"Request error: {str(e)}"
         finally:
             await session.close()

         response_time = time.time() - start_time
         return response_time, chunks_received, ttft, error

     async def run_benchmark(self, duration: int, concurrent_users: int) -> BenchmarkStats:
         """Run benchmark using async requests for specified duration."""
         stats = BenchmarkStats()
         stats.concurrent_users = concurrent_users
         stats.start_time = time.time()

         print(f"Starting benchmark: {duration}s duration, {concurrent_users} concurrent users")
         print(f"Target URL: {self.base_url}/chat/completions")
         print(f"Model: {self.model_id}")

         connector = aiohttp.TCPConnector(limit=concurrent_users)
-        async with aiohttp.ClientSession(connector=connector) as session:
+        async with aiohttp.ClientSession(connector=connector):

             async def worker(worker_id: int):
                 """Worker that sends requests sequentially until canceled."""
                 request_count = 0

@@ -201,12 +195,12 @@ class LlamaStackBenchmark:
                         response_time, chunks, ttft, error = await self.make_async_streaming_request()
                         await stats.add_result(response_time, chunks, ttft, error)
                         request_count += 1
                     except asyncio.CancelledError:
                         break
                     except Exception as e:
                         await stats.add_result(0, 0, None, f"Worker {worker_id} error: {str(e)}")

             # Progress reporting task
             async def progress_reporter():
                 last_report_time = time.time()

@@ -215,48 +209,52 @@ class LlamaStackBenchmark:
                     await asyncio.sleep(1)  # Report every second
                     if time.time() >= last_report_time + 10:  # Report every 10 seconds
                         elapsed = time.time() - stats.start_time
-                        print(f"Completed: {stats.total_requests} requests in {elapsed:.1f}s")
+                        print(
+                            f"Completed: {stats.total_requests} requests in {elapsed:.1f}s, RPS: {stats.total_requests / elapsed:.1f}"
+                        )
                         last_report_time = time.time()
                 except asyncio.CancelledError:
                     break

             # Spawn concurrent workers
             tasks = [asyncio.create_task(worker(i)) for i in range(concurrent_users)]
             progress_task = asyncio.create_task(progress_reporter())
             tasks.append(progress_task)

             # Wait for duration then cancel all tasks
             await asyncio.sleep(duration)
             for task in tasks:
                 task.cancel()

             # Wait for all tasks to complete
             await asyncio.gather(*tasks, return_exceptions=True)

         stats.end_time = time.time()
         return stats


 def main():
     parser = argparse.ArgumentParser(description="Llama Stack Benchmark Tool")
-    parser.add_argument("--base-url", default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
-                        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)")
-    parser.add_argument("--model", default=os.getenv("INFERENCE_MODEL", "test-model"),
-                        help="Model ID to use for requests")
-    parser.add_argument("--duration", type=int, default=60,
-                        help="Duration in seconds to run benchmark (default: 60)")
-    parser.add_argument("--concurrent", type=int, default=10,
-                        help="Number of concurrent users (default: 10)")
+    parser.add_argument(
+        "--base-url",
+        default=os.getenv("BENCHMARK_BASE_URL", "http://localhost:8000/v1/openai/v1"),
+        help="Base URL for the API (default: http://localhost:8000/v1/openai/v1)",
+    )
+    parser.add_argument(
+        "--model", default=os.getenv("INFERENCE_MODEL", "test-model"), help="Model ID to use for requests"
+    )
+    parser.add_argument("--duration", type=int, default=60, help="Duration in seconds to run benchmark (default: 60)")
+    parser.add_argument("--concurrent", type=int, default=10, help="Number of concurrent users (default: 10)")

     args = parser.parse_args()

     benchmark = LlamaStackBenchmark(args.base_url, args.model)

     try:
         stats = asyncio.run(benchmark.run_benchmark(args.duration, args.concurrent))
         stats.print_summary()
     except KeyboardInterrupt:
         print("\nBenchmark interrupted by user")
     except Exception as e:
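For completeness, a sketch of how this script is typically driven; the file name benchmark.py is assumed (it is not shown in this diff), while the flags and defaults come from the argparse block above:

```bash
# 60-second run with 10 concurrent workers against a local OpenAI-compatible endpoint:
python benchmark.py \
  --base-url http://localhost:8000/v1/openai/v1 \
  --model meta-llama/Llama-3.2-3B-Instruct \
  --duration 60 \
  --concurrent 10
```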


@@ -11,180 +11,192 @@ OpenAI-compatible mock server that returns:
 - Valid OpenAI-formatted chat completion responses with dynamic content
 """

-from flask import Flask, request, jsonify, Response
-import time
-import random
-import uuid
-import json
 import argparse
+import json
 import os
+import random
+import time
+import uuid
+
+from flask import Flask, Response, jsonify, request

 app = Flask(__name__)

+
 # Models from environment variables
 def get_models():
     models_str = os.getenv("MOCK_MODELS", "meta-llama/Llama-3.2-3B-Instruct")
     model_ids = [m.strip() for m in models_str.split(",") if m.strip()]

     return {
         "object": "list",
         "data": [
-            {
-                "id": model_id,
-                "object": "model",
-                "created": 1234567890,
-                "owned_by": "vllm"
-            }
-            for model_id in model_ids
-        ]
+            {"id": model_id, "object": "model", "created": 1234567890, "owned_by": "vllm"} for model_id in model_ids
+        ],
     }

+
 def generate_random_text(length=50):
     """Generate random but coherent text for responses."""
     words = [
-        "Hello", "there", "I'm", "an", "AI", "assistant", "ready", "to", "help", "you",
-        "with", "your", "questions", "and", "tasks", "today", "Let", "me","know", "what",
-        "you'd", "like", "to", "discuss", "or", "explore", "together", "I", "can", "assist",
-        "with", "various", "topics", "including", "coding", "writing", "analysis", "and", "more"
+        "Hello",
+        "there",
+        "I'm",
+        "an",
+        "AI",
+        "assistant",
+        "ready",
+        "to",
+        "help",
+        "you",
+        "with",
+        "your",
+        "questions",
+        "and",
+        "tasks",
+        "today",
+        "Let",
+        "me",
+        "know",
+        "what",
+        "you'd",
+        "like",
+        "to",
+        "discuss",
+        "or",
+        "explore",
+        "together",
+        "I",
+        "can",
+        "assist",
+        "with",
+        "various",
+        "topics",
+        "including",
+        "coding",
+        "writing",
+        "analysis",
+        "and",
+        "more",
     ]
     return " ".join(random.choices(words, k=length))

-@app.route('/v1/models', methods=['GET'])
+
+@app.route("/v1/models", methods=["GET"])
 def list_models():
     models = get_models()
     print(f"[MOCK] Returning models: {[m['id'] for m in models['data']]}")
     return jsonify(models)

-@app.route('/v1/chat/completions', methods=['POST'])
+
+@app.route("/v1/chat/completions", methods=["POST"])
 def chat_completions():
     """Return OpenAI-formatted chat completion responses."""
     data = request.get_json()
-    default_model = get_models()['data'][0]['id']
-    model = data.get('model', default_model)
-    messages = data.get('messages', [])
-    stream = data.get('stream', False)
+    default_model = get_models()["data"][0]["id"]
+    model = data.get("model", default_model)
+    messages = data.get("messages", [])
+    stream = data.get("stream", False)

     print(f"[MOCK] Chat completion request - model: {model}, stream: {stream}")

     if stream:
         return handle_streaming_completion(model, messages)
     else:
         return handle_non_streaming_completion(model, messages)

+
 def handle_non_streaming_completion(model, messages):
     response_text = generate_random_text(random.randint(20, 80))

     # Calculate realistic token counts
-    prompt_tokens = sum(len(str(msg.get('content', '')).split()) for msg in messages)
+    prompt_tokens = sum(len(str(msg.get("content", "")).split()) for msg in messages)
     completion_tokens = len(response_text.split())

     response = {
         "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
         "object": "chat.completion",
         "created": int(time.time()),
         "model": model,
-        "choices": [
-            {
-                "index": 0,
-                "message": {
-                    "role": "assistant",
-                    "content": response_text
-                },
-                "finish_reason": "stop"
-            }
-        ],
+        "choices": [{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}],
         "usage": {
             "prompt_tokens": prompt_tokens,
             "completion_tokens": completion_tokens,
-            "total_tokens": prompt_tokens + completion_tokens
-        }
+            "total_tokens": prompt_tokens + completion_tokens,
+        },
     }

     return jsonify(response)

+
 def handle_streaming_completion(model, messages):
     def generate_stream():
         # Generate response text
         full_response = generate_random_text(random.randint(30, 100))
         words = full_response.split()

         # Send initial chunk
         initial_chunk = {
             "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
             "object": "chat.completion.chunk",
             "created": int(time.time()),
             "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {"role": "assistant", "content": ""}
-                }
-            ]
+            "choices": [{"index": 0, "delta": {"role": "assistant", "content": ""}}],
         }
         yield f"data: {json.dumps(initial_chunk)}\n\n"

         # Send word by word
         for i, word in enumerate(words):
             chunk = {
                 "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
                 "object": "chat.completion.chunk",
                 "created": int(time.time()),
                 "model": model,
-                "choices": [
-                    {
-                        "index": 0,
-                        "delta": {"content": f"{word} " if i < len(words) - 1 else word}
-                    }
-                ]
+                "choices": [{"index": 0, "delta": {"content": f"{word} " if i < len(words) - 1 else word}}],
             }
             yield f"data: {json.dumps(chunk)}\n\n"
             # Configurable delay to simulate realistic streaming
             stream_delay = float(os.getenv("STREAM_DELAY_SECONDS", "0.005"))
             time.sleep(stream_delay)

         # Send final chunk
         final_chunk = {
             "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
             "object": "chat.completion.chunk",
             "created": int(time.time()),
             "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {"content": ""},
-                    "finish_reason": "stop"
-                }
-            ]
+            "choices": [{"index": 0, "delta": {"content": ""}, "finish_reason": "stop"}],
         }
         yield f"data: {json.dumps(final_chunk)}\n\n"
         yield "data: [DONE]\n\n"

     return Response(
         generate_stream(),
-        mimetype='text/event-stream',
+        mimetype="text/event-stream",
         headers={
-            'Cache-Control': 'no-cache',
-            'Connection': 'keep-alive',
-            'Access-Control-Allow-Origin': '*',
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "Access-Control-Allow-Origin": "*",
         },
     )

-@app.route('/health', methods=['GET'])
+
+@app.route("/health", methods=["GET"])
 def health():
     return jsonify({"status": "healthy", "type": "openai-mock"})

-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='OpenAI-compatible mock server')
-    parser.add_argument('--port', type=int, default=8081,
-                        help='Port to run the server on (default: 8081)')
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="OpenAI-compatible mock server")
+    parser.add_argument("--port", type=int, default=8081, help="Port to run the server on (default: 8081)")
     args = parser.parse_args()

     port = args.port

     models = get_models()
     print("Starting OpenAI-compatible mock server...")
     print(f"- /models endpoint with: {[m['id'] for m in models['data']]}")
     print("- OpenAI-formatted chat/completion responses with dynamic content")
     print("- Streaming support with valid SSE format")
     print(f"- Listening on: http://0.0.0.0:{port}")

-    app.run(host='0.0.0.0', port=port, debug=False)
+    app.run(host="0.0.0.0", port=port, debug=False)
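A quick way to exercise the routes defined above; the file name is assumed (not shown in this diff), and the model id is just the MOCK_MODELS default:

```bash
# Start the mock server, then hit each endpoint it registers:
python openai-mock-server.py --port 8081 &
curl -s http://localhost:8081/health
curl -s http://localhost:8081/v1/models
curl -s -X POST http://localhost:8081/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "meta-llama/Llama-3.2-3B-Instruct", "messages": [{"role": "user", "content": "hi"}]}'
```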


@@ -6,6 +6,7 @@ data:
     apis:
     - agents
     - inference
+    - files
     - safety
     - telemetry
     - tool_runtime

@@ -19,13 +20,6 @@ data:
           max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
           api_token: ${env.VLLM_API_TOKEN:=fake}
           tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-      - provider_id: vllm-safety
-        provider_type: remote::vllm
-        config:
-          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
-          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-          api_token: ${env.VLLM_API_TOKEN:=fake}
-          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
       - provider_id: sentence-transformers
         provider_type: inline::sentence-transformers
         config: {}

@@ -41,6 +35,14 @@ data:
           db: ${env.POSTGRES_DB:=llamastack}
           user: ${env.POSTGRES_USER:=llamastack}
           password: ${env.POSTGRES_PASSWORD:=llamastack}
+      files:
+      - provider_id: meta-reference-files
+        provider_type: inline::localfs
+        config:
+          storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+          metadata_store:
+            type: sqlite
+            db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
       safety:
       - provider_id: llama-guard
         provider_type: inline::llama-guard

@@ -111,9 +113,6 @@ data:
     - model_id: ${env.INFERENCE_MODEL}
       provider_id: vllm-inference
       model_type: llm
-    - model_id: ${env.SAFETY_MODEL}
-      provider_id: vllm-safety
-      model_type: llm
     shields:
     - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
     vector_dbs: []


@@ -2,7 +2,10 @@ version: '2'
 image_name: kubernetes-benchmark-demo
 apis:
 - agents
+- files
 - inference
+- files
+- safety
 - telemetry
 - tool_runtime
 - vector_io

@@ -18,6 +21,14 @@ providers:
   - provider_id: sentence-transformers
     provider_type: inline::sentence-transformers
     config: {}
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
   vector_io:
   - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
     provider_type: remote::chromadb

@@ -30,6 +41,19 @@ providers:
       db: ${env.POSTGRES_DB:=llamastack}
       user: ${env.POSTGRES_USER:=llamastack}
       password: ${env.POSTGRES_PASSWORD:=llamastack}
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
   agents:
   - provider_id: meta-reference
     provider_type: inline::meta-reference

@@ -95,6 +119,8 @@ models:
 - model_id: ${env.INFERENCE_MODEL}
   provider_id: vllm-inference
   model_type: llm
+shields:
+- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
 vector_dbs: []
 datasets: []
 scoring_fns: []


@@ -1,5 +1,106 @@
@import url("theme.css");
/* Horizontal Navigation Bar */
.horizontal-nav {
    background-color: #ffffff;
    border-bottom: 1px solid #e5e5e5;
    padding: 0;
    position: fixed;
    top: 0;
    left: 0;
    right: 0;
    z-index: 1050;
    height: 50px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}

[data-theme="dark"] .horizontal-nav {
    background-color: #1a1a1a;
    border-bottom: 1px solid #333;
}

.horizontal-nav .nav-container {
    max-width: 1200px;
    margin: 0 auto;
    display: flex;
    align-items: center;
    justify-content: space-between;
    padding: 0 20px;
    height: 100%;
}

.horizontal-nav .nav-brand {
    font-size: 18px;
    font-weight: 600;
    color: #333;
    text-decoration: none;
}

[data-theme="dark"] .horizontal-nav .nav-brand {
    color: #fff;
}

.horizontal-nav .nav-links {
    display: flex;
    align-items: center;
    gap: 30px;
    list-style: none;
    margin: 0;
    padding: 0;
}

.horizontal-nav .nav-links a {
    color: #666;
    text-decoration: none;
    font-size: 14px;
    font-weight: 500;
    padding: 8px 12px;
    border-radius: 6px;
    transition: all 0.2s ease;
}

.horizontal-nav .nav-links a:hover,
.horizontal-nav .nav-links a.active {
    color: #333;
    background-color: #f5f5f5;
}

.horizontal-nav .nav-links a.active {
    font-weight: 600;
}

[data-theme="dark"] .horizontal-nav .nav-links a {
    color: #ccc;
}

[data-theme="dark"] .horizontal-nav .nav-links a:hover,
[data-theme="dark"] .horizontal-nav .nav-links a.active {
    color: #fff;
    background-color: #333;
}

.horizontal-nav .nav-links .github-link {
    display: flex;
    align-items: center;
    gap: 6px;
}

.horizontal-nav .nav-links .github-icon {
    width: 16px;
    height: 16px;
    fill: currentColor;
}

/* Adjust main content to account for fixed nav */
.wy-nav-side {
    top: 50px;
    height: calc(100vh - 50px);
}

.wy-nav-content-wrap {
    margin-top: 50px;
}

.wy-nav-content {
    max-width: 90%;
}

docs/_static/js/horizontal_nav.js (new file)

@@ -0,0 +1,44 @@
// Horizontal Navigation Bar for Llama Stack Documentation
document.addEventListener('DOMContentLoaded', function() {
    // Create the horizontal navigation HTML
    const navHTML = `
        <nav class="horizontal-nav">
            <div class="nav-container">
                <a href="/" class="nav-brand">Llama Stack</a>
                <ul class="nav-links">
                    <li><a href="/">Docs</a></li>
                    <li><a href="/references/api_reference/">API Reference</a></li>
                    <li><a href="https://github.com/meta-llama/llama-stack" target="_blank" class="github-link">
                        <svg class="github-icon" viewBox="0 0 16 16" aria-hidden="true">
                            <path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"/>
                        </svg>
                        GitHub
                    </a></li>
                </ul>
            </div>
        </nav>
    `;

    // Insert the navigation at the beginning of the body
    document.body.insertAdjacentHTML('afterbegin', navHTML);

    // Update navigation links based on current page
    updateActiveNav();
});

function updateActiveNav() {
    const currentPath = window.location.pathname;
    const navLinks = document.querySelectorAll('.horizontal-nav .nav-links a');

    navLinks.forEach(link => {
        // Remove any existing active classes
        link.classList.remove('active');

        // Add active class based on current path
        if (currentPath === '/' && link.getAttribute('href') === '/') {
            link.classList.add('active');
        } else if (currentPath.includes('/references/api_reference/') && link.getAttribute('href').includes('api_reference')) {
            link.classList.add('active');
        }
    });
}


@@ -633,6 +633,80 @@
}
}
},
"/v1/prompts": {
"get": {
"responses": {
"200": {
"description": "A ListPromptsResponse containing all prompts.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ListPromptsResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Prompts"
],
"description": "List all prompts.",
"parameters": []
},
"post": {
"responses": {
"200": {
"description": "The created Prompt resource.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Prompt"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Prompts"
],
"description": "Create a new prompt.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CreatePromptRequest"
}
}
},
"required": true
}
}
},
"/v1/agents/{agent_id}": { "/v1/agents/{agent_id}": {
"get": { "get": {
"responses": { "responses": {
@ -901,6 +975,143 @@
] ]
} }
}, },
"/v1/prompts/{prompt_id}": {
"get": {
"responses": {
"200": {
"description": "A Prompt resource.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Prompt"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Prompts"
],
"description": "Get a prompt by its identifier and optional version.",
"parameters": [
{
"name": "prompt_id",
"in": "path",
"description": "The identifier of the prompt to get.",
"required": true,
"schema": {
"type": "string"
}
},
{
"name": "version",
"in": "query",
"description": "The version of the prompt to get (defaults to latest).",
"required": false,
"schema": {
"type": "integer"
}
}
]
},
"post": {
"responses": {
"200": {
"description": "The updated Prompt resource with incremented version.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Prompt"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Prompts"
],
"description": "Update an existing prompt (increments version).",
"parameters": [
{
"name": "prompt_id",
"in": "path",
"description": "The identifier of the prompt to update.",
"required": true,
"schema": {
"type": "string"
}
}
],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/UpdatePromptRequest"
}
}
},
"required": true
}
},
"delete": {
"responses": {
"200": {
"description": "OK"
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Prompts"
],
"description": "Delete a prompt.",
"parameters": [
{
"name": "prompt_id",
"in": "path",
"description": "The identifier of the prompt to delete.",
"required": true,
"schema": {
"type": "string"
}
}
]
}
},
"/v1/inference/embeddings": { "/v1/inference/embeddings": {
"post": { "post": {
"responses": { "responses": {
@ -2836,6 +3047,49 @@
] ]
} }
}, },
"/v1/prompts/{prompt_id}/versions": {
"get": {
"responses": {
"200": {
"description": "A ListPromptsResponse containing all versions of the prompt.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ListPromptsResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Prompts"
],
"description": "List all versions of a specific prompt.",
"parameters": [
{
"name": "prompt_id",
"in": "path",
"description": "The identifier of the prompt to list versions for.",
"required": true,
"schema": {
"type": "string"
}
}
]
}
},
"/v1/providers": { "/v1/providers": {
"get": { "get": {
"responses": { "responses": {
@ -5007,6 +5261,59 @@
} }
} }
}, },
"/v1/prompts/{prompt_id}/set-default-version": {
"post": {
"responses": {
"200": {
"description": "The prompt with the specified version now set as default.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Prompt"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Prompts"
],
"description": "Set which version of a prompt should be the default in get_prompt (latest).",
"parameters": [
{
"name": "prompt_id",
"in": "path",
"description": "The identifier of the prompt.",
"required": true,
"schema": {
"type": "string"
}
}
],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SetDefaultVersionRequest"
}
}
},
"required": true
}
}
},
"/v1/post-training/supervised-fine-tune": { "/v1/post-training/supervised-fine-tune": {
"post": { "post": {
"responses": { "responses": {
@ -9670,6 +9977,65 @@
], ],
"title": "OpenAIResponseObjectStreamResponseWebSearchCallSearching" "title": "OpenAIResponseObjectStreamResponseWebSearchCallSearching"
}, },
"CreatePromptRequest": {
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "The prompt text content with variable placeholders."
},
"variables": {
"type": "array",
"items": {
"type": "string"
},
"description": "List of variable names that can be used in the prompt template."
}
},
"additionalProperties": false,
"required": [
"prompt"
],
"title": "CreatePromptRequest"
},
"Prompt": {
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "The system prompt text with variable placeholders. Variables are only supported when using the Responses API."
},
"version": {
"type": "integer",
"description": "Version (integer starting at 1, incremented on save)"
},
"prompt_id": {
"type": "string",
"description": "Unique identifier formatted as 'pmpt_<48-digit-hash>'"
},
"variables": {
"type": "array",
"items": {
"type": "string"
},
"description": "List of prompt variable names that can be used in the prompt template"
},
"is_default": {
"type": "boolean",
"default": false,
"description": "Boolean indicating whether this version is the default version for this prompt"
}
},
"additionalProperties": false,
"required": [
"version",
"prompt_id",
"variables",
"is_default"
],
"title": "Prompt",
"description": "A prompt resource representing a stored OpenAI Compatible prompt template in Llama Stack."
},
"OpenAIDeleteResponseObject": { "OpenAIDeleteResponseObject": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -10296,7 +10662,8 @@
"scoring_function", "scoring_function",
"benchmark", "benchmark",
"tool", "tool",
"tool_group" "tool_group",
"prompt"
], ],
"const": "benchmark", "const": "benchmark",
"default": "benchmark", "default": "benchmark",
@ -10923,7 +11290,8 @@
"scoring_function", "scoring_function",
"benchmark", "benchmark",
"tool", "tool",
"tool_group" "tool_group",
"prompt"
], ],
"const": "dataset", "const": "dataset",
"default": "dataset", "default": "dataset",
@ -11073,7 +11441,8 @@
"scoring_function", "scoring_function",
"benchmark", "benchmark",
"tool", "tool",
"tool_group" "tool_group",
"prompt"
], ],
"const": "model", "const": "model",
"default": "model", "default": "model",
@ -11338,7 +11707,8 @@
"scoring_function", "scoring_function",
"benchmark", "benchmark",
"tool", "tool",
"tool_group" "tool_group",
"prompt"
], ],
"const": "scoring_function", "const": "scoring_function",
"default": "scoring_function", "default": "scoring_function",
@ -11446,7 +11816,8 @@
"scoring_function", "scoring_function",
"benchmark", "benchmark",
"tool", "tool",
"tool_group" "tool_group",
"prompt"
], ],
"const": "shield", "const": "shield",
"default": "shield", "default": "shield",
@ -11691,7 +12062,8 @@
"scoring_function", "scoring_function",
"benchmark", "benchmark",
"tool", "tool",
"tool_group" "tool_group",
"prompt"
], ],
"const": "tool", "const": "tool",
"default": "tool", "default": "tool",
@ -11773,7 +12145,8 @@
"scoring_function", "scoring_function",
"benchmark", "benchmark",
"tool", "tool",
"tool_group" "tool_group",
"prompt"
], ],
"const": "tool_group", "const": "tool_group",
"default": "tool_group", "default": "tool_group",
@ -12067,7 +12440,8 @@
"scoring_function", "scoring_function",
"benchmark", "benchmark",
"tool", "tool",
"tool_group" "tool_group",
"prompt"
], ],
"const": "vector_db", "const": "vector_db",
"default": "vector_db", "default": "vector_db",
@ -12882,6 +13256,23 @@
"title": "OpenAIResponseObjectWithInput", "title": "OpenAIResponseObjectWithInput",
"description": "OpenAI response object extended with input context information." "description": "OpenAI response object extended with input context information."
}, },
"ListPromptsResponse": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Prompt"
}
}
},
"additionalProperties": false,
"required": [
"data"
],
"title": "ListPromptsResponse",
"description": "Response model to list prompts."
},
"ListProvidersResponse": { "ListProvidersResponse": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -17128,6 +17519,20 @@
"title": "ScoreBatchResponse", "title": "ScoreBatchResponse",
"description": "Response from batch scoring operations on datasets." "description": "Response from batch scoring operations on datasets."
}, },
"SetDefaultVersionRequest": {
"type": "object",
"properties": {
"version": {
"type": "integer",
"description": "The version to set as default."
}
},
"additionalProperties": false,
"required": [
"version"
],
"title": "SetDefaultVersionRequest"
},
"AlgorithmConfig": { "AlgorithmConfig": {
"oneOf": [ "oneOf": [
{ {
@ -17412,6 +17817,37 @@
"title": "SyntheticDataGenerationResponse", "title": "SyntheticDataGenerationResponse",
"description": "Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold." "description": "Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold."
}, },
"UpdatePromptRequest": {
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "The updated prompt text content."
},
"version": {
"type": "integer",
"description": "The current version of the prompt being updated."
},
"variables": {
"type": "array",
"items": {
"type": "string"
},
"description": "Updated list of variable names that can be used in the prompt template."
},
"set_as_default": {
"type": "boolean",
"description": "Set the new version as the default (default=True)."
}
},
"additionalProperties": false,
"required": [
"prompt",
"version",
"set_as_default"
],
"title": "UpdatePromptRequest"
},
"VersionInfo": { "VersionInfo": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -17537,6 +17973,10 @@
{ {
"name": "PostTraining (Coming Soon)" "name": "PostTraining (Coming Soon)"
}, },
{
"name": "Prompts",
"x-displayName": "Protocol for prompt management operations."
},
{
"name": "Providers",
"x-displayName": "Providers API for inspecting, listing, and modifying providers and their configurations."
@@ -17587,6 +18027,7 @@
"Inspect",
"Models",
"PostTraining (Coming Soon)",
"Prompts",
"Providers",
"Safety",
"Scoring",

View file

@@ -427,6 +427,58 @@ paths:
schema:
$ref: '#/components/schemas/CreateOpenaiResponseRequest'
required: true
/v1/prompts:
get:
responses:
'200':
description: >-
A ListPromptsResponse containing all prompts.
content:
application/json:
schema:
$ref: '#/components/schemas/ListPromptsResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
description: List all prompts.
parameters: []
post:
responses:
'200':
description: The created Prompt resource.
content:
application/json:
schema:
$ref: '#/components/schemas/Prompt'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
description: Create a new prompt.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/CreatePromptRequest'
required: true
/v1/agents/{agent_id}:
get:
responses:
@@ -616,6 +668,103 @@
required: true
schema:
type: string
/v1/prompts/{prompt_id}:
get:
responses:
'200':
description: A Prompt resource.
content:
application/json:
schema:
$ref: '#/components/schemas/Prompt'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
description: >-
Get a prompt by its identifier and optional version.
parameters:
- name: prompt_id
in: path
description: The identifier of the prompt to get.
required: true
schema:
type: string
- name: version
in: query
description: >-
The version of the prompt to get (defaults to latest).
required: false
schema:
type: integer
post:
responses:
'200':
description: >-
The updated Prompt resource with incremented version.
content:
application/json:
schema:
$ref: '#/components/schemas/Prompt'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
description: >-
Update an existing prompt (increments version).
parameters:
- name: prompt_id
in: path
description: The identifier of the prompt to update.
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/UpdatePromptRequest'
required: true
delete:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
description: Delete a prompt.
parameters:
- name: prompt_id
in: path
description: The identifier of the prompt to delete.
required: true
schema:
type: string
/v1/inference/embeddings:
post:
responses:
@@ -1983,6 +2132,37 @@
required: false
schema:
$ref: '#/components/schemas/Order'
/v1/prompts/{prompt_id}/versions:
get:
responses:
'200':
description: >-
A ListPromptsResponse containing all versions of the prompt.
content:
application/json:
schema:
$ref: '#/components/schemas/ListPromptsResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
description: List all versions of a specific prompt.
parameters:
- name: prompt_id
in: path
description: >-
The identifier of the prompt to list versions for.
required: true
schema:
type: string
/v1/providers:
get:
responses:
@@ -3546,6 +3726,43 @@
schema:
$ref: '#/components/schemas/ScoreBatchRequest'
required: true
/v1/prompts/{prompt_id}/set-default-version:
post:
responses:
'200':
description: >-
The prompt with the specified version now set as default.
content:
application/json:
schema:
$ref: '#/components/schemas/Prompt'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
description: >-
Set which version of a prompt should be the default in get_prompt (latest).
parameters:
- name: prompt_id
in: path
description: The identifier of the prompt.
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/SetDefaultVersionRequest'
required: true
/v1/post-training/supervised-fine-tune:
post:
responses:
@@ -7148,6 +7365,61 @@ components:
- type
title: >-
OpenAIResponseObjectStreamResponseWebSearchCallSearching
CreatePromptRequest:
type: object
properties:
prompt:
type: string
description: >-
The prompt text content with variable placeholders.
variables:
type: array
items:
type: string
description: >-
List of variable names that can be used in the prompt template.
additionalProperties: false
required:
- prompt
title: CreatePromptRequest
Prompt:
type: object
properties:
prompt:
type: string
description: >-
The system prompt text with variable placeholders. Variables are only
supported when using the Responses API.
version:
type: integer
description: >-
Version (integer starting at 1, incremented on save)
prompt_id:
type: string
description: >-
Unique identifier formatted as 'pmpt_<48-digit-hash>'
variables:
type: array
items:
type: string
description: >-
List of prompt variable names that can be used in the prompt template
is_default:
type: boolean
default: false
description: >-
Boolean indicating whether this version is the default version for this
prompt
additionalProperties: false
required:
- version
- prompt_id
- variables
- is_default
title: Prompt
description: >-
A prompt resource representing a stored OpenAI Compatible prompt template
in Llama Stack.
OpenAIDeleteResponseObject:
type: object
properties:
@@ -7621,6 +7893,7 @@ components:
- benchmark
- tool
- tool_group
- prompt
const: benchmark
default: benchmark
description: The resource type, always benchmark
@@ -8107,6 +8380,7 @@ components:
- benchmark
- tool
- tool_group
- prompt
const: dataset
default: dataset
description: >-
@@ -8219,6 +8493,7 @@ components:
- benchmark
- tool
- tool_group
- prompt
const: model
default: model
description: >-
@@ -8410,6 +8685,7 @@ components:
- benchmark
- tool
- tool_group
- prompt
const: scoring_function
default: scoring_function
description: >-
@@ -8486,6 +8762,7 @@ components:
- benchmark
- tool
- tool_group
- prompt
const: shield
default: shield
description: The resource type, always shield
@@ -8665,6 +8942,7 @@ components:
- benchmark
- tool
- tool_group
- prompt
const: tool
default: tool
description: Type of resource, always 'tool'
@@ -8723,6 +9001,7 @@ components:
- benchmark
- tool
- tool_group
- prompt
const: tool_group
default: tool_group
description: Type of resource, always 'tool_group'
@@ -8951,6 +9230,7 @@ components:
- benchmark
- tool
- tool_group
- prompt
const: vector_db
default: vector_db
description: >-
@@ -9577,6 +9857,18 @@ components:
title: OpenAIResponseObjectWithInput
description: >-
OpenAI response object extended with input context information.
ListPromptsResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/Prompt'
additionalProperties: false
required:
- data
title: ListPromptsResponse
description: Response model to list prompts.
ListProvidersResponse:
type: object
properties:
@@ -12722,6 +13014,16 @@ components:
title: ScoreBatchResponse
description: >-
Response from batch scoring operations on datasets.
SetDefaultVersionRequest:
type: object
properties:
version:
type: integer
description: The version to set as default.
additionalProperties: false
required:
- version
title: SetDefaultVersionRequest
AlgorithmConfig:
oneOf:
- $ref: '#/components/schemas/LoraFinetuningConfig'
@@ -12918,6 +13220,32 @@ components:
description: >-
Response from the synthetic data generation. Batch of (prompt, response, score)
tuples that pass the threshold.
UpdatePromptRequest:
type: object
properties:
prompt:
type: string
description: The updated prompt text content.
version:
type: integer
description: >-
The current version of the prompt being updated.
variables:
type: array
items:
type: string
description: >-
Updated list of variable names that can be used in the prompt template.
set_as_default:
type: boolean
description: >-
Set the new version as the default (default=True).
additionalProperties: false
required:
- prompt
- version
- set_as_default
title: UpdatePromptRequest
VersionInfo:
type: object
properties:
@@ -13029,6 +13357,9 @@ tags:
- name: Inspect
- name: Models
- name: PostTraining (Coming Soon)
- name: Prompts
x-displayName: >-
Protocol for prompt management operations.
- name: Providers
x-displayName: >-
Providers API for inspecting, listing, and modifying providers and their configurations.
@@ -13056,6 +13387,7 @@ x-tagGroups:
- Inspect
- Models
- PostTraining (Coming Soon)
- Prompts
- Providers
- Safety
- Scoring

View file

@@ -93,10 +93,31 @@ chunks_response = client.vector_io.query(
### Using the RAG Tool
> **⚠️ DEPRECATION NOTICE**: The RAG Tool is being deprecated in favor of directly using the OpenAI-compatible Search
> API. We recommend migrating to the OpenAI APIs for better compatibility and future support.
A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc.,
and automatically chunks them into smaller pieces. More examples of how to format a RAGDocument can be found in the
[appendix](#more-ragdocument-examples).
#### OpenAI API Integration & Migration
The RAG tool has been updated to use OpenAI-compatible APIs. This provides several benefits:
- **Files API Integration**: Documents are now uploaded using OpenAI's file upload endpoints
- **Vector Stores API**: Vector storage operations use OpenAI's vector store format with configurable chunking strategies
- **Error Resilience**: When processing multiple documents, individual failures are logged but don't crash the operation. Failed documents are skipped while successful ones continue processing.
**Migration Path:**
We recommend migrating to the OpenAI-compatible Search API for:
1. **Better OpenAI Ecosystem Integration**: Direct compatibility with OpenAI tools and workflows including the Responses API
2. **Future-Proof**: Continued support and feature development
3. **Full OpenAI Compatibility**: Vector Stores, Files, and Search APIs are fully compatible with OpenAI's Responses API
The OpenAI APIs are used under the hood, so you can continue to use your existing RAG Tool code with minimal changes.
However, we recommend updating your code to use the new OpenAI-compatible APIs for better long-term support. If any
documents fail to process, they will be logged in the response but will not cause the entire operation to fail.
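As a rough sketch, the OpenAI-compatible flow looks like the following (method names follow the OpenAI Python SDK; the base URL, file name, and API key are illustrative assumptions, not values from the RAG Tool docs):

```python
# Illustrative sketch of the OpenAI-compatible ingestion flow; adjust
# base_url and credentials for your deployment.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# Upload a document through the Files API.
with open("greatwork.txt", "rb") as f:
    uploaded = client.files.create(file=f, purpose="assistants")

# Create a vector store and attach the uploaded file; chunking is configurable.
store = client.vector_stores.create(name="docs")
client.vector_stores.files.create(vector_store_id=store.id, file_id=uploaded.id)

# Search the store through the OpenAI-compatible Search API.
results = client.vector_stores.search(vector_store_id=store.id, query="What is great work?")
```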
```python
from llama_stack_client import RAGDocument

View file

@@ -131,6 +131,7 @@ html_static_path = ["../_static"]
def setup(app):
    app.add_css_file("css/my_theme.css")
    app.add_js_file("js/detect_theme.js")
    app.add_js_file("js/horizontal_nav.js")
    app.add_js_file("js/keyboard_shortcuts.js")

def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):

View file

@ -35,5 +35,5 @@ testing/record-replay
### Benchmarking
```{include} ../../../benchmarking/k8s-benchmark/README.md
```

View file

@@ -40,18 +40,15 @@ The system patches OpenAI and Ollama client methods to intercept calls before th
### Storage Architecture
Recordings are stored as JSON files in the recording directory. They are looked up by their request hash.
```
recordings/
└── responses/
    ├── abc123def456.json   # Individual response files
    └── def789ghi012.json
```
**JSON files** store complete request/response pairs in human-readable format for debugging.
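For intuition, replay lookup amounts to hashing the incoming request and reading the matching JSON file. A minimal sketch, assuming a SHA-256 over the canonicalized request body (the real canonicalization and hashing live in the record-replay implementation):

```python
import hashlib
import json
from pathlib import Path

def lookup_recording(recordings_dir: str, request: dict) -> dict | None:
    """Return the recorded response for `request`, or None if not recorded."""
    canonical = json.dumps(request, sort_keys=True)  # assumed canonicalization
    request_hash = hashlib.sha256(canonical.encode()).hexdigest()
    path = Path(recordings_dir) / "responses" / f"{request_hash}.json"
    return json.loads(path.read_text()) if path.exists() else None
```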
## Recording Modes
@@ -166,8 +163,8 @@ This preserves type safety - when replayed, you get the same Pydantic objects wi
Control recording behavior globally:
```bash
export LLAMA_STACK_TEST_INFERENCE_MODE=replay  # this is the default
export LLAMA_STACK_TEST_RECORDING_DIR=/path/to/recordings  # default is tests/integration/recordings
pytest tests/integration/
```

View file

@@ -354,6 +354,47 @@ You can easily validate a request by running:
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers
```
#### Kubernetes Authentication Provider
The server can be configured to use Kubernetes SelfSubjectReview API to validate tokens directly against the Kubernetes API server:
```yaml
server:
auth:
provider_config:
type: "kubernetes"
api_server_url: "https://kubernetes.default.svc"
claims_mapping:
username: "roles"
groups: "roles"
uid: "uid_attr"
verify_tls: true
tls_cafile: "/path/to/ca.crt"
```
Configuration options:
- `api_server_url`: The Kubernetes API server URL (e.g., https://kubernetes.default.svc:6443)
- `verify_tls`: Whether to verify TLS certificates (default: true)
- `tls_cafile`: Path to CA certificate file for TLS verification
- `claims_mapping`: Mapping of Kubernetes user claims to access attributes
The provider validates tokens by sending a SelfSubjectReview request to the Kubernetes API server at `/apis/authentication.k8s.io/v1/selfsubjectreviews`. The provider extracts user information from the response:
- Username from the `userInfo.username` field
- Groups from the `userInfo.groups` field
- UID from the `userInfo.uid` field
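For reference, the validation call the provider makes is equivalent to the following request (shown here as an illustrative curl; the provider issues it internally with the user's bearer token):

```bash
curl -s -X POST "https://kubernetes.default.svc/apis/authentication.k8s.io/v1/selfsubjectreviews" \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"apiVersion": "authentication.k8s.io/v1", "kind": "SelfSubjectReview"}'
```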
To obtain a token for testing:
```bash
kubectl create namespace llama-stack
kubectl create serviceaccount llama-stack-auth -n llama-stack
kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
```
You can validate a request by running:
```bash
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers
```
#### GitHub Token Provider #### GitHub Token Provider
Validates GitHub personal access tokens or OAuth tokens directly: Validates GitHub personal access tokens or OAuth tokens directly:
```yaml ```yaml

View file

@@ -1,137 +1,55 @@
apiVersion: v1
data:
  stack_run_config.yaml: |
    version: '2'
    image_name: kubernetes-demo
    apis:
    - agents
    - inference
    - files
    - safety
    - telemetry
    - tool_runtime
    - vector_io
    providers:
      inference:
      - provider_id: vllm-inference
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: vllm-safety
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: sentence-transformers
        provider_type: inline::sentence-transformers
        config: {}
      vector_io:
      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
        provider_type: remote::chromadb
        config:
          url: ${env.CHROMADB_URL:=}
          kvstore:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      files:
      - provider_id: meta-reference-files
        provider_type: inline::localfs
        config:
          storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
          metadata_store:
            type: sqlite
            db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
      safety:
      - provider_id: llama-guard
        provider_type: inline::llama-guard
        config:
          excluded_categories: []
      agents:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          persistence_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
          responses_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      telemetry:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
          sinks: ${env.TELEMETRY_SINKS:=console}
      tool_runtime:
      - provider_id: brave-search
        provider_type: remote::brave-search
        config:
          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: tavily-search
        provider_type: remote::tavily-search
        config:
          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: rag-runtime
        provider_type: inline::rag-runtime
        config: {}
      - provider_id: model-context-protocol
        provider_type: remote::model-context-protocol
        config: {}
    metadata_store:
      type: postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
      table_name: llamastack_kvstore
    inference_store:
      type: postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
    models:
    - metadata:
        embedding_dimension: 384
      model_id: all-MiniLM-L6-v2
      provider_id: sentence-transformers
      model_type: embedding
    - metadata: {}
      model_id: ${env.INFERENCE_MODEL}
      provider_id: vllm-inference
      model_type: llm
    - metadata: {}
      model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
      provider_id: vllm-safety
      model_type: llm
    shields:
    - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
    vector_dbs: []
    datasets: []
    scoring_fns: []
    benchmarks: []
    tool_groups:
    - toolgroup_id: builtin::websearch
      provider_id: tavily-search
    - toolgroup_id: builtin::rag
      provider_id: rag-runtime
    server:
      port: 8321
      auth:
        provider_config:
          type: github_token
kind: ConfigMap
metadata:
  creationTimestamp: null

View file

@@ -3,6 +3,7 @@ image_name: kubernetes-demo
apis:
- agents
- inference
- files
- safety
- telemetry
- tool_runtime
@@ -38,6 +39,14 @@ providers:
        db: ${env.POSTGRES_DB:=llamastack}
        user: ${env.POSTGRES_USER:=llamastack}
        password: ${env.POSTGRES_PASSWORD:=llamastack}
  files:
  - provider_id: meta-reference-files
    provider_type: inline::localfs
    config:
      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
      metadata_store:
        type: sqlite
        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard

View file

@@ -18,12 +18,13 @@ embedding_model_id = (
).identifier
embedding_dimension = em.metadata["embedding_dimension"]

vector_db = client.vector_dbs.register(
    vector_db_id=vector_db_id,
    embedding_model=embedding_model_id,
    embedding_dimension=embedding_dimension,
    provider_id="faiss",
)
vector_db_id = vector_db.identifier
source = "https://www.paulgraham.com/greatwork.html"
print("rag_tool> Ingesting document:", source)
document = RAGDocument(
@@ -35,7 +36,7 @@ document = RAGDocument(
client.tool_runtime.rag_tool.insert(
    documents=[document],
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=100,
)
agent = Agent(
    client,

View file

@@ -7,4 +7,5 @@ Here's a list of known external providers that you can use with Llama Stack:
| KubeFlow Training | Train models with KubeFlow | Post Training | Remote | [llama-stack-provider-kft](https://github.com/opendatahub-io/llama-stack-provider-kft) |
| KubeFlow Pipelines | Train models with KubeFlow Pipelines | Post Training | Inline **and** Remote | [llama-stack-provider-kfp-trainer](https://github.com/opendatahub-io/llama-stack-provider-kfp-trainer) |
| RamaLama | Inference models with RamaLama | Inference | Remote | [ramalama-stack](https://github.com/containers/ramalama-stack) |
| TrustyAI LM-Eval | Evaluate models with TrustyAI LM-Eval | Eval | Remote | [llama-stack-provider-lmeval](https://github.com/trustyai-explainability/llama-stack-provider-lmeval) |
| MongoDB | VectorIO with MongoDB | Vector_IO | Remote | [mongodb-llama-stack](https://github.com/mongodb-partners/mongodb-llama-stack) |

View file

@@ -18,6 +18,7 @@ This section contains documentation for all available providers for the **infere
inline_meta-reference
inline_sentence-transformers
remote_anthropic
remote_azure
remote_bedrock
remote_cerebras
remote_databricks

View file

@@ -0,0 +1,29 @@
# remote::azure
## Description
Azure OpenAI inference provider for accessing GPT models and other Azure services.
Provider documentation
https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `api_key` | `<class 'pydantic.types.SecretStr'>` | No | | Azure API key for Azure |
| `api_base` | `<class 'pydantic.networks.HttpUrl'>` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) |
| `api_version` | `str \| None` | No | | Azure API version for Azure (e.g., 2024-12-01-preview) |
| `api_type` | `str \| None` | No | azure | Azure API type for Azure (e.g., azure) |
## Sample Configuration
```yaml
api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
```
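For example, the referenced environment variables might be exported like this before starting the stack (all values are placeholders):

```bash
export AZURE_API_KEY="<your-azure-openai-key>"
export AZURE_API_BASE="https://your-resource-name.openai.azure.com"
export AZURE_API_VERSION="2024-12-01-preview"
export AZURE_API_TYPE="azure"
```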

View file

@@ -15,8 +15,8 @@ AWS Bedrock inference provider for accessing various AI models through AWS's man
| `profile_name` | `str \| None` | No | | The profile name that contains credentials to use. Default use environment variable: AWS_PROFILE |
| `total_max_attempts` | `int \| None` | No | | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS |
| `retry_mode` | `str \| None` | No | | A string representing the type of retries Boto3 will perform. Default use environment variable: AWS_RETRY_MODE |
| `connect_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
| `read_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to read from a connection. The default is 60 seconds. |
| `session_ttl` | `int \| None` | No | 3600 | The time in seconds till a session expires. The default is 3600 seconds (1 hour). |

## Sample Configuration

View file

@@ -15,8 +15,8 @@ AWS Bedrock safety provider for content moderation using AWS's safety services.
| `profile_name` | `str \| None` | No | | The profile name that contains credentials to use. Default use environment variable: AWS_PROFILE |
| `total_max_attempts` | `int \| None` | No | | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS |
| `retry_mode` | `str \| None` | No | | A string representing the type of retries Boto3 will perform. Default use environment variable: AWS_RETRY_MODE |
| `connect_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
| `read_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to read from a connection. The default is 60 seconds. |
| `session_ttl` | `int \| None` | No | 3600 | The time in seconds till a session expires. The default is 3600 seconds (1 hour). |

## Sample Configuration

View file

@@ -79,3 +79,10 @@ class ConflictError(ValueError):
    def __init__(self, message: str) -> None:
        super().__init__(message)
class TokenValidationError(ValueError):
    """Raised when token validation fails during authentication."""

    def __init__(self, message: str) -> None:
        super().__init__(message)

View file

@@ -102,6 +102,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
    :cvar benchmarks: Benchmark suite management
    :cvar tool_groups: Tool group organization
    :cvar files: File storage and management
    :cvar prompts: Prompt versions and management
    :cvar inspect: Built-in system inspection and introspection
    """
@@ -127,6 +128,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
    benchmarks = "benchmarks"
    tool_groups = "tool_groups"
    files = "files"
    prompts = "prompts"

    # built-in API
    inspect = "inspect"

View file

@@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .prompts import ListPromptsResponse, Prompt, Prompts
__all__ = ["Prompt", "Prompts", "ListPromptsResponse"]

View file

@@ -0,0 +1,189 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import re
import secrets
from typing import Protocol, runtime_checkable

from pydantic import BaseModel, Field, field_validator, model_validator

from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod


@json_schema_type
class Prompt(BaseModel):
    """A prompt resource representing a stored OpenAI Compatible prompt template in Llama Stack.

    :param prompt: The system prompt text with variable placeholders. Variables are only supported when using the Responses API.
    :param version: Version (integer starting at 1, incremented on save)
    :param prompt_id: Unique identifier formatted as 'pmpt_<48-digit-hash>'
    :param variables: List of prompt variable names that can be used in the prompt template
    :param is_default: Boolean indicating whether this version is the default version for this prompt
    """

    prompt: str | None = Field(default=None, description="The system prompt with variable placeholders")
    version: int = Field(description="Version (integer starting at 1, incremented on save)", ge=1)
    prompt_id: str = Field(description="Unique identifier in format 'pmpt_<48-digit-hash>'")
    variables: list[str] = Field(
        default_factory=list, description="List of variable names that can be used in the prompt template"
    )
    is_default: bool = Field(
        default=False, description="Boolean indicating whether this version is the default version"
    )

    @field_validator("prompt_id")
    @classmethod
    def validate_prompt_id(cls, prompt_id: str) -> str:
        if not isinstance(prompt_id, str):
            raise TypeError("prompt_id must be a string in format 'pmpt_<48-digit-hash>'")
        if not prompt_id.startswith("pmpt_"):
            raise ValueError("prompt_id must start with 'pmpt_' prefix")
        hex_part = prompt_id[5:]
        if len(hex_part) != 48:
            raise ValueError("prompt_id must be in format 'pmpt_<48-digit-hash>' (48 lowercase hex chars)")
        for char in hex_part:
            if char not in "0123456789abcdef":
                raise ValueError("prompt_id hex part must contain only lowercase hex characters [0-9a-f]")
        return prompt_id

    @field_validator("version")
    @classmethod
    def validate_version(cls, prompt_version: int) -> int:
        if prompt_version < 1:
            raise ValueError("version must be >= 1")
        return prompt_version

    @model_validator(mode="after")
    def validate_prompt_variables(self):
        """Validate that all variables used in the prompt are declared in the variables list."""
        if not self.prompt:
            return self
        prompt_variables = set(re.findall(r"{{\s*(\w+)\s*}}", self.prompt))
        declared_variables = set(self.variables)
        undeclared = prompt_variables - declared_variables
        if undeclared:
            raise ValueError(f"Prompt contains undeclared variables: {sorted(undeclared)}")
        return self

    @classmethod
    def generate_prompt_id(cls) -> str:
        # Generate 48 hex characters (24 bytes)
        random_bytes = secrets.token_bytes(24)
        hex_string = random_bytes.hex()
        return f"pmpt_{hex_string}"


class ListPromptsResponse(BaseModel):
    """Response model to list prompts."""

    data: list[Prompt]


@runtime_checkable
@trace_protocol
class Prompts(Protocol):
    """Protocol for prompt management operations."""

    @webmethod(route="/prompts", method="GET")
    async def list_prompts(self) -> ListPromptsResponse:
        """List all prompts.

        :returns: A ListPromptsResponse containing all prompts.
        """
        ...

    @webmethod(route="/prompts/{prompt_id}/versions", method="GET")
    async def list_prompt_versions(
        self,
        prompt_id: str,
    ) -> ListPromptsResponse:
        """List all versions of a specific prompt.

        :param prompt_id: The identifier of the prompt to list versions for.
        :returns: A ListPromptsResponse containing all versions of the prompt.
        """
        ...

    @webmethod(route="/prompts/{prompt_id}", method="GET")
    async def get_prompt(
        self,
        prompt_id: str,
        version: int | None = None,
    ) -> Prompt:
        """Get a prompt by its identifier and optional version.

        :param prompt_id: The identifier of the prompt to get.
        :param version: The version of the prompt to get (defaults to latest).
        :returns: A Prompt resource.
        """
        ...

    @webmethod(route="/prompts", method="POST")
    async def create_prompt(
        self,
        prompt: str,
        variables: list[str] | None = None,
    ) -> Prompt:
        """Create a new prompt.

        :param prompt: The prompt text content with variable placeholders.
        :param variables: List of variable names that can be used in the prompt template.
        :returns: The created Prompt resource.
        """
        ...

    @webmethod(route="/prompts/{prompt_id}", method="PUT")
    async def update_prompt(
        self,
        prompt_id: str,
        prompt: str,
        version: int,
        variables: list[str] | None = None,
        set_as_default: bool = True,
    ) -> Prompt:
        """Update an existing prompt (increments version).

        :param prompt_id: The identifier of the prompt to update.
        :param prompt: The updated prompt text content.
        :param version: The current version of the prompt being updated.
        :param variables: Updated list of variable names that can be used in the prompt template.
        :param set_as_default: Set the new version as the default (default=True).
        :returns: The updated Prompt resource with incremented version.
        """
        ...

    @webmethod(route="/prompts/{prompt_id}", method="DELETE")
    async def delete_prompt(
        self,
        prompt_id: str,
    ) -> None:
        """Delete a prompt.

        :param prompt_id: The identifier of the prompt to delete.
        """
        ...

    @webmethod(route="/prompts/{prompt_id}/set-default-version", method="PUT")
    async def set_default_version(
        self,
        prompt_id: str,
        version: int,
    ) -> Prompt:
        """Set which version of a prompt should be the default in get_prompt (latest).

        :param prompt_id: The identifier of the prompt.
        :param version: The version to set as default.
        :returns: The prompt with the specified version now set as default.
        """
        ...
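As an illustration of how a client might exercise these routes over HTTP (a minimal sketch assuming a Llama Stack server on localhost:8321; `requests` here stands in for whatever HTTP client you use, and only the unambiguous GET/POST routes are shown):

```python
import requests

BASE_URL = "http://127.0.0.1:8321/v1"  # assumed local server address

# Create a prompt; declared variables must cover every {{ placeholder }}.
created = requests.post(
    f"{BASE_URL}/prompts",
    json={"prompt": "Summarize {{ topic }} briefly.", "variables": ["topic"]},
).json()

prompt_id = created["prompt_id"]  # e.g. "pmpt_<48 hex chars>"

# Fetch the default (latest) version, then enumerate all versions.
latest = requests.get(f"{BASE_URL}/prompts/{prompt_id}").json()
versions = requests.get(f"{BASE_URL}/prompts/{prompt_id}/versions").json()["data"]
```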

View file

@@ -19,6 +19,7 @@ class ResourceType(StrEnum):
    benchmark = "benchmark"
    tool = "tool"
    tool_group = "tool_group"
    prompt = "prompt"

class Resource(BaseModel):

View file

@@ -45,6 +45,7 @@ from llama_stack.core.utils.dynamic import instantiate_class_type
from llama_stack.core.utils.exec import formulate_run_args, run_command
from llama_stack.core.utils.image_types import LlamaStackImageType
from llama_stack.providers.datatypes import Api
from llama_stack.providers.utils.sqlstore.sqlstore import SqliteSqlStoreConfig

DISTRIBS_PATH = Path(__file__).parent.parent.parent / "distributions"
@@ -294,6 +295,12 @@ def _generate_run_config(
        if build_config.external_providers_dir
        else EXTERNAL_PROVIDERS_DIR,
    )
    if not run_config.inference_store:
        run_config.inference_store = SqliteSqlStoreConfig(
            **SqliteSqlStoreConfig.sample_run_config(
                __distro_dir__=(DISTRIBS_BASE_DIR / image_name).as_posix(), db_name="inference_store.db"
            )
        )

    # build providers dict
    provider_registry = get_provider_registry(build_config)
    for api in apis:

View file

@@ -7,6 +7,7 @@
from enum import StrEnum
from pathlib import Path
from typing import Annotated, Any, Literal, Self
from urllib.parse import urlparse

from pydantic import BaseModel, Field, field_validator, model_validator
@@ -212,6 +213,7 @@ class AuthProviderType(StrEnum):
    OAUTH2_TOKEN = "oauth2_token"
    GITHUB_TOKEN = "github_token"
    CUSTOM = "custom"
    KUBERNETES = "kubernetes"

class OAuth2TokenAuthConfig(BaseModel):
@@ -282,8 +284,45 @@ class GitHubTokenAuthConfig(BaseModel):
    )
class KubernetesAuthProviderConfig(BaseModel):
    """Configuration for Kubernetes authentication provider."""

    type: Literal[AuthProviderType.KUBERNETES] = AuthProviderType.KUBERNETES
    api_server_url: str = Field(
        default="https://kubernetes.default.svc",
        description="Kubernetes API server URL (e.g., https://api.cluster.domain:6443)",
    )
    verify_tls: bool = Field(default=True, description="Whether to verify TLS certificates")
    tls_cafile: Path | None = Field(default=None, description="Path to CA certificate file for TLS verification")
    claims_mapping: dict[str, str] = Field(
        default_factory=lambda: {
            "username": "roles",
            "groups": "roles",
        },
        description="Mapping of Kubernetes user claims to access attributes",
    )

    @field_validator("api_server_url")
    @classmethod
    def validate_api_server_url(cls, v):
        parsed = urlparse(v)
        if not parsed.scheme or not parsed.netloc:
            raise ValueError(f"api_server_url must be a valid URL with scheme and host: {v}")
        if parsed.scheme not in ["http", "https"]:
            raise ValueError(f"api_server_url scheme must be http or https: {v}")
        return v

    @field_validator("claims_mapping")
    @classmethod
    def validate_claims_mapping(cls, v):
        for key, value in v.items():
            if not value:
                raise ValueError(f"claims_mapping value cannot be empty: {key}")
        return v
AuthProviderConfig = Annotated[
    OAuth2TokenAuthConfig | GitHubTokenAuthConfig | CustomAuthConfig | KubernetesAuthProviderConfig,
    Field(discriminator="type"),
]
@@ -392,6 +431,12 @@ class ServerConfig(BaseModel):
    )

class InferenceStoreConfig(BaseModel):
    sql_store_config: SqlStoreConfig
    max_write_queue_size: int = Field(default=10000, description="Max queued writes for inference store")
    num_writers: int = Field(default=4, description="Number of concurrent background writers")
class StackRunConfig(BaseModel): class StackRunConfig(BaseModel):
version: int = LLAMA_STACK_RUN_CONFIG_VERSION version: int = LLAMA_STACK_RUN_CONFIG_VERSION
@ -425,11 +470,12 @@ Configuration for the persistence store used by the distribution registry. If no
a default SQLite store will be used.""", a default SQLite store will be used.""",
) )
inference_store: SqlStoreConfig | None = Field( inference_store: InferenceStoreConfig | SqlStoreConfig | None = Field(
default=None, default=None,
description=""" description="""
Configuration for the persistence store used by the inference API. If not specified, Configuration for the persistence store used by the inference API. Can be either an
a default SQLite store will be used.""", InferenceStoreConfig (with queue tuning parameters) or a SqlStoreConfig (deprecated).
If not specified, a default SQLite store will be used.""",
) )
# registry of "resources" in the distribution # registry of "resources" in the distribution

View file

@ -10,7 +10,6 @@ import json
import logging # allow-direct-logging import logging # allow-direct-logging
import os import os
import sys import sys
from concurrent.futures import ThreadPoolExecutor
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
@ -148,7 +147,6 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
self.async_client = AsyncLlamaStackAsLibraryClient( self.async_client = AsyncLlamaStackAsLibraryClient(
config_path_or_distro_name, custom_provider_registry, provider_data, skip_logger_removal config_path_or_distro_name, custom_provider_registry, provider_data, skip_logger_removal
) )
self.pool_executor = ThreadPoolExecutor(max_workers=4)
self.provider_data = provider_data self.provider_data = provider_data
self.loop = asyncio.new_event_loop() self.loop = asyncio.new_event_loop()

View file

@ -0,0 +1,233 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
from typing import Any
from pydantic import BaseModel
from llama_stack.apis.prompts import ListPromptsResponse, Prompt, Prompts
from llama_stack.core.datatypes import StackRunConfig
from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR
from llama_stack.providers.utils.kvstore import KVStore, kvstore_impl
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
class PromptServiceConfig(BaseModel):
"""Configuration for the built-in prompt service.
:param run_config: Stack run configuration containing distribution info
"""
run_config: StackRunConfig
async def get_provider_impl(config: PromptServiceConfig, deps: dict[Any, Any]):
"""Get the prompt service implementation."""
impl = PromptServiceImpl(config, deps)
await impl.initialize()
return impl
class PromptServiceImpl(Prompts):
"""Built-in prompt service implementation using KVStore."""
def __init__(self, config: PromptServiceConfig, deps: dict[Any, Any]):
self.config = config
self.deps = deps
self.kvstore: KVStore
async def initialize(self) -> None:
kvstore_config = SqliteKVStoreConfig(
db_path=(DISTRIBS_BASE_DIR / self.config.run_config.image_name / "prompts.db").as_posix()
)
self.kvstore = await kvstore_impl(kvstore_config)
def _get_default_key(self, prompt_id: str) -> str:
"""Get the KVStore key that stores the default version number."""
return f"prompts:v1:{prompt_id}:default"
async def _get_prompt_key(self, prompt_id: str, version: int | None = None) -> str:
"""Get the KVStore key for prompt data, returning default version if applicable."""
if version:
return self._get_version_key(prompt_id, str(version))
default_key = self._get_default_key(prompt_id)
resolved_version = await self.kvstore.get(default_key)
if resolved_version is None:
raise ValueError(f"Prompt {prompt_id}:default not found")
return self._get_version_key(prompt_id, resolved_version)
def _get_version_key(self, prompt_id: str, version: str) -> str:
"""Get the KVStore key for a specific prompt version."""
return f"prompts:v1:{prompt_id}:{version}"
def _get_list_key_prefix(self) -> str:
"""Get the key prefix for listing prompts."""
return "prompts:v1:"
def _serialize_prompt(self, prompt: Prompt) -> str:
"""Serialize a prompt to JSON string for storage."""
return json.dumps(
{
"prompt_id": prompt.prompt_id,
"prompt": prompt.prompt,
"version": prompt.version,
"variables": prompt.variables or [],
"is_default": prompt.is_default,
}
)
def _deserialize_prompt(self, data: str) -> Prompt:
"""Deserialize a prompt from JSON string."""
obj = json.loads(data)
return Prompt(
prompt_id=obj["prompt_id"],
prompt=obj["prompt"],
version=obj["version"],
variables=obj.get("variables", []),
is_default=obj.get("is_default", False),
)
async def list_prompts(self) -> ListPromptsResponse:
"""List all prompts (default versions only)."""
prefix = self._get_list_key_prefix()
keys = await self.kvstore.keys_in_range(prefix, prefix + "\xff")
prompts = []
for key in keys:
if key.endswith(":default"):
try:
default_version = await self.kvstore.get(key)
if default_version:
prompt_id = key.replace(prefix, "").replace(":default", "")
version_key = self._get_version_key(prompt_id, default_version)
data = await self.kvstore.get(version_key)
if data:
prompt = self._deserialize_prompt(data)
prompts.append(prompt)
except (json.JSONDecodeError, KeyError):
continue
prompts.sort(key=lambda p: p.prompt_id or "", reverse=True)
return ListPromptsResponse(data=prompts)
async def get_prompt(self, prompt_id: str, version: int | None = None) -> Prompt:
"""Get a prompt by its identifier and optional version."""
key = await self._get_prompt_key(prompt_id, version)
data = await self.kvstore.get(key)
if data is None:
raise ValueError(f"Prompt {prompt_id}:{version if version else 'default'} not found")
return self._deserialize_prompt(data)
async def create_prompt(
self,
prompt: str,
variables: list[str] | None = None,
) -> Prompt:
"""Create a new prompt."""
if variables is None:
variables = []
prompt_obj = Prompt(
prompt_id=Prompt.generate_prompt_id(),
prompt=prompt,
version=1,
variables=variables,
)
version_key = self._get_version_key(prompt_obj.prompt_id, str(prompt_obj.version))
data = self._serialize_prompt(prompt_obj)
await self.kvstore.set(version_key, data)
default_key = self._get_default_key(prompt_obj.prompt_id)
await self.kvstore.set(default_key, str(prompt_obj.version))
return prompt_obj
async def update_prompt(
self,
prompt_id: str,
prompt: str,
version: int,
variables: list[str] | None = None,
set_as_default: bool = True,
) -> Prompt:
"""Update an existing prompt (increments version)."""
if version < 1:
raise ValueError("Version must be >= 1")
if variables is None:
variables = []
prompt_versions = await self.list_prompt_versions(prompt_id)
latest_prompt = max(prompt_versions.data, key=lambda x: int(x.version))
if version and latest_prompt.version != version:
raise ValueError(
f"'{version}' is not the latest prompt version for prompt_id='{prompt_id}'. Use the latest version '{latest_prompt.version}' in request."
)
current_version = latest_prompt.version if version is None else version
new_version = current_version + 1
updated_prompt = Prompt(prompt_id=prompt_id, prompt=prompt, version=new_version, variables=variables)
version_key = self._get_version_key(prompt_id, str(new_version))
data = self._serialize_prompt(updated_prompt)
await self.kvstore.set(version_key, data)
if set_as_default:
await self.set_default_version(prompt_id, new_version)
return updated_prompt
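A hedged usage sketch of the create/update flow (run inside an async context; the prompt text is arbitrary):

p = await prompts.create_prompt(prompt="Summarize: {{ text }}", variables=["text"])
assert p.version == 1
# update_prompt must be called with the current latest version and bumps it by one.
p2 = await prompts.update_prompt(prompt_id=p.prompt_id, prompt="TL;DR: {{ text }}", version=1, variables=["text"])
assert p2.version == 2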
async def delete_prompt(self, prompt_id: str) -> None:
"""Delete a prompt and all its versions."""
await self.get_prompt(prompt_id)
prefix = f"prompts:v1:{prompt_id}:"
keys = await self.kvstore.keys_in_range(prefix, prefix + "\xff")
for key in keys:
await self.kvstore.delete(key)
async def list_prompt_versions(self, prompt_id: str) -> ListPromptsResponse:
"""List all versions of a specific prompt."""
prefix = f"prompts:v1:{prompt_id}:"
keys = await self.kvstore.keys_in_range(prefix, prefix + "\xff")
default_version = None
prompts = []
for key in keys:
data = await self.kvstore.get(key)
if key.endswith(":default"):
default_version = data
else:
if data:
prompt_obj = self._deserialize_prompt(data)
prompts.append(prompt_obj)
if not prompts:
raise ValueError(f"Prompt {prompt_id} not found")
for prompt in prompts:
prompt.is_default = str(prompt.version) == default_version
prompts.sort(key=lambda x: x.version)
return ListPromptsResponse(data=prompts)
async def set_default_version(self, prompt_id: str, version: int) -> Prompt:
"""Set which version of a prompt should be the default, If not set. the default is the latest."""
version_key = self._get_version_key(prompt_id, str(version))
data = await self.kvstore.get(version_key)
if data is None:
raise ValueError(f"Prompt {prompt_id} version {version} not found")
default_key = self._get_default_key(prompt_id)
await self.kvstore.set(default_key, str(version))
return self._deserialize_prompt(data)

View file

@ -19,6 +19,7 @@ from llama_stack.apis.inference import Inference, InferenceProvider
from llama_stack.apis.inspect import Inspect from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models from llama_stack.apis.models import Models
from llama_stack.apis.post_training import PostTraining from llama_stack.apis.post_training import PostTraining
from llama_stack.apis.prompts import Prompts
from llama_stack.apis.providers import Providers as ProvidersAPI from llama_stack.apis.providers import Providers as ProvidersAPI
from llama_stack.apis.safety import Safety from llama_stack.apis.safety import Safety
from llama_stack.apis.scoring import Scoring from llama_stack.apis.scoring import Scoring
@ -93,6 +94,7 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
Api.tool_groups: ToolGroups, Api.tool_groups: ToolGroups,
Api.tool_runtime: ToolRuntime, Api.tool_runtime: ToolRuntime,
Api.files: Files, Api.files: Files,
Api.prompts: Prompts,
} }
if external_apis: if external_apis:
@ -284,7 +286,15 @@ async def instantiate_providers(
if provider.provider_id is None: if provider.provider_id is None:
continue continue
deps = {a: impls[a] for a in provider.spec.api_dependencies} try:
deps = {a: impls[a] for a in provider.spec.api_dependencies}
except KeyError as e:
missing_api = e.args[0]
raise RuntimeError(
f"Failed to resolve '{provider.spec.api.value}' provider '{provider.provider_id}' of type '{provider.spec.provider_type}': "
f"required dependency '{missing_api.value}' is not available. "
f"Please add a '{missing_api.value}' provider to your configuration or check if the provider is properly configured."
) from e
for a in provider.spec.optional_api_dependencies: for a in provider.spec.optional_api_dependencies:
if a in impls: if a in impls:
deps[a] = impls[a] deps[a] = impls[a]
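For example (provider and API names assumed), if a tool_runtime provider declares a required dependency on the files API and no files provider is configured, resolution now fails with a message like:

RuntimeError: Failed to resolve 'tool_runtime' provider 'rag-runtime' of type 'inline::rag-runtime': required dependency 'files' is not available. Please add a 'files' provider to your configuration or check if the provider is properly configured.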

View file

@ -78,7 +78,10 @@ async def get_auto_router_impl(
# TODO: move pass configs to routers instead # TODO: move pass configs to routers instead
if api == Api.inference and run_config.inference_store: if api == Api.inference and run_config.inference_store:
inference_store = InferenceStore(run_config.inference_store, policy) inference_store = InferenceStore(
config=run_config.inference_store,
policy=policy,
)
await inference_store.initialize() await inference_store.initialize()
api_to_dep_impl["store"] = inference_store api_to_dep_impl["store"] = inference_store

View file

@ -63,7 +63,7 @@ from llama_stack.models.llama.llama3.chat_format import ChatFormat
from llama_stack.models.llama.llama3.tokenizer import Tokenizer from llama_stack.models.llama.llama3.tokenizer import Tokenizer
from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable from llama_stack.providers.datatypes import HealthResponse, HealthStatus, RoutingTable
from llama_stack.providers.utils.inference.inference_store import InferenceStore from llama_stack.providers.utils.inference.inference_store import InferenceStore
from llama_stack.providers.utils.telemetry.tracing import get_current_span from llama_stack.providers.utils.telemetry.tracing import enqueue_event, get_current_span
logger = get_logger(name=__name__, category="core::routers") logger = get_logger(name=__name__, category="core::routers")
@ -90,6 +90,11 @@ class InferenceRouter(Inference):
async def shutdown(self) -> None: async def shutdown(self) -> None:
logger.debug("InferenceRouter.shutdown") logger.debug("InferenceRouter.shutdown")
if self.store:
try:
await self.store.shutdown()
except Exception as e:
logger.warning(f"Error during InferenceStore shutdown: {e}")
async def register_model( async def register_model(
self, self,
@ -160,7 +165,7 @@ class InferenceRouter(Inference):
metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model) metrics = self._construct_metrics(prompt_tokens, completion_tokens, total_tokens, model)
if self.telemetry: if self.telemetry:
for metric in metrics: for metric in metrics:
await self.telemetry.log_event(metric) enqueue_event(metric)
return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics] return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]
async def _count_tokens( async def _count_tokens(
@ -431,7 +436,7 @@ class InferenceRouter(Inference):
model=model_obj, model=model_obj,
) )
for metric in metrics: for metric in metrics:
await self.telemetry.log_event(metric) enqueue_event(metric)
# these metrics will show up in the client response. # these metrics will show up in the client response.
response.metrics = ( response.metrics = (
@ -527,7 +532,7 @@ class InferenceRouter(Inference):
# Store the response with the ID that will be returned to the client # Store the response with the ID that will be returned to the client
if self.store: if self.store:
await self.store.store_chat_completion(response, messages) asyncio.create_task(self.store.store_chat_completion(response, messages))
if self.telemetry: if self.telemetry:
metrics = self._construct_metrics( metrics = self._construct_metrics(
@ -537,7 +542,7 @@ class InferenceRouter(Inference):
model=model_obj, model=model_obj,
) )
for metric in metrics: for metric in metrics:
await self.telemetry.log_event(metric) enqueue_event(metric)
# these metrics will show up in the client response. # these metrics will show up in the client response.
response.metrics = ( response.metrics = (
metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
@ -664,7 +669,7 @@ class InferenceRouter(Inference):
"completion_tokens", "completion_tokens",
"total_tokens", "total_tokens",
]: # Only log completion and total tokens ]: # Only log completion and total tokens
await self.telemetry.log_event(metric) enqueue_event(metric)
# Return metrics in response # Return metrics in response
async_metrics = [ async_metrics = [
@ -710,7 +715,7 @@ class InferenceRouter(Inference):
) )
for metric in completion_metrics: for metric in completion_metrics:
if metric.metric in ["completion_tokens", "total_tokens"]: # Only log completion and total tokens if metric.metric in ["completion_tokens", "total_tokens"]: # Only log completion and total tokens
await self.telemetry.log_event(metric) enqueue_event(metric)
# Return metrics in response # Return metrics in response
return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics] return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics]
@ -755,7 +760,7 @@ class InferenceRouter(Inference):
choices_data[idx] = { choices_data[idx] = {
"content_parts": [], "content_parts": [],
"tool_calls_builder": {}, "tool_calls_builder": {},
"finish_reason": None, "finish_reason": "stop",
"logprobs_content_parts": [], "logprobs_content_parts": [],
} }
current_choice_data = choices_data[idx] current_choice_data = choices_data[idx]
@ -806,7 +811,7 @@ class InferenceRouter(Inference):
model=model, model=model,
) )
for metric in metrics: for metric in metrics:
await self.telemetry.log_event(metric) enqueue_event(metric)
yield chunk yield chunk
finally: finally:
@ -855,4 +860,4 @@ class InferenceRouter(Inference):
object="chat.completion", object="chat.completion",
) )
logger.debug(f"InferenceRouter.completion_response: {final_response}") logger.debug(f"InferenceRouter.completion_response: {final_response}")
await self.store.store_chat_completion(final_response, messages) asyncio.create_task(self.store.store_chat_completion(final_response, messages))

View file

@ -52,7 +52,6 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
provider_vector_db_id: str | None = None, provider_vector_db_id: str | None = None,
vector_db_name: str | None = None, vector_db_name: str | None = None,
) -> VectorDB: ) -> VectorDB:
provider_vector_db_id = provider_vector_db_id or vector_db_id
if provider_id is None: if provider_id is None:
if len(self.impls_by_provider_id) > 0: if len(self.impls_by_provider_id) > 0:
provider_id = list(self.impls_by_provider_id.keys())[0] provider_id = list(self.impls_by_provider_id.keys())[0]
@ -69,14 +68,33 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
raise ModelTypeError(embedding_model, model.model_type, ModelType.embedding) raise ModelTypeError(embedding_model, model.model_type, ModelType.embedding)
if "embedding_dimension" not in model.metadata: if "embedding_dimension" not in model.metadata:
raise ValueError(f"Model {embedding_model} does not have an embedding dimension") raise ValueError(f"Model {embedding_model} does not have an embedding dimension")
provider = self.impls_by_provider_id[provider_id]
logger.warning(
"VectorDB is being deprecated in future releases in favor of VectorStore. Please migrate your usage accordingly."
)
vector_store = await provider.openai_create_vector_store(
name=vector_db_name or vector_db_id,
embedding_model=embedding_model,
embedding_dimension=model.metadata["embedding_dimension"],
provider_id=provider_id,
provider_vector_db_id=provider_vector_db_id,
)
vector_store_id = vector_store.id
actual_provider_vector_db_id = provider_vector_db_id or vector_store_id
logger.warning(
f"Ignoring vector_db_id {vector_db_id} and using vector_store_id {vector_store_id} instead. Setting VectorDB {vector_db_id} to VectorDB.vector_db_name"
)
vector_db_data = { vector_db_data = {
"identifier": vector_db_id, "identifier": vector_store_id,
"type": ResourceType.vector_db.value, "type": ResourceType.vector_db.value,
"provider_id": provider_id, "provider_id": provider_id,
"provider_resource_id": provider_vector_db_id, "provider_resource_id": actual_provider_vector_db_id,
"embedding_model": embedding_model, "embedding_model": embedding_model,
"embedding_dimension": model.metadata["embedding_dimension"], "embedding_dimension": model.metadata["embedding_dimension"],
"vector_db_name": vector_db_name, "vector_db_name": vector_store.name,
} }
vector_db = TypeAdapter(VectorDBWithOwner).validate_python(vector_db_data) vector_db = TypeAdapter(VectorDBWithOwner).validate_python(vector_db_data)
await self.register_object(vector_db) await self.register_object(vector_db)

View file

@ -8,16 +8,18 @@ import ssl
import time import time
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from asyncio import Lock from asyncio import Lock
from urllib.parse import parse_qs, urlparse from urllib.parse import parse_qs, urljoin, urlparse
import httpx import httpx
from jose import jwt from jose import jwt
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from llama_stack.apis.common.errors import TokenValidationError
from llama_stack.core.datatypes import ( from llama_stack.core.datatypes import (
AuthenticationConfig, AuthenticationConfig,
CustomAuthConfig, CustomAuthConfig,
GitHubTokenAuthConfig, GitHubTokenAuthConfig,
KubernetesAuthProviderConfig,
OAuth2TokenAuthConfig, OAuth2TokenAuthConfig,
User, User,
) )
@ -162,7 +164,7 @@ class OAuth2TokenAuthProvider(AuthProvider):
auth=auth, auth=auth,
timeout=10.0, # Add a reasonable timeout timeout=10.0, # Add a reasonable timeout
) )
if response.status_code != 200: if response.status_code != httpx.codes.OK:
logger.warning(f"Token introspection failed with status code: {response.status_code}") logger.warning(f"Token introspection failed with status code: {response.status_code}")
raise ValueError(f"Token introspection failed: {response.status_code}") raise ValueError(f"Token introspection failed: {response.status_code}")
@ -272,7 +274,7 @@ class CustomAuthProvider(AuthProvider):
json=auth_request.model_dump(), json=auth_request.model_dump(),
timeout=10.0, # Add a reasonable timeout timeout=10.0, # Add a reasonable timeout
) )
if response.status_code != 200: if response.status_code != httpx.codes.OK:
logger.warning(f"Authentication failed with status code: {response.status_code}") logger.warning(f"Authentication failed with status code: {response.status_code}")
raise ValueError(f"Authentication failed: {response.status_code}") raise ValueError(f"Authentication failed: {response.status_code}")
@ -374,6 +376,89 @@ async def _get_github_user_info(access_token: str, github_api_base_url: str) ->
} }
class KubernetesAuthProvider(AuthProvider):
"""
Kubernetes authentication provider that validates tokens using the Kubernetes SelfSubjectReview API.
This provider integrates with the Kubernetes API server by calling the
/apis/authentication.k8s.io/v1/selfsubjectreviews endpoint to validate tokens and extract user information.
"""
def __init__(self, config: KubernetesAuthProviderConfig):
self.config = config
def _httpx_verify_value(self) -> bool | str:
"""
Build the value for httpx's `verify` parameter.
- False disables verification.
- Path string points to a CA bundle.
- True uses system defaults.
"""
if not self.config.verify_tls:
return False
if self.config.tls_cafile:
return self.config.tls_cafile.as_posix()
return True
async def validate_token(self, token: str, scope: dict | None = None) -> User:
"""Validate a token using Kubernetes SelfSubjectReview API endpoint."""
# Build the Kubernetes SelfSubjectReview API endpoint URL
review_api_url = urljoin(self.config.api_server_url, "/apis/authentication.k8s.io/v1/selfsubjectreviews")
# Create SelfSubjectReview request body
review_request = {"apiVersion": "authentication.k8s.io/v1", "kind": "SelfSubjectReview"}
verify = self._httpx_verify_value()
try:
async with httpx.AsyncClient(verify=verify, timeout=10.0) as client:
response = await client.post(
review_api_url,
json=review_request,
headers={
"Authorization": f"Bearer {token}",
"Content-Type": "application/json",
},
)
if response.status_code == httpx.codes.UNAUTHORIZED:
raise TokenValidationError("Invalid token")
if response.status_code != httpx.codes.CREATED:
logger.warning(f"Kubernetes SelfSubjectReview API failed with status code: {response.status_code}")
raise TokenValidationError(f"Token validation failed: {response.status_code}")
review_response = response.json()
# Extract user information from SelfSubjectReview response
status = review_response.get("status", {})
if not status:
raise ValueError("No status found in SelfSubjectReview response")
user_info = status.get("userInfo", {})
if not user_info:
raise ValueError("No userInfo found in SelfSubjectReview response")
username = user_info.get("username")
if not username:
raise ValueError("No username found in SelfSubjectReview response")
# Build user attributes from Kubernetes user info
user_attributes = get_attributes_from_claims(user_info, self.config.claims_mapping)
return User(
principal=username,
attributes=user_attributes,
)
except httpx.TimeoutException:
logger.warning("Kubernetes SelfSubjectReview API request timed out")
raise ValueError("Token validation timeout") from None
except Exception as e:
logger.warning(f"Error during token validation: {str(e)}")
raise ValueError(f"Token validation error: {str(e)}") from e
async def close(self):
"""Close any resources."""
pass
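For reference, the SelfSubjectReview response shape this provider parses (field values are illustrative; get_attributes_from_claims then maps the userInfo claims through claims_mapping):

review_response = {
    "status": {
        "userInfo": {
            "username": "system:serviceaccount:default:my-sa",
            "groups": ["system:serviceaccounts", "system:authenticated"],
        }
    }
}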
def create_auth_provider(config: AuthenticationConfig) -> AuthProvider: def create_auth_provider(config: AuthenticationConfig) -> AuthProvider:
"""Factory function to create the appropriate auth provider.""" """Factory function to create the appropriate auth provider."""
provider_config = config.provider_config provider_config = config.provider_config
@ -384,5 +469,7 @@ def create_auth_provider(config: AuthenticationConfig) -> AuthProvider:
return OAuth2TokenAuthProvider(provider_config) return OAuth2TokenAuthProvider(provider_config)
elif isinstance(provider_config, GitHubTokenAuthConfig): elif isinstance(provider_config, GitHubTokenAuthConfig):
return GitHubTokenAuthProvider(provider_config) return GitHubTokenAuthProvider(provider_config)
elif isinstance(provider_config, KubernetesAuthProviderConfig):
return KubernetesAuthProvider(provider_config)
else: else:
raise ValueError(f"Unknown authentication provider config type: {type(provider_config)}") raise ValueError(f"Unknown authentication provider config type: {type(provider_config)}")

View file

@ -132,15 +132,17 @@ def translate_exception(exc: Exception) -> HTTPException | RequestValidationErro
}, },
) )
elif isinstance(exc, ConflictError): elif isinstance(exc, ConflictError):
return HTTPException(status_code=409, detail=str(exc)) return HTTPException(status_code=httpx.codes.CONFLICT, detail=str(exc))
elif isinstance(exc, ResourceNotFoundError): elif isinstance(exc, ResourceNotFoundError):
return HTTPException(status_code=404, detail=str(exc)) return HTTPException(status_code=httpx.codes.NOT_FOUND, detail=str(exc))
elif isinstance(exc, ValueError): elif isinstance(exc, ValueError):
return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=f"Invalid value: {str(exc)}") return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=f"Invalid value: {str(exc)}")
elif isinstance(exc, BadRequestError): elif isinstance(exc, BadRequestError):
return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=str(exc)) return HTTPException(status_code=httpx.codes.BAD_REQUEST, detail=str(exc))
elif isinstance(exc, PermissionError | AccessDeniedError): elif isinstance(exc, PermissionError | AccessDeniedError):
return HTTPException(status_code=httpx.codes.FORBIDDEN, detail=f"Permission denied: {str(exc)}") return HTTPException(status_code=httpx.codes.FORBIDDEN, detail=f"Permission denied: {str(exc)}")
elif isinstance(exc, ConnectionError | httpx.ConnectError):
return HTTPException(status_code=httpx.codes.BAD_GATEWAY, detail=str(exc))
elif isinstance(exc, asyncio.TimeoutError | TimeoutError): elif isinstance(exc, asyncio.TimeoutError | TimeoutError):
return HTTPException(status_code=httpx.codes.GATEWAY_TIMEOUT, detail=f"Operation timed out: {str(exc)}") return HTTPException(status_code=httpx.codes.GATEWAY_TIMEOUT, detail=f"Operation timed out: {str(exc)}")
elif isinstance(exc, NotImplementedError): elif isinstance(exc, NotImplementedError):
@ -513,6 +515,7 @@ def main(args: argparse.Namespace | None = None):
apis_to_serve.add("inspect") apis_to_serve.add("inspect")
apis_to_serve.add("providers") apis_to_serve.add("providers")
apis_to_serve.add("prompts")
for api_str in apis_to_serve: for api_str in apis_to_serve:
api = Api(api_str) api = Api(api_str)

View file

@ -24,6 +24,7 @@ from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models from llama_stack.apis.models import Models
from llama_stack.apis.post_training import PostTraining from llama_stack.apis.post_training import PostTraining
from llama_stack.apis.prompts import Prompts
from llama_stack.apis.providers import Providers from llama_stack.apis.providers import Providers
from llama_stack.apis.safety import Safety from llama_stack.apis.safety import Safety
from llama_stack.apis.scoring import Scoring from llama_stack.apis.scoring import Scoring
@ -37,6 +38,7 @@ from llama_stack.apis.vector_io import VectorIO
from llama_stack.core.datatypes import Provider, StackRunConfig from llama_stack.core.datatypes import Provider, StackRunConfig
from llama_stack.core.distribution import get_provider_registry from llama_stack.core.distribution import get_provider_registry
from llama_stack.core.inspect import DistributionInspectConfig, DistributionInspectImpl from llama_stack.core.inspect import DistributionInspectConfig, DistributionInspectImpl
from llama_stack.core.prompts.prompts import PromptServiceConfig, PromptServiceImpl
from llama_stack.core.providers import ProviderImpl, ProviderImplConfig from llama_stack.core.providers import ProviderImpl, ProviderImplConfig
from llama_stack.core.resolver import ProviderRegistry, resolve_impls from llama_stack.core.resolver import ProviderRegistry, resolve_impls
from llama_stack.core.routing_tables.common import CommonRoutingTableImpl from llama_stack.core.routing_tables.common import CommonRoutingTableImpl
@ -72,6 +74,7 @@ class LlamaStack(
ToolRuntime, ToolRuntime,
RAGToolRuntime, RAGToolRuntime,
Files, Files,
Prompts,
): ):
pass pass
@ -305,6 +308,12 @@ def add_internal_implementations(impls: dict[Api, Any], run_config: StackRunConf
) )
impls[Api.providers] = providers_impl impls[Api.providers] = providers_impl
prompts_impl = PromptServiceImpl(
PromptServiceConfig(run_config=run_config),
deps=impls,
)
impls[Api.prompts] = prompts_impl
# Produces a stack of providers for the given run config. Not all APIs may be # Produces a stack of providers for the given run config. Not all APIs may be
# asked for in the run config. # asked for in the run config.
@ -329,6 +338,9 @@ async def construct_stack(
# Add internal implementations after all other providers are resolved # Add internal implementations after all other providers are resolved
add_internal_implementations(impls, run_config) add_internal_implementations(impls, run_config)
if Api.prompts in impls:
await impls[Api.prompts].initialize()
await register_resources(run_config, impls) await register_resources(run_config, impls)
await refresh_registry_once(impls) await refresh_registry_once(impls)

View file

@ -17,6 +17,7 @@ distribution_spec:
- provider_type: remote::vertexai - provider_type: remote::vertexai
- provider_type: remote::groq - provider_type: remote::groq
- provider_type: remote::sambanova - provider_type: remote::sambanova
- provider_type: remote::azure
- provider_type: inline::sentence-transformers - provider_type: inline::sentence-transformers
vector_io: vector_io:
- provider_type: inline::faiss - provider_type: inline::faiss

View file

@ -11,9 +11,7 @@ from ..starter.starter import get_distribution_template as get_starter_distribut
def get_distribution_template() -> DistributionTemplate: def get_distribution_template() -> DistributionTemplate:
template = get_starter_distribution_template() template = get_starter_distribution_template(name="ci-tests")
name = "ci-tests"
template.name = name
template.description = "CI tests for Llama Stack" template.description = "CI tests for Llama Stack"
return template return template

View file

@ -81,6 +81,13 @@ providers:
config: config:
url: https://api.sambanova.ai/v1 url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=} api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure
config:
api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers - provider_id: sentence-transformers
provider_type: inline::sentence-transformers provider_type: inline::sentence-transformers
vector_io: vector_io:
@ -89,28 +96,28 @@ providers:
config: config:
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/faiss_store.db
- provider_id: sqlite-vec - provider_id: sqlite-vec
provider_type: inline::sqlite-vec provider_type: inline::sqlite-vec
config: config:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/sqlite_vec.db
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/sqlite_vec_registry.db
- provider_id: ${env.MILVUS_URL:+milvus} - provider_id: ${env.MILVUS_URL:+milvus}
provider_type: inline::milvus provider_type: inline::milvus
config: config:
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/ci-tests}/milvus.db
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/milvus_registry.db
- provider_id: ${env.CHROMADB_URL:+chromadb} - provider_id: ${env.CHROMADB_URL:+chromadb}
provider_type: remote::chromadb provider_type: remote::chromadb
config: config:
url: ${env.CHROMADB_URL:=} url: ${env.CHROMADB_URL:=}
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests/}/chroma_remote_registry.db
- provider_id: ${env.PGVECTOR_DB:+pgvector} - provider_id: ${env.PGVECTOR_DB:+pgvector}
provider_type: remote::pgvector provider_type: remote::pgvector
config: config:
@ -121,15 +128,15 @@ providers:
password: ${env.PGVECTOR_PASSWORD:=} password: ${env.PGVECTOR_PASSWORD:=}
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/pgvector_registry.db
files: files:
- provider_id: meta-reference-files - provider_id: meta-reference-files
provider_type: inline::localfs provider_type: inline::localfs
config: config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/ci-tests/files}
metadata_store: metadata_store:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/files_metadata.db
safety: safety:
- provider_id: llama-guard - provider_id: llama-guard
provider_type: inline::llama-guard provider_type: inline::llama-guard

View file

@ -18,6 +18,7 @@ distribution_spec:
- provider_type: remote::vertexai - provider_type: remote::vertexai
- provider_type: remote::groq - provider_type: remote::groq
- provider_type: remote::sambanova - provider_type: remote::sambanova
- provider_type: remote::azure
- provider_type: inline::sentence-transformers - provider_type: inline::sentence-transformers
vector_io: vector_io:
- provider_type: inline::faiss - provider_type: inline::faiss

View file

@ -81,6 +81,13 @@ providers:
config: config:
url: https://api.sambanova.ai/v1 url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=} api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure
config:
api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers - provider_id: sentence-transformers
provider_type: inline::sentence-transformers provider_type: inline::sentence-transformers
vector_io: vector_io:
@ -89,28 +96,28 @@ providers:
config: config:
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/faiss_store.db
- provider_id: sqlite-vec - provider_id: sqlite-vec
provider_type: inline::sqlite-vec provider_type: inline::sqlite-vec
config: config:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec.db
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/sqlite_vec_registry.db
- provider_id: ${env.MILVUS_URL:+milvus} - provider_id: ${env.MILVUS_URL:+milvus}
provider_type: inline::milvus provider_type: inline::milvus
config: config:
db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter-gpu}/milvus.db
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/milvus_registry.db
- provider_id: ${env.CHROMADB_URL:+chromadb} - provider_id: ${env.CHROMADB_URL:+chromadb}
provider_type: remote::chromadb provider_type: remote::chromadb
config: config:
url: ${env.CHROMADB_URL:=} url: ${env.CHROMADB_URL:=}
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu/}/chroma_remote_registry.db
- provider_id: ${env.PGVECTOR_DB:+pgvector} - provider_id: ${env.PGVECTOR_DB:+pgvector}
provider_type: remote::pgvector provider_type: remote::pgvector
config: config:
@ -121,15 +128,15 @@ providers:
password: ${env.PGVECTOR_PASSWORD:=} password: ${env.PGVECTOR_PASSWORD:=}
kvstore: kvstore:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/pgvector_registry.db
files: files:
- provider_id: meta-reference-files - provider_id: meta-reference-files
provider_type: inline::localfs provider_type: inline::localfs
config: config:
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter-gpu/files}
metadata_store: metadata_store:
type: sqlite type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter-gpu}/files_metadata.db
safety: safety:
- provider_id: llama-guard - provider_id: llama-guard
provider_type: inline::llama-guard provider_type: inline::llama-guard

View file

@ -11,9 +11,7 @@ from ..starter.starter import get_distribution_template as get_starter_distribut
def get_distribution_template() -> DistributionTemplate: def get_distribution_template() -> DistributionTemplate:
template = get_starter_distribution_template() template = get_starter_distribution_template(name="starter-gpu")
name = "starter-gpu"
template.name = name
template.description = "Quick start template for running Llama Stack with several popular providers. This distribution is intended for GPU-enabled environments." template.description = "Quick start template for running Llama Stack with several popular providers. This distribution is intended for GPU-enabled environments."
template.providers["post_training"] = [ template.providers["post_training"] = [

View file

@ -18,6 +18,7 @@ distribution_spec:
- provider_type: remote::vertexai - provider_type: remote::vertexai
- provider_type: remote::groq - provider_type: remote::groq
- provider_type: remote::sambanova - provider_type: remote::sambanova
- provider_type: remote::azure
- provider_type: inline::sentence-transformers - provider_type: inline::sentence-transformers
vector_io: vector_io:
- provider_type: inline::faiss - provider_type: inline::faiss

View file

@ -81,6 +81,13 @@ providers:
config: config:
url: https://api.sambanova.ai/v1 url: https://api.sambanova.ai/v1
api_key: ${env.SAMBANOVA_API_KEY:=} api_key: ${env.SAMBANOVA_API_KEY:=}
- provider_id: ${env.AZURE_API_KEY:+azure}
provider_type: remote::azure
config:
api_key: ${env.AZURE_API_KEY:=}
api_base: ${env.AZURE_API_BASE:=}
api_version: ${env.AZURE_API_VERSION:=}
api_type: ${env.AZURE_API_TYPE:=}
- provider_id: sentence-transformers - provider_id: sentence-transformers
provider_type: inline::sentence-transformers provider_type: inline::sentence-transformers
vector_io: vector_io:

View file

@ -59,6 +59,7 @@ ENABLED_INFERENCE_PROVIDERS = [
"cerebras", "cerebras",
"nvidia", "nvidia",
"bedrock", "bedrock",
"azure",
] ]
INFERENCE_PROVIDER_IDS = { INFERENCE_PROVIDER_IDS = {
@ -68,6 +69,7 @@ INFERENCE_PROVIDER_IDS = {
"cerebras": "${env.CEREBRAS_API_KEY:+cerebras}", "cerebras": "${env.CEREBRAS_API_KEY:+cerebras}",
"nvidia": "${env.NVIDIA_API_KEY:+nvidia}", "nvidia": "${env.NVIDIA_API_KEY:+nvidia}",
"vertexai": "${env.VERTEX_AI_PROJECT:+vertexai}", "vertexai": "${env.VERTEX_AI_PROJECT:+vertexai}",
"azure": "${env.AZURE_API_KEY:+azure}",
} }
@ -99,9 +101,8 @@ def get_remote_inference_providers() -> list[Provider]:
return inference_providers return inference_providers
def get_distribution_template() -> DistributionTemplate: def get_distribution_template(name: str = "starter") -> DistributionTemplate:
remote_inference_providers = get_remote_inference_providers() remote_inference_providers = get_remote_inference_providers()
name = "starter"
providers = { providers = {
"inference": [BuildProvider(provider_type=p.provider_type, module=p.module) for p in remote_inference_providers] "inference": [BuildProvider(provider_type=p.provider_type, module=p.module) for p in remote_inference_providers]
@ -278,5 +279,21 @@ def get_distribution_template() -> DistributionTemplate:
"http://localhost:11434", "http://localhost:11434",
"Ollama URL", "Ollama URL",
), ),
"AZURE_API_KEY": (
"",
"Azure API Key",
),
"AZURE_API_BASE": (
"",
"Azure API Base",
),
"AZURE_API_VERSION": (
"",
"Azure API Version",
),
"AZURE_API_TYPE": (
"azure",
"Azure API Type",
),
}, },
) )

View file

@ -178,9 +178,9 @@ class ReferenceBatchesImpl(Batches):
# TODO: set expiration time for garbage collection # TODO: set expiration time for garbage collection
if endpoint not in ["/v1/chat/completions"]: if endpoint not in ["/v1/chat/completions", "/v1/completions"]:
raise ValueError( raise ValueError(
f"Invalid endpoint: {endpoint}. Supported values: /v1/chat/completions. Code: invalid_value. Param: endpoint", f"Invalid endpoint: {endpoint}. Supported values: /v1/chat/completions, /v1/completions. Code: invalid_value. Param: endpoint",
) )
if completion_window != "24h": if completion_window != "24h":
@ -424,13 +424,21 @@ class ReferenceBatchesImpl(Batches):
) )
valid = False valid = False
for param, expected_type, type_string in [ if batch.endpoint == "/v1/chat/completions":
("model", str, "a string"), required_params = [
# messages is specific to /v1/chat/completions ("model", str, "a string"),
# we could skip validating messages here and let inference fail. however, # messages is specific to /v1/chat/completions
# that would be a very expensive way to find out messages is wrong. # we could skip validating messages here and let inference fail. however,
("messages", list, "an array"), # TODO: allow messages to be a string? # that would be a very expensive way to find out messages is wrong.
]: ("messages", list, "an array"), # TODO: allow messages to be a string?
]
else: # /v1/completions
required_params = [
("model", str, "a string"),
("prompt", str, "a string"), # TODO: allow prompt to be a list of strings??
]
for param, expected_type, type_string in required_params:
if param not in body: if param not in body:
errors.append( errors.append(
BatchError( BatchError(
@ -591,20 +599,37 @@ class ReferenceBatchesImpl(Batches):
try: try:
# TODO(SECURITY): review body for security issues # TODO(SECURITY): review body for security issues
request.body["messages"] = [convert_to_openai_message_param(msg) for msg in request.body["messages"]] if request.url == "/v1/chat/completions":
chat_response = await self.inference_api.openai_chat_completion(**request.body) request.body["messages"] = [convert_to_openai_message_param(msg) for msg in request.body["messages"]]
chat_response = await self.inference_api.openai_chat_completion(**request.body)
# this is for mypy, we don't allow streaming so we'll get the right type # this is for mypy, we don't allow streaming so we'll get the right type
assert hasattr(chat_response, "model_dump_json"), "Chat response must have model_dump_json method" assert hasattr(chat_response, "model_dump_json"), "Chat response must have model_dump_json method"
return { return {
"id": request_id, "id": request_id,
"custom_id": request.custom_id, "custom_id": request.custom_id,
"response": { "response": {
"status_code": 200, "status_code": 200,
"request_id": request_id, # TODO: should this be different? "request_id": request_id, # TODO: should this be different?
"body": chat_response.model_dump_json(), "body": chat_response.model_dump_json(),
}, },
} }
else: # /v1/completions
completion_response = await self.inference_api.openai_completion(**request.body)
# this is for mypy, we don't allow streaming so we'll get the right type
assert hasattr(completion_response, "model_dump_json"), (
"Completion response must have model_dump_json method"
)
return {
"id": request_id,
"custom_id": request.custom_id,
"response": {
"status_code": 200,
"request_id": request_id,
"body": completion_response.model_dump_json(),
},
}
except Exception as e: except Exception as e:
logger.info(f"Error processing request {request.custom_id} in batch {batch_id}: {e}") logger.info(f"Error processing request {request.custom_id} in batch {batch_id}: {e}")
return { return {

View file

@ -14,6 +14,6 @@ from .config import RagToolRuntimeConfig
async def get_provider_impl(config: RagToolRuntimeConfig, deps: dict[Api, Any]): async def get_provider_impl(config: RagToolRuntimeConfig, deps: dict[Api, Any]):
from .memory import MemoryToolRuntimeImpl from .memory import MemoryToolRuntimeImpl
impl = MemoryToolRuntimeImpl(config, deps[Api.vector_io], deps[Api.inference]) impl = MemoryToolRuntimeImpl(config, deps[Api.vector_io], deps[Api.inference], deps[Api.files])
await impl.initialize() await impl.initialize()
return impl return impl

View file

@ -8,7 +8,7 @@
from jinja2 import Template from jinja2 import Template
from llama_stack.apis.common.content_types import InterleavedContent from llama_stack.apis.common.content_types import InterleavedContent
from llama_stack.apis.inference import UserMessage from llama_stack.apis.inference import OpenAIUserMessageParam
from llama_stack.apis.tools.rag_tool import ( from llama_stack.apis.tools.rag_tool import (
DefaultRAGQueryGeneratorConfig, DefaultRAGQueryGeneratorConfig,
LLMRAGQueryGeneratorConfig, LLMRAGQueryGeneratorConfig,
@ -61,16 +61,16 @@ async def llm_rag_query_generator(
messages = [interleaved_content_as_str(content)] messages = [interleaved_content_as_str(content)]
template = Template(config.template) template = Template(config.template)
content = template.render({"messages": messages}) rendered_content: str = template.render({"messages": messages})
model = config.model model = config.model
message = UserMessage(content=content) message = OpenAIUserMessageParam(content=rendered_content)
response = await inference_api.chat_completion( response = await inference_api.openai_chat_completion(
model_id=model, model=model,
messages=[message], messages=[message],
stream=False, stream=False,
) )
query = response.completion_message.content query = response.choices[0].message.content
return query return query

View file

@ -5,10 +5,15 @@
# the root directory of this source tree. # the root directory of this source tree.
import asyncio import asyncio
import base64
import io
import mimetypes
import secrets import secrets
import string import string
from typing import Any from typing import Any
import httpx
from fastapi import UploadFile
from pydantic import TypeAdapter from pydantic import TypeAdapter
from llama_stack.apis.common.content_types import ( from llama_stack.apis.common.content_types import (
@ -17,6 +22,7 @@ from llama_stack.apis.common.content_types import (
InterleavedContentItem, InterleavedContentItem,
TextContentItem, TextContentItem,
) )
from llama_stack.apis.files import Files, OpenAIFilePurpose
from llama_stack.apis.inference import Inference from llama_stack.apis.inference import Inference
from llama_stack.apis.tools import ( from llama_stack.apis.tools import (
ListToolDefsResponse, ListToolDefsResponse,
@ -30,14 +36,16 @@ from llama_stack.apis.tools import (
ToolParameter, ToolParameter,
ToolRuntime, ToolRuntime,
) )
from llama_stack.apis.vector_io import QueryChunksResponse, VectorIO from llama_stack.apis.vector_io import (
QueryChunksResponse,
VectorIO,
VectorStoreChunkingStrategyStatic,
VectorStoreChunkingStrategyStaticConfig,
)
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate from llama_stack.providers.datatypes import ToolGroupsProtocolPrivate
from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
from llama_stack.providers.utils.memory.vector_store import ( from llama_stack.providers.utils.memory.vector_store import parse_data_url
content_from_doc,
make_overlapped_chunks,
)
from .config import RagToolRuntimeConfig from .config import RagToolRuntimeConfig
from .context_retriever import generate_rag_query from .context_retriever import generate_rag_query
@ -49,16 +57,59 @@ def make_random_string(length: int = 8):
return "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(length)) return "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(length))
async def raw_data_from_doc(doc: RAGDocument) -> tuple[bytes, str]:
"""Get raw binary data and mime type from a RAGDocument for file upload."""
if isinstance(doc.content, URL):
if doc.content.uri.startswith("data:"):
parts = parse_data_url(doc.content.uri)
mime_type = parts["mimetype"]
data = parts["data"]
if parts["is_base64"]:
file_data = base64.b64decode(data)
else:
file_data = data.encode("utf-8")
return file_data, mime_type
else:
async with httpx.AsyncClient() as client:
r = await client.get(doc.content.uri)
r.raise_for_status()
mime_type = r.headers.get("content-type", "application/octet-stream")
return r.content, mime_type
else:
if isinstance(doc.content, str):
content_str = doc.content
else:
content_str = interleaved_content_as_str(doc.content)
if content_str.startswith("data:"):
parts = parse_data_url(content_str)
mime_type = parts["mimetype"]
data = parts["data"]
if parts["is_base64"]:
file_data = base64.b64decode(data)
else:
file_data = data.encode("utf-8")
return file_data, mime_type
else:
return content_str.encode("utf-8"), "text/plain"
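A small sketch of the expected behavior, assuming RAGDocument accepts these constructor fields:

doc = RAGDocument(document_id="d1", content=URL(uri="data:text/plain;base64,aGVsbG8="))
data, mime = await raw_data_from_doc(doc)
assert data == b"hello" and mime == "text/plain"
# Plain strings fall through to the final branch and default to text/plain.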
class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRuntime): class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRuntime):
def __init__( def __init__(
self, self,
config: RagToolRuntimeConfig, config: RagToolRuntimeConfig,
vector_io_api: VectorIO, vector_io_api: VectorIO,
inference_api: Inference, inference_api: Inference,
files_api: Files,
): ):
self.config = config self.config = config
self.vector_io_api = vector_io_api self.vector_io_api = vector_io_api
self.inference_api = inference_api self.inference_api = inference_api
self.files_api = files_api
async def initialize(self): async def initialize(self):
pass pass
@ -78,27 +129,56 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
vector_db_id: str, vector_db_id: str,
chunk_size_in_tokens: int = 512, chunk_size_in_tokens: int = 512,
) -> None: ) -> None:
chunks = [] if not documents:
for doc in documents:
content = await content_from_doc(doc)
# TODO: we should add enrichment here as URLs won't be added to the metadata by default
chunks.extend(
make_overlapped_chunks(
doc.document_id,
content,
chunk_size_in_tokens,
chunk_size_in_tokens // 4,
doc.metadata,
)
)
if not chunks:
return return
await self.vector_io_api.insert_chunks( for doc in documents:
chunks=chunks, try:
vector_db_id=vector_db_id, try:
) file_data, mime_type = await raw_data_from_doc(doc)
except Exception as e:
log.error(f"Failed to extract content from document {doc.document_id}: {e}")
continue
file_extension = mimetypes.guess_extension(mime_type) or ".txt"
filename = doc.metadata.get("filename", f"{doc.document_id}{file_extension}")
file_obj = io.BytesIO(file_data)
file_obj.name = filename
upload_file = UploadFile(file=file_obj, filename=filename)
try:
created_file = await self.files_api.openai_upload_file(
file=upload_file, purpose=OpenAIFilePurpose.ASSISTANTS
)
except Exception as e:
log.error(f"Failed to upload file for document {doc.document_id}: {e}")
continue
chunking_strategy = VectorStoreChunkingStrategyStatic(
static=VectorStoreChunkingStrategyStaticConfig(
max_chunk_size_tokens=chunk_size_in_tokens,
chunk_overlap_tokens=chunk_size_in_tokens // 4,
)
)
try:
await self.vector_io_api.openai_attach_file_to_vector_store(
vector_store_id=vector_db_id,
file_id=created_file.id,
attributes=doc.metadata,
chunking_strategy=chunking_strategy,
)
except Exception as e:
log.error(
f"Failed to attach file {created_file.id} to vector store {vector_db_id} for document {doc.document_id}: {e}"
)
continue
except Exception as e:
log.error(f"Unexpected error processing document {doc.document_id}: {e}")
continue
async def query( async def query(
self, self,
@ -131,8 +211,18 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
for vector_db_id in vector_db_ids for vector_db_id in vector_db_ids
] ]
results: list[QueryChunksResponse] = await asyncio.gather(*tasks) results: list[QueryChunksResponse] = await asyncio.gather(*tasks)
chunks = [c for r in results for c in r.chunks]
scores = [s for r in results for s in r.scores] chunks = []
scores = []
for vector_db_id, result in zip(vector_db_ids, results, strict=False):
for chunk, score in zip(result.chunks, result.scores, strict=False):
if not hasattr(chunk, "metadata") or chunk.metadata is None:
chunk.metadata = {}
chunk.metadata["vector_db_id"] = vector_db_id
chunks.append(chunk)
scores.append(score)
if not chunks: if not chunks:
return RAGQueryResult(content=None) return RAGQueryResult(content=None)
@ -167,6 +257,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
metadata_keys_to_exclude_from_context = [ metadata_keys_to_exclude_from_context = [
"token_count", "token_count",
"metadata_token_count", "metadata_token_count",
"vector_db_id",
] ]
metadata_for_context = {} metadata_for_context = {}
for k in chunk_metadata_keys_to_include_from_context: for k in chunk_metadata_keys_to_include_from_context:
@ -191,6 +282,7 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
"document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]], "document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]],
"chunks": [c.content for c in chunks[: len(picked)]], "chunks": [c.content for c in chunks[: len(picked)]],
"scores": scores[: len(picked)], "scores": scores[: len(picked)],
"vector_db_ids": [c.metadata["vector_db_id"] for c in chunks[: len(picked)]],
}, },
) )
@ -226,7 +318,6 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
if query_config: if query_config:
query_config = TypeAdapter(RAGQueryConfig).validate_python(query_config) query_config = TypeAdapter(RAGQueryConfig).validate_python(query_config)
else: else:
# handle someone passing an empty dict
query_config = RAGQueryConfig() query_config = RAGQueryConfig()
query = kwargs["query"] query = kwargs["query"]
@ -237,6 +328,6 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
) )
return ToolInvocationResult( return ToolInvocationResult(
content=result.content, content=result.content or [],
metadata=result.metadata, metadata=result.metadata,
) )

View file

@ -30,11 +30,11 @@ from llama_stack.providers.utils.kvstore.api import KVStore
from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
from llama_stack.providers.utils.memory.vector_store import ( from llama_stack.providers.utils.memory.vector_store import (
RERANKER_TYPE_RRF, RERANKER_TYPE_RRF,
RERANKER_TYPE_WEIGHTED,
ChunkForDeletion, ChunkForDeletion,
EmbeddingIndex, EmbeddingIndex,
VectorDBWithIndex, VectorDBWithIndex,
) )
from llama_stack.providers.utils.vector_io.vector_utils import WeightedInMemoryAggregator
logger = get_logger(name=__name__, category="vector_io") logger = get_logger(name=__name__, category="vector_io")
@ -66,59 +66,6 @@ def _create_sqlite_connection(db_path):
return connection return connection
def _normalize_scores(scores: dict[str, float]) -> dict[str, float]:
"""Normalize scores to [0,1] range using min-max normalization."""
if not scores:
return {}
min_score = min(scores.values())
max_score = max(scores.values())
score_range = max_score - min_score
if score_range > 0:
return {doc_id: (score - min_score) / score_range for doc_id, score in scores.items()}
return dict.fromkeys(scores, 1.0)
def _weighted_rerank(
vector_scores: dict[str, float],
keyword_scores: dict[str, float],
alpha: float = 0.5,
) -> dict[str, float]:
"""ReRanker that uses weighted average of scores."""
all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
normalized_vector_scores = _normalize_scores(vector_scores)
normalized_keyword_scores = _normalize_scores(keyword_scores)
return {
doc_id: (alpha * normalized_keyword_scores.get(doc_id, 0.0))
+ ((1 - alpha) * normalized_vector_scores.get(doc_id, 0.0))
for doc_id in all_ids
}
def _rrf_rerank(
vector_scores: dict[str, float],
keyword_scores: dict[str, float],
impact_factor: float = 60.0,
) -> dict[str, float]:
"""ReRanker that uses Reciprocal Rank Fusion."""
# Convert scores to ranks
vector_ranks = {
doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(vector_scores.items(), key=lambda x: x[1], reverse=True))
}
keyword_ranks = {
doc_id: i + 1 for i, (doc_id, _) in enumerate(sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True))
}
all_ids = set(vector_scores.keys()) | set(keyword_scores.keys())
rrf_scores = {}
for doc_id in all_ids:
vector_rank = vector_ranks.get(doc_id, float("inf"))
keyword_rank = keyword_ranks.get(doc_id, float("inf"))
# RRF formula: score = 1/(k + r) where k is impact_factor and r is the rank
rrf_scores[doc_id] = (1.0 / (impact_factor + vector_rank)) + (1.0 / (impact_factor + keyword_rank))
return rrf_scores
def _make_sql_identifier(name: str) -> str: def _make_sql_identifier(name: str) -> str:
return re.sub(r"[^a-zA-Z0-9_]", "_", name) return re.sub(r"[^a-zA-Z0-9_]", "_", name)
@ -398,14 +345,10 @@ class SQLiteVecIndex(EmbeddingIndex):
for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False) for chunk, score in zip(keyword_response.chunks, keyword_response.scores, strict=False)
} }
# Combine scores using the specified reranker # Combine scores using the reranking utility
if reranker_type == RERANKER_TYPE_WEIGHTED: combined_scores = WeightedInMemoryAggregator.combine_search_results(
alpha = reranker_params.get("alpha", 0.5) vector_scores, keyword_scores, reranker_type, reranker_params
combined_scores = _weighted_rerank(vector_scores, keyword_scores, alpha) )
else:
# Default to RRF for None, RRF, or any unknown types
impact_factor = reranker_params.get("impact_factor", 60.0)
combined_scores = _rrf_rerank(vector_scores, keyword_scores, impact_factor)
# Sort by combined score and get top k results # Sort by combined score and get top k results
sorted_items = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True) sorted_items = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
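
For reference, the behavior the deleted helpers implemented and WeightedInMemoryAggregator.combine_search_results now centralizes, reduced to a runnable sketch of the RRF path (illustrative; the shared utility may differ in detail):

# RRF: score(doc) = 1/(k + vector_rank) + 1/(k + keyword_rank), k = impact_factor.
def rrf_rerank(vector_scores, keyword_scores, impact_factor=60.0):
    def ranks(scores):
        ordered = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return {doc_id: i + 1 for i, (doc_id, _) in enumerate(ordered)}

    vector_ranks, keyword_ranks = ranks(vector_scores), ranks(keyword_scores)
    all_ids = set(vector_scores) | set(keyword_scores)
    return {
        doc_id: 1.0 / (impact_factor + vector_ranks.get(doc_id, float("inf")))
        + 1.0 / (impact_factor + keyword_ranks.get(doc_id, float("inf")))
        for doc_id in all_ids
    }

combined = rrf_rerank({"a": 0.9, "b": 0.5}, {"b": 7.0, "c": 3.0})
# "b" appears in both result sets (ranks 2 and 1), so it wins:
assert max(combined, key=combined.get) == "b"

Documents missing from one side get rank infinity, so that term contributes 0 and the doc is ranked on its single appearance alone.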

View file

@ -13,7 +13,7 @@ def available_providers() -> list[ProviderSpec]:
InlineProviderSpec( InlineProviderSpec(
api=Api.batches, api=Api.batches,
provider_type="inline::reference", provider_type="inline::reference",
pip_packages=["openai"], pip_packages=[],
module="llama_stack.providers.inline.batches.reference", module="llama_stack.providers.inline.batches.reference",
config_class="llama_stack.providers.inline.batches.reference.config.ReferenceBatchesImplConfig", config_class="llama_stack.providers.inline.batches.reference.config.ReferenceBatchesImplConfig",
api_dependencies=[ api_dependencies=[

View file

@ -30,7 +30,7 @@ def available_providers() -> list[ProviderSpec]:
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="huggingface", adapter_type="huggingface",
pip_packages=[ pip_packages=[
"datasets", "datasets>=4.0.0",
], ],
module="llama_stack.providers.remote.datasetio.huggingface", module="llama_stack.providers.remote.datasetio.huggingface",
config_class="llama_stack.providers.remote.datasetio.huggingface.HuggingfaceDatasetIOConfig", config_class="llama_stack.providers.remote.datasetio.huggingface.HuggingfaceDatasetIOConfig",
@ -42,7 +42,7 @@ def available_providers() -> list[ProviderSpec]:
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="nvidia", adapter_type="nvidia",
pip_packages=[ pip_packages=[
"datasets", "datasets>=4.0.0",
], ],
module="llama_stack.providers.remote.datasetio.nvidia", module="llama_stack.providers.remote.datasetio.nvidia",
config_class="llama_stack.providers.remote.datasetio.nvidia.NvidiaDatasetIOConfig", config_class="llama_stack.providers.remote.datasetio.nvidia.NvidiaDatasetIOConfig",

View file

@ -75,7 +75,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="vllm", adapter_type="vllm",
pip_packages=["openai"], pip_packages=[],
module="llama_stack.providers.remote.inference.vllm", module="llama_stack.providers.remote.inference.vllm",
config_class="llama_stack.providers.remote.inference.vllm.VLLMInferenceAdapterConfig", config_class="llama_stack.providers.remote.inference.vllm.VLLMInferenceAdapterConfig",
description="Remote vLLM inference provider for connecting to vLLM servers.", description="Remote vLLM inference provider for connecting to vLLM servers.",
@ -116,7 +116,7 @@ def available_providers() -> list[ProviderSpec]:
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="fireworks", adapter_type="fireworks",
pip_packages=[ pip_packages=[
"fireworks-ai<=0.18.0", "fireworks-ai<=0.17.16",
], ],
module="llama_stack.providers.remote.inference.fireworks", module="llama_stack.providers.remote.inference.fireworks",
config_class="llama_stack.providers.remote.inference.fireworks.FireworksImplConfig", config_class="llama_stack.providers.remote.inference.fireworks.FireworksImplConfig",
@ -151,9 +151,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="databricks", adapter_type="databricks",
pip_packages=[ pip_packages=[],
"openai",
],
module="llama_stack.providers.remote.inference.databricks", module="llama_stack.providers.remote.inference.databricks",
config_class="llama_stack.providers.remote.inference.databricks.DatabricksImplConfig", config_class="llama_stack.providers.remote.inference.databricks.DatabricksImplConfig",
description="Databricks inference provider for running models on Databricks' unified analytics platform.", description="Databricks inference provider for running models on Databricks' unified analytics platform.",
@ -163,9 +161,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="nvidia", adapter_type="nvidia",
pip_packages=[ pip_packages=[],
"openai",
],
module="llama_stack.providers.remote.inference.nvidia", module="llama_stack.providers.remote.inference.nvidia",
config_class="llama_stack.providers.remote.inference.nvidia.NVIDIAConfig", config_class="llama_stack.providers.remote.inference.nvidia.NVIDIAConfig",
description="NVIDIA inference provider for accessing NVIDIA NIM models and AI services.", description="NVIDIA inference provider for accessing NVIDIA NIM models and AI services.",
@ -175,7 +171,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="runpod", adapter_type="runpod",
pip_packages=["openai"], pip_packages=[],
module="llama_stack.providers.remote.inference.runpod", module="llama_stack.providers.remote.inference.runpod",
config_class="llama_stack.providers.remote.inference.runpod.RunpodImplConfig", config_class="llama_stack.providers.remote.inference.runpod.RunpodImplConfig",
description="RunPod inference provider for running models on RunPod's cloud GPU platform.", description="RunPod inference provider for running models on RunPod's cloud GPU platform.",
@ -292,11 +288,26 @@ Available Models:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="watsonx", adapter_type="watsonx",
pip_packages=["ibm_watson_machine_learning"], pip_packages=["ibm_watsonx_ai"],
module="llama_stack.providers.remote.inference.watsonx", module="llama_stack.providers.remote.inference.watsonx",
config_class="llama_stack.providers.remote.inference.watsonx.WatsonXConfig", config_class="llama_stack.providers.remote.inference.watsonx.WatsonXConfig",
provider_data_validator="llama_stack.providers.remote.inference.watsonx.WatsonXProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.watsonx.WatsonXProviderDataValidator",
description="IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform.", description="IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform.",
), ),
), ),
remote_provider_spec(
api=Api.inference,
adapter=AdapterSpec(
adapter_type="azure",
pip_packages=["litellm"],
module="llama_stack.providers.remote.inference.azure",
config_class="llama_stack.providers.remote.inference.azure.AzureConfig",
provider_data_validator="llama_stack.providers.remote.inference.azure.config.AzureProviderDataValidator",
description="""
Azure OpenAI inference provider for accessing GPT models and other Azure services.
Provider documentation
https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
""",
),
),
] ]

View file

@ -48,7 +48,7 @@ def available_providers() -> list[ProviderSpec]:
InlineProviderSpec( InlineProviderSpec(
api=Api.post_training, api=Api.post_training,
provider_type="inline::huggingface-gpu", provider_type="inline::huggingface-gpu",
pip_packages=["trl", "transformers", "peft", "datasets", "torch"], pip_packages=["trl", "transformers", "peft", "datasets>=4.0.0", "torch"],
module="llama_stack.providers.inline.post_training.huggingface", module="llama_stack.providers.inline.post_training.huggingface",
config_class="llama_stack.providers.inline.post_training.huggingface.HuggingFacePostTrainingConfig", config_class="llama_stack.providers.inline.post_training.huggingface.HuggingFacePostTrainingConfig",
api_dependencies=[ api_dependencies=[

View file

@ -38,7 +38,7 @@ def available_providers() -> list[ProviderSpec]:
InlineProviderSpec( InlineProviderSpec(
api=Api.scoring, api=Api.scoring,
provider_type="inline::braintrust", provider_type="inline::braintrust",
pip_packages=["autoevals", "openai"], pip_packages=["autoevals"],
module="llama_stack.providers.inline.scoring.braintrust", module="llama_stack.providers.inline.scoring.braintrust",
config_class="llama_stack.providers.inline.scoring.braintrust.BraintrustScoringConfig", config_class="llama_stack.providers.inline.scoring.braintrust.BraintrustScoringConfig",
api_dependencies=[ api_dependencies=[

View file

@ -32,7 +32,7 @@ def available_providers() -> list[ProviderSpec]:
], ],
module="llama_stack.providers.inline.tool_runtime.rag", module="llama_stack.providers.inline.tool_runtime.rag",
config_class="llama_stack.providers.inline.tool_runtime.rag.config.RagToolRuntimeConfig", config_class="llama_stack.providers.inline.tool_runtime.rag.config.RagToolRuntimeConfig",
api_dependencies=[Api.vector_io, Api.inference], api_dependencies=[Api.vector_io, Api.inference, Api.files],
description="RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search.", description="RAG (Retrieval-Augmented Generation) tool runtime for document ingestion, chunking, and semantic search.",
), ),
remote_provider_spec( remote_provider_spec(

View file

@ -5,12 +5,13 @@
# the root directory of this source tree. # the root directory of this source tree.
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import AnthropicConfig from .config import AnthropicConfig
from .models import MODEL_ENTRIES from .models import MODEL_ENTRIES
class AnthropicInferenceAdapter(LiteLLMOpenAIMixin): class AnthropicInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
def __init__(self, config: AnthropicConfig) -> None: def __init__(self, config: AnthropicConfig) -> None:
LiteLLMOpenAIMixin.__init__( LiteLLMOpenAIMixin.__init__(
self, self,
@ -26,3 +27,8 @@ class AnthropicInferenceAdapter(LiteLLMOpenAIMixin):
async def shutdown(self) -> None: async def shutdown(self) -> None:
await super().shutdown() await super().shutdown()
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self):
return "https://api.anthropic.com/v1"

View file

@ -0,0 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .config import AzureConfig
async def get_adapter_impl(config: AzureConfig, _deps):
from .azure import AzureInferenceAdapter
impl = AzureInferenceAdapter(config)
await impl.initialize()
return impl

View file

@ -0,0 +1,64 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any
from urllib.parse import urljoin
from llama_stack.apis.inference import ChatCompletionRequest
from llama_stack.providers.utils.inference.litellm_openai_mixin import (
LiteLLMOpenAIMixin,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import AzureConfig
from .models import MODEL_ENTRIES
class AzureInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
def __init__(self, config: AzureConfig) -> None:
LiteLLMOpenAIMixin.__init__(
self,
MODEL_ENTRIES,
litellm_provider_name="azure",
api_key_from_config=config.api_key.get_secret_value(),
provider_data_api_key_field="azure_api_key",
openai_compat_api_base=str(config.api_base),
)
self.config = config
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self) -> str:
"""
Get the Azure API base URL.
Returns the Azure API base URL from the configuration.
"""
return urljoin(str(self.config.api_base), "/openai/v1")
async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]:
# Get base parameters from parent
params = await super()._get_params(request)
# Add Azure specific parameters
provider_data = self.get_request_provider_data()
if provider_data:
if getattr(provider_data, "azure_api_key", None):
params["api_key"] = provider_data.azure_api_key
if getattr(provider_data, "azure_api_base", None):
params["api_base"] = provider_data.azure_api_base
if getattr(provider_data, "azure_api_version", None):
params["api_version"] = provider_data.azure_api_version
if getattr(provider_data, "azure_api_type", None):
params["api_type"] = provider_data.azure_api_type
else:
params["api_key"] = self.config.api_key.get_secret_value()
params["api_base"] = str(self.config.api_base)
params["api_version"] = self.config.api_version
params["api_type"] = self.config.api_type
return params

View file

@ -0,0 +1,63 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
from typing import Any
from pydantic import BaseModel, Field, HttpUrl, SecretStr
from llama_stack.schema_utils import json_schema_type
class AzureProviderDataValidator(BaseModel):
azure_api_key: SecretStr = Field(
description="Azure API key for Azure",
)
azure_api_base: HttpUrl = Field(
description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com)",
)
azure_api_version: str | None = Field(
default=None,
description="Azure API version for Azure (e.g., 2024-06-01)",
)
azure_api_type: str | None = Field(
default="azure",
description="Azure API type for Azure (e.g., azure)",
)
@json_schema_type
class AzureConfig(BaseModel):
api_key: SecretStr = Field(
description="Azure API key for Azure",
)
api_base: HttpUrl = Field(
description="Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com)",
)
api_version: str | None = Field(
default_factory=lambda: os.getenv("AZURE_API_VERSION"),
description="Azure API version for Azure (e.g., 2024-12-01-preview)",
)
api_type: str | None = Field(
default_factory=lambda: os.getenv("AZURE_API_TYPE", "azure"),
description="Azure API type for Azure (e.g., azure)",
)
@classmethod
def sample_run_config(
cls,
api_key: str = "${env.AZURE_API_KEY:=}",
api_base: str = "${env.AZURE_API_BASE:=}",
api_version: str = "${env.AZURE_API_VERSION:=}",
api_type: str = "${env.AZURE_API_TYPE:=}",
**kwargs,
) -> dict[str, Any]:
return {
"api_key": api_key,
"api_base": api_base,
"api_version": api_version,
"api_type": api_type,
}

View file

@ -0,0 +1,28 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.providers.utils.inference.model_registry import (
ProviderModelEntry,
)
# https://learn.microsoft.com/en-us/azure/ai-foundry/openai/concepts/models?tabs=global-standard%2Cstandard-chat-completions
LLM_MODEL_IDS = [
"gpt-5",
"gpt-5-mini",
"gpt-5-nano",
"gpt-5-chat",
"o1",
"o1-mini",
"o3-mini",
"o4-mini",
"gpt-4.1",
"gpt-4.1-mini",
"gpt-4.1-nano",
]
SAFETY_MODELS_ENTRIES = list[ProviderModelEntry]()
MODEL_ENTRIES = [ProviderModelEntry(provider_model_id=m) for m in LLM_MODEL_IDS] + SAFETY_MODELS_ENTRIES

View file

@ -53,6 +53,43 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
from .models import MODEL_ENTRIES from .models import MODEL_ENTRIES
REGION_PREFIX_MAP = {
"us": "us.",
"eu": "eu.",
"ap": "ap.",
}
def _get_region_prefix(region: str | None) -> str:
# AWS requires region prefixes for inference profiles
if region is None:
return "us." # default to US when we don't know
# Handle case insensitive region matching
region_lower = region.lower()
for prefix in REGION_PREFIX_MAP:
if region_lower.startswith(f"{prefix}-"):
return REGION_PREFIX_MAP[prefix]
# Fallback to US for anything we don't recognize
return "us."
def _to_inference_profile_id(model_id: str, region: str | None = None) -> str:
# Return ARNs unchanged
if model_id.startswith("arn:"):
return model_id
# Return inference profile IDs that already have regional prefixes
if any(model_id.startswith(p) for p in REGION_PREFIX_MAP.values()):
return model_id
# Default to US East when no region is provided
if region is None:
region = "us-east-1"
return _get_region_prefix(region) + model_id
class BedrockInferenceAdapter( class BedrockInferenceAdapter(
ModelRegistryHelper, ModelRegistryHelper,
@ -166,8 +203,13 @@ class BedrockInferenceAdapter(
options["repetition_penalty"] = sampling_params.repetition_penalty options["repetition_penalty"] = sampling_params.repetition_penalty
prompt = await chat_completion_request_to_prompt(request, self.get_llama_model(request.model)) prompt = await chat_completion_request_to_prompt(request, self.get_llama_model(request.model))
# Convert foundation model ID to inference profile ID
region_name = self.client.meta.region_name
inference_profile_id = _to_inference_profile_id(bedrock_model, region_name)
return { return {
"modelId": bedrock_model, "modelId": inference_profile_id,
"body": json.dumps( "body": json.dumps(
{ {
"prompt": prompt, "prompt": prompt,
@ -185,6 +227,11 @@ class BedrockInferenceAdapter(
task_type: EmbeddingTaskType | None = None, task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse: ) -> EmbeddingsResponse:
model = await self.model_store.get_model(model_id) model = await self.model_store.get_model(model_id)
# Convert foundation model ID to inference profile ID
region_name = self.client.meta.region_name
inference_profile_id = _to_inference_profile_id(model.provider_resource_id, region_name)
embeddings = [] embeddings = []
for content in contents: for content in contents:
assert not content_has_media(content), "Bedrock does not support media for embeddings" assert not content_has_media(content), "Bedrock does not support media for embeddings"
@ -193,7 +240,7 @@ class BedrockInferenceAdapter(
body = json.dumps(input_body) body = json.dumps(input_body)
response = self.client.invoke_model( response = self.client.invoke_model(
body=body, body=body,
modelId=model.provider_resource_id, modelId=inference_profile_id,
accept="application/json", accept="application/json",
contentType="application/json", contentType="application/json",
) )
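
Worked examples of the conversion, assuming _to_inference_profile_id as defined above (the model IDs below are illustrative):

arn = "arn:aws:bedrock:us-east-1:123456789012:inference-profile/us.meta.llama3-1-8b-instruct-v1:0"
assert _to_inference_profile_id(arn) == arn  # ARNs pass through unchanged
assert _to_inference_profile_id("eu.meta.llama3-1-8b-instruct-v1:0") == "eu.meta.llama3-1-8b-instruct-v1:0"  # already prefixed
assert _to_inference_profile_id("meta.llama3-1-8b-instruct-v1:0", "eu-west-1") == "eu.meta.llama3-1-8b-instruct-v1:0"
assert _to_inference_profile_id("meta.llama3-1-8b-instruct-v1:0", "AP-SOUTHEAST-2") == "ap.meta.llama3-1-8b-instruct-v1:0"  # matching is case-insensitive
assert _to_inference_profile_id("meta.llama3-1-8b-instruct-v1:0") == "us.meta.llama3-1-8b-instruct-v1:0"  # no region defaults to US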

View file

@ -5,12 +5,13 @@
# the root directory of this source tree. # the root directory of this source tree.
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import GeminiConfig from .config import GeminiConfig
from .models import MODEL_ENTRIES from .models import MODEL_ENTRIES
class GeminiInferenceAdapter(LiteLLMOpenAIMixin): class GeminiInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
def __init__(self, config: GeminiConfig) -> None: def __init__(self, config: GeminiConfig) -> None:
LiteLLMOpenAIMixin.__init__( LiteLLMOpenAIMixin.__init__(
self, self,
@ -21,6 +22,11 @@ class GeminiInferenceAdapter(LiteLLMOpenAIMixin):
) )
self.config = config self.config = config
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self):
return "https://generativelanguage.googleapis.com/v1beta/openai/"
async def initialize(self) -> None: async def initialize(self) -> None:
await super().initialize() await super().initialize()

View file

@ -4,30 +4,15 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from collections.abc import AsyncIterator
from typing import Any
from openai import AsyncOpenAI
from llama_stack.apis.inference import (
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAIChoiceDelta,
OpenAIChunkChoice,
OpenAIMessageParam,
OpenAIResponseFormatParam,
OpenAISystemMessageParam,
)
from llama_stack.providers.remote.inference.groq.config import GroqConfig from llama_stack.providers.remote.inference.groq.config import GroqConfig
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_compat import ( from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
prepare_openai_completion_params,
)
from .models import MODEL_ENTRIES from .models import MODEL_ENTRIES
class GroqInferenceAdapter(LiteLLMOpenAIMixin): class GroqInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
_config: GroqConfig _config: GroqConfig
def __init__(self, config: GroqConfig): def __init__(self, config: GroqConfig):
@ -40,122 +25,14 @@ class GroqInferenceAdapter(LiteLLMOpenAIMixin):
) )
self.config = config self.config = config
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self) -> str:
return f"{self.config.url}/openai/v1"
async def initialize(self): async def initialize(self):
await super().initialize() await super().initialize()
async def shutdown(self): async def shutdown(self):
await super().shutdown() await super().shutdown()
def _get_openai_client(self) -> AsyncOpenAI:
return AsyncOpenAI(
base_url=f"{self.config.url}/openai/v1",
api_key=self.get_api_key(),
)
async def openai_chat_completion(
self,
model: str,
messages: list[OpenAIMessageParam],
frequency_penalty: float | None = None,
function_call: str | dict[str, Any] | None = None,
functions: list[dict[str, Any]] | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_completion_tokens: int | None = None,
max_tokens: int | None = None,
n: int | None = None,
parallel_tool_calls: bool | None = None,
presence_penalty: float | None = None,
response_format: OpenAIResponseFormatParam | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
tool_choice: str | dict[str, Any] | None = None,
tools: list[dict[str, Any]] | None = None,
top_logprobs: int | None = None,
top_p: float | None = None,
user: str | None = None,
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
model_obj = await self.model_store.get_model(model)
# Groq does not support json_schema response format, so we need to convert it to json_object
if response_format and response_format.type == "json_schema":
response_format.type = "json_object"
schema = response_format.json_schema.get("schema", {})
response_format.json_schema = None
json_instructions = f"\nYour response should be a JSON object that matches the following schema: {schema}"
if messages and messages[0].role == "system":
messages[0].content = messages[0].content + json_instructions
else:
messages.insert(0, OpenAISystemMessageParam(content=json_instructions))
# Groq returns a 400 error if tools are provided but none are called
# So, set tool_choice to "required" to attempt to force a call
if tools and (not tool_choice or tool_choice == "auto"):
tool_choice = "required"
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
)
# Groq does not support streaming requests that set response_format
fake_stream = False
if stream and response_format:
params["stream"] = False
fake_stream = True
response = await self._get_openai_client().chat.completions.create(**params)
if fake_stream:
chunk_choices = []
for choice in response.choices:
delta = OpenAIChoiceDelta(
content=choice.message.content,
role=choice.message.role,
tool_calls=choice.message.tool_calls,
)
chunk_choice = OpenAIChunkChoice(
delta=delta,
finish_reason=choice.finish_reason,
index=choice.index,
logprobs=None,
)
chunk_choices.append(chunk_choice)
chunk = OpenAIChatCompletionChunk(
id=response.id,
choices=chunk_choices,
object="chat.completion.chunk",
created=response.created,
model=response.model,
)
async def _fake_stream_generator():
yield chunk
return _fake_stream_generator()
else:
return response
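
The deleted block also carried Groq's "fake stream" workaround (run the request non-streaming, then wrap the result in a one-chunk stream). The trick in miniature, with a stand-in chunk type:

import asyncio
from dataclasses import dataclass

@dataclass
class DemoStreamChunk:  # stand-in for OpenAIChatCompletionChunk
    content: str

async def fake_stream(response_text: str):
    # Yield the complete response as a single chunk so callers that
    # expect an async iterator still work.
    yield DemoStreamChunk(content=response_text)

async def demo():
    async for chunk in fake_stream("hello"):
        print(chunk.content)  # hello

asyncio.run(demo())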

View file

@ -118,10 +118,10 @@ class OllamaInferenceAdapter(
async def initialize(self) -> None: async def initialize(self) -> None:
logger.info(f"checking connectivity to Ollama at `{self.config.url}`...") logger.info(f"checking connectivity to Ollama at `{self.config.url}`...")
health_response = await self.health() r = await self.health()
if health_response["status"] == HealthStatus.ERROR: if r["status"] == HealthStatus.ERROR:
logger.warning( logger.warning(
"Ollama Server is not running, make sure to start it using `ollama serve` in a separate terminal" f"Ollama Server is not running (message: {r['message']}). Make sure to start it using `ollama serve` in a separate terminal"
) )
async def should_refresh_models(self) -> bool: async def should_refresh_models(self) -> bool:
@ -156,7 +156,7 @@ class OllamaInferenceAdapter(
), ),
Model( Model(
identifier="nomic-embed-text", identifier="nomic-embed-text",
provider_resource_id="nomic-embed-text", provider_resource_id="nomic-embed-text:latest",
provider_id=provider_id, provider_id=provider_id,
metadata={ metadata={
"embedding_dimension": 768, "embedding_dimension": 768,

View file

@ -4,13 +4,26 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import SambaNovaImplConfig from .config import SambaNovaImplConfig
from .models import MODEL_ENTRIES from .models import MODEL_ENTRIES
class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin): class SambaNovaInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
"""
SambaNova Inference Adapter for Llama Stack.
Note: The inheritance order is important here. OpenAIMixin must come before
LiteLLMOpenAIMixin to ensure that OpenAIMixin.check_model_availability()
is used instead of LiteLLMOpenAIMixin.check_model_availability().
- OpenAIMixin.check_model_availability() queries the /v1/models endpoint to check if a model exists
- LiteLLMOpenAIMixin.check_model_availability() checks the static registry within LiteLLM
"""
def __init__(self, config: SambaNovaImplConfig): def __init__(self, config: SambaNovaImplConfig):
self.config = config self.config = config
self.environment_available_models = [] self.environment_available_models = []
@ -24,3 +37,14 @@ class SambaNovaInferenceAdapter(LiteLLMOpenAIMixin):
download_images=True, # SambaNova requires base64 image encoding download_images=True, # SambaNova requires base64 image encoding
json_schema_strict=False, # SambaNova doesn't support strict=True yet json_schema_strict=False, # SambaNova doesn't support strict=True yet
) )
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self) -> str:
"""
Get the base URL for OpenAI mixin.
:return: The SambaNova base URL
"""
return self.config.url
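
The MRO note in the docstring can be demonstrated with toy classes (these are not the real mixins):

# Python's MRO searches bases left to right, so with
# (OpenAIMixin, LiteLLMOpenAIMixin) the OpenAIMixin version of any
# shared method wins.
class OpenAIMixinDemo:
    def check_model_availability(self):
        return "queried /v1/models"

class LiteLLMMixinDemo:
    def check_model_availability(self):
        return "checked LiteLLM's static registry"

class AdapterDemo(OpenAIMixinDemo, LiteLLMMixinDemo):
    pass

assert AdapterDemo().check_model_availability() == "queried /v1/models"
print([c.__name__ for c in AdapterDemo.__mro__])
# ['AdapterDemo', 'OpenAIMixinDemo', 'LiteLLMMixinDemo', 'object']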

View file

@ -6,16 +6,20 @@
from typing import Any from typing import Any
import google.auth.transport.requests
from google.auth import default
from llama_stack.apis.inference import ChatCompletionRequest from llama_stack.apis.inference import ChatCompletionRequest
from llama_stack.providers.utils.inference.litellm_openai_mixin import ( from llama_stack.providers.utils.inference.litellm_openai_mixin import (
LiteLLMOpenAIMixin, LiteLLMOpenAIMixin,
) )
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import VertexAIConfig from .config import VertexAIConfig
from .models import MODEL_ENTRIES from .models import MODEL_ENTRIES
class VertexAIInferenceAdapter(LiteLLMOpenAIMixin): class VertexAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
def __init__(self, config: VertexAIConfig) -> None: def __init__(self, config: VertexAIConfig) -> None:
LiteLLMOpenAIMixin.__init__( LiteLLMOpenAIMixin.__init__(
self, self,
@ -27,9 +31,30 @@ class VertexAIInferenceAdapter(LiteLLMOpenAIMixin):
self.config = config self.config = config
def get_api_key(self) -> str: def get_api_key(self) -> str:
# Vertex AI doesn't use API keys, it uses Application Default Credentials """
# Return empty string to let litellm handle authentication via ADC Get an access token for Vertex AI using Application Default Credentials.
return ""
Vertex AI uses ADC instead of API keys. This method obtains an access token
from the default credentials and returns it for use with the OpenAI-compatible client.
"""
try:
# Get default credentials - will read from GOOGLE_APPLICATION_CREDENTIALS
credentials, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
credentials.refresh(google.auth.transport.requests.Request())
return str(credentials.token)
except Exception:
# If we can't get credentials, return empty string to let LiteLLM handle it
# This allows the LiteLLM mixin to work with ADC directly
return ""
def get_base_url(self) -> str:
"""
Get the Vertex AI OpenAI-compatible API base URL.
Returns the Vertex AI OpenAI-compatible endpoint URL.
Source: https://cloud.google.com/vertex-ai/generative-ai/docs/start/openai
"""
return f"https://{self.config.location}-aiplatform.googleapis.com/v1/projects/{self.config.project}/locations/{self.config.location}/endpoints/openapi"
async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]: async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]:
# Get base parameters from parent # Get base parameters from parent

View file

@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import json import json
from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import AsyncGenerator
from typing import Any from typing import Any
import httpx import httpx
@ -38,13 +38,6 @@ from llama_stack.apis.inference import (
LogProbConfig, LogProbConfig,
Message, Message,
ModelStore, ModelStore,
OpenAIChatCompletion,
OpenAICompletion,
OpenAIEmbeddingData,
OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage,
OpenAIMessageParam,
OpenAIResponseFormatParam,
ResponseFormat, ResponseFormat,
SamplingParams, SamplingParams,
TextTruncation, TextTruncation,
@ -71,11 +64,11 @@ from llama_stack.providers.utils.inference.openai_compat import (
convert_message_to_openai_dict, convert_message_to_openai_dict,
convert_tool_call, convert_tool_call,
get_sampling_options, get_sampling_options,
prepare_openai_completion_params,
process_chat_completion_stream_response, process_chat_completion_stream_response,
process_completion_response, process_completion_response,
process_completion_stream_response, process_completion_stream_response,
) )
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import ( from llama_stack.providers.utils.inference.prompt_adapter import (
completion_request_to_prompt, completion_request_to_prompt,
content_has_media, content_has_media,
@ -288,7 +281,7 @@ async def _process_vllm_chat_completion_stream_response(
yield c yield c
class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): class VLLMInferenceAdapter(OpenAIMixin, Inference, ModelsProtocolPrivate):
# automatically set by the resolver when instantiating the provider # automatically set by the resolver when instantiating the provider
__provider_id__: str __provider_id__: str
model_store: ModelStore | None = None model_store: ModelStore | None = None
@ -296,7 +289,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
def __init__(self, config: VLLMInferenceAdapterConfig) -> None: def __init__(self, config: VLLMInferenceAdapterConfig) -> None:
self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries()) self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
self.config = config self.config = config
self.client = None
async def initialize(self) -> None: async def initialize(self) -> None:
if not self.config.url: if not self.config.url:
@ -308,8 +300,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
return self.config.refresh_models return self.config.refresh_models
async def list_models(self) -> list[Model] | None: async def list_models(self) -> list[Model] | None:
self._lazy_initialize_client()
assert self.client is not None # mypy
models = [] models = []
async for m in self.client.models.list(): async for m in self.client.models.list():
model_type = ModelType.llm # unclear how to determine embedding vs. llm models model_type = ModelType.llm # unclear how to determine embedding vs. llm models
@ -340,8 +330,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
HealthResponse: A dictionary containing the health status. HealthResponse: A dictionary containing the health status.
""" """
try: try:
client = self._create_client() if self.client is None else self.client _ = [m async for m in self.client.models.list()] # Ensure the client is initialized
_ = [m async for m in client.models.list()] # Ensure the client is initialized
return HealthResponse(status=HealthStatus.OK) return HealthResponse(status=HealthStatus.OK)
except Exception as e: except Exception as e:
return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}") return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}")
@ -351,19 +340,14 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
raise ValueError("Model store not set") raise ValueError("Model store not set")
return await self.model_store.get_model(model_id) return await self.model_store.get_model(model_id)
def _lazy_initialize_client(self): def get_api_key(self):
if self.client is not None: return self.config.api_token
return
log.info(f"Initializing vLLM client with base_url={self.config.url}") def get_base_url(self):
self.client = self._create_client() return self.config.url
def _create_client(self): def get_extra_client_params(self):
return AsyncOpenAI( return {"http_client": httpx.AsyncClient(verify=self.config.tls_verify)}
base_url=self.config.url,
api_key=self.config.api_token,
http_client=httpx.AsyncClient(verify=self.config.tls_verify),
)
async def completion( async def completion(
self, self,
@ -374,7 +358,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
stream: bool | None = False, stream: bool | None = False,
logprobs: LogProbConfig | None = None, logprobs: LogProbConfig | None = None,
) -> CompletionResponse | AsyncGenerator[CompletionResponseStreamChunk, None]: ) -> CompletionResponse | AsyncGenerator[CompletionResponseStreamChunk, None]:
self._lazy_initialize_client()
if sampling_params is None: if sampling_params is None:
sampling_params = SamplingParams() sampling_params = SamplingParams()
model = await self._get_model(model_id) model = await self._get_model(model_id)
@ -406,7 +389,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
logprobs: LogProbConfig | None = None, logprobs: LogProbConfig | None = None,
tool_config: ToolConfig | None = None, tool_config: ToolConfig | None = None,
) -> ChatCompletionResponse | AsyncGenerator[ChatCompletionResponseStreamChunk, None]: ) -> ChatCompletionResponse | AsyncGenerator[ChatCompletionResponseStreamChunk, None]:
self._lazy_initialize_client()
if sampling_params is None: if sampling_params is None:
sampling_params = SamplingParams() sampling_params = SamplingParams()
model = await self._get_model(model_id) model = await self._get_model(model_id)
@ -479,16 +461,12 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
yield chunk yield chunk
async def register_model(self, model: Model) -> Model: async def register_model(self, model: Model) -> Model:
# register_model is called during Llama Stack initialization, hence we cannot init self.client if not initialized yet.
# self.client should only be created after the initialization is complete to avoid asyncio cross-context errors.
# Changing this may lead to unpredictable behavior.
client = self._create_client() if self.client is None else self.client
try: try:
model = await self.register_helper.register_model(model) model = await self.register_helper.register_model(model)
except ValueError: except ValueError:
pass # Ignore statically unknown model, will check live listing pass # Ignore statically unknown model, will check live listing
try: try:
res = await client.models.list() res = await self.client.models.list()
except APIConnectionError as e: except APIConnectionError as e:
raise ValueError( raise ValueError(
f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL." f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL."
@ -543,8 +521,6 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
output_dimension: int | None = None, output_dimension: int | None = None,
task_type: EmbeddingTaskType | None = None, task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse: ) -> EmbeddingsResponse:
self._lazy_initialize_client()
assert self.client is not None
model = await self._get_model(model_id) model = await self._get_model(model_id)
kwargs = {} kwargs = {}
@ -560,154 +536,3 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
embeddings = [data.embedding for data in response.data] embeddings = [data.embedding for data in response.data]
return EmbeddingsResponse(embeddings=embeddings) return EmbeddingsResponse(embeddings=embeddings)
async def openai_embeddings(
self,
model: str,
input: str | list[str],
encoding_format: str | None = "float",
dimensions: int | None = None,
user: str | None = None,
) -> OpenAIEmbeddingsResponse:
self._lazy_initialize_client()
assert self.client is not None
model_obj = await self._get_model(model)
assert model_obj.model_type == ModelType.embedding
# Convert input to list if it's a string
input_list = [input] if isinstance(input, str) else input
# Call vLLM embeddings endpoint with encoding_format
response = await self.client.embeddings.create(
model=model_obj.provider_resource_id,
input=input_list,
dimensions=dimensions,
encoding_format=encoding_format,
)
# Convert response to OpenAI format
data = [
OpenAIEmbeddingData(
embedding=embedding_data.embedding,
index=i,
)
for i, embedding_data in enumerate(response.data)
]
# Not returning actual token usage since vLLM doesn't provide it
usage = OpenAIEmbeddingUsage(prompt_tokens=-1, total_tokens=-1)
return OpenAIEmbeddingsResponse(
data=data,
model=model_obj.provider_resource_id,
usage=usage,
)
async def openai_completion(
self,
model: str,
prompt: str | list[str] | list[int] | list[list[int]],
best_of: int | None = None,
echo: bool | None = None,
frequency_penalty: float | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_tokens: int | None = None,
n: int | None = None,
presence_penalty: float | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
top_p: float | None = None,
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
) -> OpenAICompletion:
self._lazy_initialize_client()
model_obj = await self._get_model(model)
extra_body: dict[str, Any] = {}
if prompt_logprobs is not None and prompt_logprobs >= 0:
extra_body["prompt_logprobs"] = prompt_logprobs
if guided_choice:
extra_body["guided_choice"] = guided_choice
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
prompt=prompt,
best_of=best_of,
echo=echo,
frequency_penalty=frequency_penalty,
logit_bias=logit_bias,
logprobs=logprobs,
max_tokens=max_tokens,
n=n,
presence_penalty=presence_penalty,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
top_p=top_p,
user=user,
extra_body=extra_body,
)
return await self.client.completions.create(**params) # type: ignore
async def openai_chat_completion(
self,
model: str,
messages: list[OpenAIMessageParam],
frequency_penalty: float | None = None,
function_call: str | dict[str, Any] | None = None,
functions: list[dict[str, Any]] | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_completion_tokens: int | None = None,
max_tokens: int | None = None,
n: int | None = None,
parallel_tool_calls: bool | None = None,
presence_penalty: float | None = None,
response_format: OpenAIResponseFormatParam | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
tool_choice: str | dict[str, Any] | None = None,
tools: list[dict[str, Any]] | None = None,
top_logprobs: int | None = None,
top_p: float | None = None,
user: str | None = None,
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
self._lazy_initialize_client()
model_obj = await self._get_model(model)
params = await prepare_openai_completion_params(
model=model_obj.provider_resource_id,
messages=messages,
frequency_penalty=frequency_penalty,
function_call=function_call,
functions=functions,
logit_bias=logit_bias,
logprobs=logprobs,
max_completion_tokens=max_completion_tokens,
max_tokens=max_tokens,
n=n,
parallel_tool_calls=parallel_tool_calls,
presence_penalty=presence_penalty,
response_format=response_format,
seed=seed,
stop=stop,
stream=stream,
stream_options=stream_options,
temperature=temperature,
tool_choice=tool_choice,
tools=tools,
top_logprobs=top_logprobs,
top_p=top_p,
user=user,
)
return await self.client.chat.completions.create(**params) # type: ignore
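
With all of the hand-rolled OpenAI plumbing gone, the adapter only supplies three hooks. A hedged sketch of how a mixin like OpenAIMixin can assemble its client from them; the real mixin (shown only partially at the end of this diff) may differ:

import httpx
from openai import AsyncOpenAI

class OpenAIMixinSketch:
    def get_api_key(self) -> str: ...
    def get_base_url(self) -> str: ...

    def get_extra_client_params(self) -> dict:
        return {}

    @property
    def client(self) -> AsyncOpenAI:
        # Build the client on demand from the three hooks.
        return AsyncOpenAI(
            base_url=self.get_base_url(),
            api_key=self.get_api_key(),
            **self.get_extra_client_params(),
        )

class VLLMSketch(OpenAIMixinSketch):
    def get_api_key(self): return "fake-token"
    def get_base_url(self): return "http://localhost:8000/v1"
    def get_extra_client_params(self):
        return {"http_client": httpx.AsyncClient(verify=False)}

adapter = VLLMSketch()
assert adapter.client.base_url.host == "localhost"

This also removes the lazy-initialization dance: building the client per access sidesteps the asyncio cross-context issue the old comment warned about.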

View file

@ -7,8 +7,8 @@
from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import AsyncGenerator, AsyncIterator
from typing import Any from typing import Any
from ibm_watson_machine_learning.foundation_models import Model from ibm_watsonx_ai.foundation_models import Model
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from openai import AsyncOpenAI from openai import AsyncOpenAI
from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem from llama_stack.apis.common.content_types import InterleavedContent, InterleavedContentItem

View file

@ -4,53 +4,55 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import os
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
class BedrockBaseConfig(BaseModel): class BedrockBaseConfig(BaseModel):
aws_access_key_id: str | None = Field( aws_access_key_id: str | None = Field(
default=None, default_factory=lambda: os.getenv("AWS_ACCESS_KEY_ID"),
description="The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID", description="The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID",
) )
aws_secret_access_key: str | None = Field( aws_secret_access_key: str | None = Field(
default=None, default_factory=lambda: os.getenv("AWS_SECRET_ACCESS_KEY"),
description="The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY", description="The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY",
) )
aws_session_token: str | None = Field( aws_session_token: str | None = Field(
default=None, default_factory=lambda: os.getenv("AWS_SESSION_TOKEN"),
description="The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN", description="The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN",
) )
region_name: str | None = Field( region_name: str | None = Field(
default=None, default_factory=lambda: os.getenv("AWS_DEFAULT_REGION"),
description="The default AWS Region to use, for example, us-west-1 or us-west-2." description="The default AWS Region to use, for example, us-west-1 or us-west-2."
"Default use environment variable: AWS_DEFAULT_REGION", "Default use environment variable: AWS_DEFAULT_REGION",
) )
profile_name: str | None = Field( profile_name: str | None = Field(
default=None, default_factory=lambda: os.getenv("AWS_PROFILE"),
description="The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE", description="The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE",
) )
total_max_attempts: int | None = Field( total_max_attempts: int | None = Field(
default=None, default_factory=lambda: int(val) if (val := os.getenv("AWS_MAX_ATTEMPTS")) else None,
description="An integer representing the maximum number of attempts that will be made for a single request, " description="An integer representing the maximum number of attempts that will be made for a single request, "
"including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS", "including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS",
) )
retry_mode: str | None = Field( retry_mode: str | None = Field(
default=None, default_factory=lambda: os.getenv("AWS_RETRY_MODE"),
description="A string representing the type of retries Boto3 will perform." description="A string representing the type of retries Boto3 will perform."
"Default use environment variable: AWS_RETRY_MODE", "Default use environment variable: AWS_RETRY_MODE",
) )
connect_timeout: float | None = Field( connect_timeout: float | None = Field(
default=60, default_factory=lambda: float(os.getenv("AWS_CONNECT_TIMEOUT", "60")),
description="The time in seconds till a timeout exception is thrown when attempting to make a connection. " description="The time in seconds till a timeout exception is thrown when attempting to make a connection. "
"The default is 60 seconds.", "The default is 60 seconds.",
) )
read_timeout: float | None = Field( read_timeout: float | None = Field(
default=60, default_factory=lambda: float(os.getenv("AWS_READ_TIMEOUT", "60")),
description="The time in seconds till a timeout exception is thrown when attempting to read from a connection." description="The time in seconds till a timeout exception is thrown when attempting to read from a connection."
"The default is 60 seconds.", "The default is 60 seconds.",
) )
session_ttl: int | None = Field( session_ttl: int | None = Field(
default=3600, default_factory=lambda: int(os.getenv("AWS_SESSION_TTL", "3600")),
description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).", description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).",
) )
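
The default= to default_factory= change matters because a plain default is evaluated once at import time, while a factory re-reads the environment on each instantiation. A minimal demonstration with a cut-down stand-in config:

import os
from pydantic import BaseModel, Field

os.environ.pop("AWS_DEFAULT_REGION", None)

class DemoConfig(BaseModel):  # stand-in, not the real BedrockBaseConfig
    region_name: str | None = Field(default_factory=lambda: os.getenv("AWS_DEFAULT_REGION"))

assert DemoConfig().region_name is None
os.environ["AWS_DEFAULT_REGION"] = "us-west-2"
assert DemoConfig().region_name == "us-west-2"  # picked up without re-import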

View file

@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import asyncio
import base64 import base64
import struct import struct
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
@ -43,9 +44,11 @@ class SentenceTransformerEmbeddingMixin:
task_type: EmbeddingTaskType | None = None, task_type: EmbeddingTaskType | None = None,
) -> EmbeddingsResponse: ) -> EmbeddingsResponse:
model = await self.model_store.get_model(model_id) model = await self.model_store.get_model(model_id)
embedding_model = self._load_sentence_transformer_model(model.provider_resource_id) embedding_model = await self._load_sentence_transformer_model(model.provider_resource_id)
embeddings = embedding_model.encode( embeddings = await asyncio.to_thread(
[interleaved_content_as_str(content) for content in contents], show_progress_bar=False embedding_model.encode,
[interleaved_content_as_str(content) for content in contents],
show_progress_bar=False,
) )
return EmbeddingsResponse(embeddings=embeddings) return EmbeddingsResponse(embeddings=embeddings)
@ -64,8 +67,8 @@ class SentenceTransformerEmbeddingMixin:
# Get the model and generate embeddings # Get the model and generate embeddings
model_obj = await self.model_store.get_model(model) model_obj = await self.model_store.get_model(model)
embedding_model = self._load_sentence_transformer_model(model_obj.provider_resource_id) embedding_model = await self._load_sentence_transformer_model(model_obj.provider_resource_id)
embeddings = embedding_model.encode(input_list, show_progress_bar=False) embeddings = await asyncio.to_thread(embedding_model.encode, input_list, show_progress_bar=False)
# Convert embeddings to the requested format # Convert embeddings to the requested format
data = [] data = []
@ -93,7 +96,7 @@ class SentenceTransformerEmbeddingMixin:
usage=usage, usage=usage,
) )
def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer": async def _load_sentence_transformer_model(self, model: str) -> "SentenceTransformer":
global EMBEDDING_MODELS global EMBEDDING_MODELS
loaded_model = EMBEDDING_MODELS.get(model) loaded_model = EMBEDDING_MODELS.get(model)
@ -101,8 +104,12 @@ class SentenceTransformerEmbeddingMixin:
return loaded_model return loaded_model
log.info(f"Loading sentence transformer for {model}...") log.info(f"Loading sentence transformer for {model}...")
from sentence_transformers import SentenceTransformer
loaded_model = SentenceTransformer(model) def _load_model():
from sentence_transformers import SentenceTransformer
return SentenceTransformer(model)
loaded_model = await asyncio.to_thread(_load_model)
EMBEDDING_MODELS[model] = loaded_model EMBEDDING_MODELS[model] = loaded_model
return loaded_model return loaded_model
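
The pattern used throughout this mixin, reduced to essentials: asyncio.to_thread pushes the blocking model call onto a worker thread so the event loop stays responsive. slow_encode here stands in for SentenceTransformer.encode:

import asyncio
import time

def slow_encode(texts: list[str]) -> list[list[float]]:
    time.sleep(0.1)  # simulate a blocking model call
    return [[float(len(t))] for t in texts]

async def main():
    # The blocking work runs in a thread; other coroutines can run meanwhile.
    embeddings = await asyncio.to_thread(slow_encode, ["hello", "world"])
    print(embeddings)  # [[5.0], [5.0]]

asyncio.run(main())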

View file

@ -3,6 +3,11 @@
# #
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import asyncio
from typing import Any
from sqlalchemy.exc import IntegrityError
from llama_stack.apis.inference import ( from llama_stack.apis.inference import (
ListOpenAIChatCompletionResponse, ListOpenAIChatCompletionResponse,
OpenAIChatCompletion, OpenAIChatCompletion,
@ -10,24 +15,43 @@ from llama_stack.apis.inference import (
OpenAIMessageParam, OpenAIMessageParam,
Order, Order,
) )
from llama_stack.core.datatypes import AccessRule from llama_stack.core.datatypes import AccessRule, InferenceStoreConfig
from llama_stack.core.utils.config_dirs import RUNTIME_BASE_DIR from llama_stack.log import get_logger
from ..sqlstore.api import ColumnDefinition, ColumnType from ..sqlstore.api import ColumnDefinition, ColumnType
from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore from ..sqlstore.authorized_sqlstore import AuthorizedSqlStore
from ..sqlstore.sqlstore import SqliteSqlStoreConfig, SqlStoreConfig, sqlstore_impl from ..sqlstore.sqlstore import SqlStoreConfig, SqlStoreType, sqlstore_impl
logger = get_logger(name=__name__, category="inference_store")
class InferenceStore: class InferenceStore:
def __init__(self, sql_store_config: SqlStoreConfig, policy: list[AccessRule]): def __init__(
if not sql_store_config: self,
sql_store_config = SqliteSqlStoreConfig( config: InferenceStoreConfig | SqlStoreConfig,
db_path=(RUNTIME_BASE_DIR / "sqlstore.db").as_posix(), policy: list[AccessRule],
):
# Handle backward compatibility
if not isinstance(config, InferenceStoreConfig):
# Legacy: SqlStoreConfig passed directly as config
config = InferenceStoreConfig(
sql_store_config=config,
) )
self.sql_store_config = sql_store_config
self.config = config
self.sql_store_config = config.sql_store_config
self.sql_store = None self.sql_store = None
self.policy = policy self.policy = policy
# Disable write queue for SQLite to avoid concurrency issues
self.enable_write_queue = self.sql_store_config.type != SqlStoreType.sqlite
# Async write queue and worker control
self._queue: asyncio.Queue[tuple[OpenAIChatCompletion, list[OpenAIMessageParam]]] | None = None
self._worker_tasks: list[asyncio.Task[Any]] = []
self._max_write_queue_size: int = config.max_write_queue_size
self._num_writers: int = max(1, config.num_writers)
async def initialize(self): async def initialize(self):
"""Create the necessary tables if they don't exist.""" """Create the necessary tables if they don't exist."""
self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config)) self.sql_store = AuthorizedSqlStore(sqlstore_impl(self.sql_store_config))
@ -42,23 +66,109 @@ class InferenceStore:
}, },
) )
if self.enable_write_queue:
self._queue = asyncio.Queue(maxsize=self._max_write_queue_size)
for _ in range(self._num_writers):
self._worker_tasks.append(asyncio.create_task(self._worker_loop()))
else:
logger.info("Write queue disabled for SQLite to avoid concurrency issues")
async def shutdown(self) -> None:
if not self._worker_tasks:
return
if self._queue is not None:
await self._queue.join()
for t in self._worker_tasks:
if not t.done():
t.cancel()
for t in self._worker_tasks:
try:
await t
except asyncio.CancelledError:
pass
self._worker_tasks.clear()
async def flush(self) -> None:
"""Wait for all queued writes to complete. Useful for testing."""
if self.enable_write_queue and self._queue is not None:
await self._queue.join()
async def store_chat_completion( async def store_chat_completion(
self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam] self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam]
) -> None: ) -> None:
if not self.sql_store: if self.enable_write_queue:
if self._queue is None:
raise ValueError("Inference store is not initialized")
try:
self._queue.put_nowait((chat_completion, input_messages))
except asyncio.QueueFull:
logger.warning(
f"Write queue full; adding chat completion id={getattr(chat_completion, 'id', '<unknown>')}"
)
await self._queue.put((chat_completion, input_messages))
else:
await self._write_chat_completion(chat_completion, input_messages)
async def _worker_loop(self) -> None:
assert self._queue is not None
while True:
try:
item = await self._queue.get()
except asyncio.CancelledError:
break
chat_completion, input_messages = item
try:
await self._write_chat_completion(chat_completion, input_messages)
except Exception as e: # noqa: BLE001
logger.error(f"Error writing chat completion: {e}")
finally:
self._queue.task_done()
async def _write_chat_completion(
self, chat_completion: OpenAIChatCompletion, input_messages: list[OpenAIMessageParam]
) -> None:
if self.sql_store is None:
raise ValueError("Inference store is not initialized") raise ValueError("Inference store is not initialized")
data = chat_completion.model_dump() data = chat_completion.model_dump()
record_data = {
"id": data["id"],
"created": data["created"],
"model": data["model"],
"choices": data["choices"],
"input_messages": [message.model_dump() for message in input_messages],
}
await self.sql_store.insert( try:
table="chat_completions", await self.sql_store.insert(
data={ table="chat_completions",
"id": data["id"], data=record_data,
"created": data["created"], )
"model": data["model"], except IntegrityError as e:
"choices": data["choices"], # Duplicate chat completion IDs can be generated during tests especially if they are replaying
"input_messages": [message.model_dump() for message in input_messages], # recorded responses across different tests. No need to warn or error under those circumstances.
}, # In the wild, this is not likely to happen at all (no evidence) so we aren't really hiding any problem.
# Check if it's a unique constraint violation
error_message = str(e.orig) if e.orig else str(e)
if self._is_unique_constraint_error(error_message):
# Update the existing record instead
await self.sql_store.update(table="chat_completions", data=record_data, where={"id": data["id"]})
else:
# Re-raise if it's not a unique constraint error
raise
def _is_unique_constraint_error(self, error_message: str) -> bool:
"""Check if the error is specifically a unique constraint violation."""
error_lower = error_message.lower()
return any(
indicator in error_lower
for indicator in [
"unique constraint failed", # SQLite
"duplicate key", # PostgreSQL
"unique violation", # PostgreSQL alternative
"duplicate entry", # MySQL
]
) )
async def list_chat_completions( async def list_chat_completions(
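
The net effect of this file's changes: store_chat_completion becomes a cheap enqueue, a pool of worker tasks performs the actual inserts, and flush()/shutdown() give tests and teardown a way to wait for durability. A minimal sketch of that producer/worker-pool shape, assuming nothing about the real storage layer (the print stands in for the SQL insert; names and sizes are illustrative):

```python
import asyncio

class QueuedWriter:
    def __init__(self, max_size: int = 8, num_workers: int = 2) -> None:
        self._queue: asyncio.Queue[str] = asyncio.Queue(maxsize=max_size)
        self._workers: list[asyncio.Task[None]] = []
        self._num_workers = num_workers

    async def initialize(self) -> None:
        for _ in range(self._num_workers):
            self._workers.append(asyncio.create_task(self._worker()))

    async def submit(self, item: str) -> None:
        try:
            self._queue.put_nowait(item)   # fast path: never blocks
        except asyncio.QueueFull:
            await self._queue.put(item)    # back-pressure: wait for room

    async def flush(self) -> None:
        await self._queue.join()           # wait until every item is processed

    async def shutdown(self) -> None:
        await self._queue.join()
        for t in self._workers:
            t.cancel()
        await asyncio.gather(*self._workers, return_exceptions=True)

    async def _worker(self) -> None:
        while True:
            item = await self._queue.get()
            try:
                await asyncio.sleep(0.01)  # stand-in for the actual DB insert
                print(f"wrote {item}")
            finally:
                self._queue.task_done()

async def main() -> None:
    w = QueuedWriter()
    await w.initialize()
    for i in range(5):
        await w.submit(f"chat-completion-{i}")
    await w.flush()
    await w.shutdown()

asyncio.run(main())
```

The put_nowait-then-await-put dance preserves the non-blocking fast path while still applying back-pressure when the queue fills, matching the warn-then-wait behavior above.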

View file

@ -67,6 +67,17 @@ class OpenAIMixin(ABC):
""" """
pass pass
def get_extra_client_params(self) -> dict[str, Any]:
"""
Get any extra parameters to pass to the AsyncOpenAI client.
Child classes can override this method to provide additional parameters
such as timeout settings, proxies, etc.
:return: A dictionary of extra parameters
"""
return {}
@property @property
def client(self) -> AsyncOpenAI: def client(self) -> AsyncOpenAI:
""" """
@ -78,6 +89,7 @@ class OpenAIMixin(ABC):
return AsyncOpenAI( return AsyncOpenAI(
api_key=self.get_api_key(), api_key=self.get_api_key(),
base_url=self.get_base_url(), base_url=self.get_base_url(),
**self.get_extra_client_params(),
) )
async def _get_provider_model_id(self, model: str) -> str: async def _get_provider_model_id(self, model: str) -> str:
@ -124,10 +136,15 @@ class OpenAIMixin(ABC):
""" """
Direct OpenAI completion API call. Direct OpenAI completion API call.
""" """
if guided_choice is not None: # Handle parameters that are not supported by the OpenAI API but may be supported by the provider
logger.warning("guided_choice is not supported by the OpenAI API. Ignoring.") # prompt_logprobs is supported by vLLM
if prompt_logprobs is not None: # guided_choice is supported by vLLM
logger.warning("prompt_logprobs is not supported by the OpenAI API. Ignoring.") # TODO: test coverage
extra_body: dict[str, Any] = {}
if prompt_logprobs is not None and prompt_logprobs >= 0:
extra_body["prompt_logprobs"] = prompt_logprobs
if guided_choice:
extra_body["guided_choice"] = guided_choice
# TODO: fix openai_completion to return type compatible with OpenAI's API response # TODO: fix openai_completion to return type compatible with OpenAI's API response
return await self.client.completions.create( # type: ignore[no-any-return] return await self.client.completions.create( # type: ignore[no-any-return]
@ -150,7 +167,8 @@ class OpenAIMixin(ABC):
top_p=top_p, top_p=top_p,
user=user, user=user,
suffix=suffix, suffix=suffix,
) ),
extra_body=extra_body,
) )
async def openai_chat_completion( async def openai_chat_completion(
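
These two hooks work together: get_extra_client_params customizes how the AsyncOpenAI client is constructed, and extra_body carries provider-only parameters (vLLM's prompt_logprobs and guided_choice) past the strict OpenAI signature. Below is a hypothetical vLLM-style adapter wired the same way; the class and attribute names are assumptions, and only the two mechanisms themselves mirror the diff:

```python
from typing import Any

import httpx
from openai import AsyncOpenAI

class VLLMStyleAdapter:
    def __init__(self, api_key: str, base_url: str) -> None:
        self.api_key = api_key
        self.base_url = base_url

    def get_extra_client_params(self) -> dict[str, Any]:
        # Assumption: a self-hosted server warrants a longer timeout.
        return {"timeout": httpx.Timeout(120.0)}

    @property
    def client(self) -> AsyncOpenAI:
        return AsyncOpenAI(
            api_key=self.api_key,
            base_url=self.base_url,
            **self.get_extra_client_params(),
        )

    async def complete(self, model: str, prompt: str, guided_choice: list[str] | None = None):
        extra_body: dict[str, Any] = {}
        if guided_choice:
            # Not part of the OpenAI schema; the client forwards extra_body
            # verbatim, so a vLLM backend can still see it.
            extra_body["guided_choice"] = guided_choice
        return await self.client.completions.create(
            model=model, prompt=prompt, extra_body=extra_body
        )
```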

View file

@ -172,6 +172,20 @@ class AuthorizedSqlStore:
return results.data[0] if results.data else None return results.data[0] if results.data else None
async def update(self, table: str, data: Mapping[str, Any], where: Mapping[str, Any]) -> None:
"""Update rows with automatic access control attribute capture."""
enhanced_data = dict(data)
current_user = get_authenticated_user()
if current_user:
enhanced_data["owner_principal"] = current_user.principal
enhanced_data["access_attributes"] = current_user.attributes
else:
enhanced_data["owner_principal"] = None
enhanced_data["access_attributes"] = None
await self.sql_store.update(table, enhanced_data, where)
async def delete(self, table: str, where: Mapping[str, Any]) -> None: async def delete(self, table: str, where: Mapping[str, Any]) -> None:
"""Delete rows with automatic access control filtering.""" """Delete rows with automatic access control filtering."""
await self.sql_store.delete(table, where) await self.sql_store.delete(table, where)
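
The new update mirrors insert: before the row is written, it is stamped with the requester's principal and attributes (or explicit NULLs for anonymous writers) so later reads can be access-filtered. A toy version of just the stamping step, with a simplified User type standing in for the real authentication context:

```python
from dataclasses import dataclass
from typing import Any, Mapping

@dataclass
class User:
    principal: str
    attributes: dict[str, Any] | None

_current_user: User | None = None  # stand-in for the contextvar-based lookup

def get_authenticated_user() -> User | None:
    return _current_user

def stamp_access_attributes(data: Mapping[str, Any]) -> dict[str, Any]:
    """Copy the row data and attach owner columns, as update() does above."""
    enhanced = dict(data)
    user = get_authenticated_user()
    enhanced["owner_principal"] = user.principal if user else None
    enhanced["access_attributes"] = user.attributes if user else None
    return enhanced

_current_user = User("alice", {"team": ["ml"]})
row = stamp_access_attributes({"id": "chat-123", "model": "llama-3"})
assert row["owner_principal"] == "alice"
```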

View file

@ -18,6 +18,7 @@ from functools import wraps
from typing import Any from typing import Any
from llama_stack.apis.telemetry import ( from llama_stack.apis.telemetry import (
Event,
LogSeverity, LogSeverity,
Span, Span,
SpanEndPayload, SpanEndPayload,
@ -98,7 +99,7 @@ class BackgroundLogger:
def __init__(self, api: Telemetry, capacity: int = 100000): def __init__(self, api: Telemetry, capacity: int = 100000):
self.api = api self.api = api
self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity) self.log_queue: queue.Queue[Any] = queue.Queue(maxsize=capacity)
self.worker_thread = threading.Thread(target=self._process_logs, daemon=True) self.worker_thread = threading.Thread(target=self._worker, daemon=True)
self.worker_thread.start() self.worker_thread.start()
self._last_queue_full_log_time: float = 0.0 self._last_queue_full_log_time: float = 0.0
self._dropped_since_last_notice: int = 0 self._dropped_since_last_notice: int = 0
@ -118,12 +119,16 @@ class BackgroundLogger:
self._last_queue_full_log_time = current_time self._last_queue_full_log_time = current_time
self._dropped_since_last_notice = 0 self._dropped_since_last_notice = 0
def _process_logs(self): def _worker(self):
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(self._process_logs())
async def _process_logs(self):
while True: while True:
try: try:
event = self.log_queue.get() event = self.log_queue.get()
# figure out how to use a thread's native loop await self.api.log_event(event)
asyncio.run(self.api.log_event(event))
except Exception: except Exception:
import traceback import traceback
@ -136,6 +141,19 @@ class BackgroundLogger:
self.log_queue.join() self.log_queue.join()
def enqueue_event(event: Event) -> None:
"""Enqueue a telemetry event to the background logger if available.
This provides a non-blocking path for routers and other hot paths to
submit telemetry without awaiting the Telemetry API, reducing contention
with the main event loop.
"""
global BACKGROUND_LOGGER
if BACKGROUND_LOGGER is None:
raise RuntimeError("Telemetry API not initialized")
BACKGROUND_LOGGER.log_event(event)
class TraceContext: class TraceContext:
spans: list[Span] = [] spans: list[Span] = []
@ -256,11 +274,7 @@ class TelemetryHandler(logging.Handler):
if record.module in ("asyncio", "selector_events"): if record.module in ("asyncio", "selector_events"):
return return
global CURRENT_TRACE_CONTEXT, BACKGROUND_LOGGER global CURRENT_TRACE_CONTEXT
if BACKGROUND_LOGGER is None:
raise RuntimeError("Telemetry API not initialized")
context = CURRENT_TRACE_CONTEXT.get() context = CURRENT_TRACE_CONTEXT.get()
if context is None: if context is None:
return return
@ -269,7 +283,7 @@ class TelemetryHandler(logging.Handler):
if span is None: if span is None:
return return
BACKGROUND_LOGGER.log_event( enqueue_event(
UnstructuredLogEvent( UnstructuredLogEvent(
trace_id=span.trace_id, trace_id=span.trace_id,
span_id=span.span_id, span_id=span.span_id,
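
The telemetry change replaces a per-event asyncio.run() with a single event loop owned by the worker thread, so every log event reuses the same loop instead of paying loop setup and teardown each time. A self-contained sketch of that thread-owns-a-loop shape (names and the print-based handler are illustrative):

```python
import asyncio
import queue
import threading

class BackgroundWorker:
    def __init__(self) -> None:
        self.q: queue.Queue = queue.Queue(maxsize=1000)
        threading.Thread(target=self._worker, daemon=True).start()

    def _worker(self) -> None:
        loop = asyncio.new_event_loop()   # loop private to this thread
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self._process())

    async def _process(self) -> None:
        while True:
            event = self.q.get()          # blocking get is fine here: this
            await self._handle(event)     # loop has nothing else to run
            self.q.task_done()

    async def _handle(self, event: str) -> None:
        await asyncio.sleep(0)            # stand-in for api.log_event(event)
        print("logged", event)

w = BackgroundWorker()
w.q.put("span-started")
w.q.join()                                # wait until the event is processed
```

Blocking on queue.Queue.get() inside the coroutine is acceptable because the private loop runs only this one task; the application's main event loop never sees the blocking call.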

View file

@ -67,6 +67,38 @@ async def client_wrapper(endpoint: str, headers: dict[str, str]) -> AsyncGenerat
raise AuthenticationRequiredError(exc) from exc raise AuthenticationRequiredError(exc) from exc
if i == len(connection_strategies) - 1: if i == len(connection_strategies) - 1:
raise raise
except* httpx.ConnectError as eg:
# Connection refused, server down, network unreachable
if i == len(connection_strategies) - 1:
error_msg = f"Failed to connect to MCP server at {endpoint}: Connection refused"
logger.error(f"MCP connection error: {error_msg}")
raise ConnectionError(error_msg) from eg
else:
logger.warning(
f"failed to connect to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
)
except* httpx.TimeoutException as eg:
# Request timeout, server too slow
if i == len(connection_strategies) - 1:
error_msg = f"MCP server at {endpoint} timed out"
logger.error(f"MCP timeout error: {error_msg}")
raise TimeoutError(error_msg) from eg
else:
logger.warning(
f"MCP server at {endpoint} timed out via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
)
except* httpx.RequestError as eg:
# DNS resolution failures, network errors, invalid URLs
if i == len(connection_strategies) - 1:
# Get the first exception's message for the error string
exc_msg = str(eg.exceptions[0]) if eg.exceptions else "Unknown error"
error_msg = f"Network error connecting to MCP server at {endpoint}: {exc_msg}"
logger.error(f"MCP network error: {error_msg}")
raise ConnectionError(error_msg) from eg
else:
logger.warning(
f"network error connecting to MCP server at {endpoint} via {strategy.name}, falling back to {connection_strategies[i + 1].name}"
)
except* McpError: except* McpError:
if i < len(connection_strategies) - 1: if i < len(connection_strategies) - 1:
logger.warning( logger.warning(
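
Because the underlying transports run in task groups, failures surface as ExceptionGroups, which is why these handlers use except*. A compact, runnable sketch of the same match-then-fall-back shape, with a toy TransportError standing in for the httpx exceptions:

```python
import asyncio

class TransportError(Exception):
    pass

async def connect(strategy: str) -> str:
    # Every strategy fails in this demo so the fallback path is exercised.
    raise TransportError(f"{strategy} unreachable")

async def connect_with_fallback(strategies: list[str]) -> str:
    for i, strategy in enumerate(strategies):
        try:
            async with asyncio.TaskGroup() as tg:  # failures arrive as an ExceptionGroup
                task = tg.create_task(connect(strategy))
            return task.result()
        except* TransportError as eg:
            if i == len(strategies) - 1:
                msg = str(eg.exceptions[0]) if eg.exceptions else "unknown error"
                raise ConnectionError(f"all strategies failed: {msg}") from eg
            print(f"{strategy} failed, falling back to {strategies[i + 1]}")
    raise RuntimeError("unreachable")

try:
    asyncio.run(connect_with_fallback(["streamable_http", "sse"]))
except ConnectionError as e:
    print(e)
```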

View file

@ -30,6 +30,9 @@ from openai.types.completion_choice import CompletionChoice
CompletionChoice.model_fields["finish_reason"].annotation = Literal["stop", "length", "content_filter"] | None CompletionChoice.model_fields["finish_reason"].annotation = Literal["stop", "length", "content_filter"] | None
CompletionChoice.model_rebuild() CompletionChoice.model_rebuild()
REPO_ROOT = Path(__file__).parent.parent.parent
DEFAULT_STORAGE_DIR = REPO_ROOT / "tests/integration/recordings"
class InferenceMode(StrEnum): class InferenceMode(StrEnum):
LIVE = "live" LIVE = "live"
@ -51,7 +54,7 @@ def normalize_request(method: str, url: str, headers: dict[str, Any], body: dict
def get_inference_mode() -> InferenceMode: def get_inference_mode() -> InferenceMode:
return InferenceMode(os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "live").lower()) return InferenceMode(os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "replay").lower())
def setup_inference_recording(): def setup_inference_recording():
@ -60,28 +63,18 @@ def setup_inference_recording():
to increase their reliability and reduce reliance on expensive, external services. to increase their reliability and reduce reliance on expensive, external services.
Currently, this is only supported for OpenAI and Ollama clients. These should cover the vast majority of use cases. Currently, this is only supported for OpenAI and Ollama clients. These should cover the vast majority of use cases.
Calls to the /models endpoint are not currently trapped. We probably need to add support for this.
Two environment variables are required: Two environment variables are supported:
- LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', or 'replay'. - LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', or 'replay'. Default is 'replay'.
- LLAMA_STACK_TEST_RECORDING_DIR: The directory to store the recordings in. - LLAMA_STACK_TEST_RECORDING_DIR: The directory to store the recordings in. Default is 'tests/integration/recordings'.
The recordings are stored in a SQLite database and a JSON file for each request. The SQLite database is used to The recordings are stored as JSON files.
quickly find the correct recording for a given request. The JSON files are used to store the request and response
bodies.
""" """
mode = get_inference_mode() mode = get_inference_mode()
if mode not in InferenceMode:
raise ValueError(f"Invalid LLAMA_STACK_TEST_INFERENCE_MODE: {mode}. Must be 'live', 'record', or 'replay'")
if mode == InferenceMode.LIVE: if mode == InferenceMode.LIVE:
return None return None
if "LLAMA_STACK_TEST_RECORDING_DIR" not in os.environ: storage_dir = os.environ.get("LLAMA_STACK_TEST_RECORDING_DIR", DEFAULT_STORAGE_DIR)
raise ValueError("LLAMA_STACK_TEST_RECORDING_DIR must be set for recording or replaying")
storage_dir = os.environ["LLAMA_STACK_TEST_RECORDING_DIR"]
return inference_recording(mode=mode, storage_dir=storage_dir) return inference_recording(mode=mode, storage_dir=storage_dir)
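
Two things changed here: replay is now the default mode with a default storage directory, and the explicit mode-validity check could be dropped because constructing a StrEnum from an unknown value already raises ValueError. A small demonstration of both, using a local copy of the enum:

```python
import os
from enum import StrEnum

class InferenceMode(StrEnum):
    LIVE = "live"
    RECORD = "record"
    REPLAY = "replay"

def get_inference_mode() -> InferenceMode:
    # Replay is the default, so replaying recordings needs no env vars at all.
    return InferenceMode(os.environ.get("LLAMA_STACK_TEST_INFERENCE_MODE", "replay").lower())

assert get_inference_mode() is InferenceMode.REPLAY  # default

os.environ["LLAMA_STACK_TEST_INFERENCE_MODE"] = "banana"
try:
    get_inference_mode()
except ValueError as e:
    # The enum constructor performs the validation the old explicit check did.
    print(e)  # 'banana' is not a valid InferenceMode
```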
@ -112,8 +105,12 @@ def _deserialize_response(data: dict[str, Any]) -> Any:
return cls.model_validate(data["__data__"]) return cls.model_validate(data["__data__"])
except (ImportError, AttributeError, TypeError, ValueError) as e: except (ImportError, AttributeError, TypeError, ValueError) as e:
logger.warning(f"Failed to deserialize object of type {data['__type__']}: {e}") logger.warning(f"Failed to deserialize object of type {data['__type__']} with model_validate: {e}")
return data["__data__"] try:
return cls.model_construct(**data["__data__"])
except Exception as e:
logger.warning(f"Failed to deserialize object of type {data['__type__']} with model_construct: {e}")
return data["__data__"]
return data return data
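
The deserializer now degrades in two stages: strict model_validate first, then model_construct (which skips validation entirely), and only then the raw dict. A toy illustration of the cascade with a hypothetical pydantic model:

```python
from pydantic import BaseModel

class Chunk(BaseModel):
    id: str
    created: int

def deserialize(data: dict):
    try:
        return Chunk.model_validate(data)          # strict: validates types
    except Exception:
        try:
            return Chunk.model_construct(**data)   # lenient: no validation
        except Exception:
            return data                            # last resort: raw dict

print(deserialize({"id": "x", "created": 1}))      # validated model
print(deserialize({"id": "x", "created": "oops"})) # constructed, unvalidated
```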
@ -134,8 +131,8 @@ class ResponseStorage:
def store_recording(self, request_hash: str, request: dict[str, Any], response: dict[str, Any]): def store_recording(self, request_hash: str, request: dict[str, Any], response: dict[str, Any]):
"""Store a request/response pair.""" """Store a request/response pair."""
# Generate unique response filename # Generate unique response filename
response_file = f"{request_hash[:12]}.json" short_hash = request_hash[:12]
response_path = self.responses_dir / response_file response_file = f"{short_hash}.json"
# Serialize response body if needed # Serialize response body if needed
serialized_response = dict(response) serialized_response = dict(response)
@ -147,6 +144,14 @@ class ResponseStorage:
# Handle single response # Handle single response
serialized_response["body"] = _serialize_response(serialized_response["body"]) serialized_response["body"] = _serialize_response(serialized_response["body"])
# If this is an Ollama /api/tags recording, include models digest in filename to distinguish variants
endpoint = request.get("endpoint")
if endpoint in ("/api/tags", "/v1/models"):
digest = _model_identifiers_digest(endpoint, response)
response_file = f"models-{short_hash}-{digest}.json"
response_path = self.responses_dir / response_file
# Save response to JSON file # Save response to JSON file
with open(response_path, "w") as f: with open(response_path, "w") as f:
json.dump({"request": request, "response": serialized_response}, f, indent=2) json.dump({"request": request, "response": serialized_response}, f, indent=2)
@ -161,19 +166,85 @@ class ResponseStorage:
if not response_path.exists(): if not response_path.exists():
return None return None
with open(response_path) as f: return _recording_from_file(response_path)
data = json.load(f)
# Deserialize response body if needed def _model_list_responses(self, short_hash: str) -> list[dict[str, Any]]:
if "response" in data and "body" in data["response"]: results: list[dict[str, Any]] = []
if isinstance(data["response"]["body"], list): for path in self.responses_dir.glob(f"models-{short_hash}-*.json"):
# Handle streaming responses data = _recording_from_file(path)
data["response"]["body"] = [_deserialize_response(chunk) for chunk in data["response"]["body"]] results.append(data)
return results
def _recording_from_file(response_path) -> dict[str, Any]:
with open(response_path) as f:
data = json.load(f)
# Deserialize response body if needed
if "response" in data and "body" in data["response"]:
if isinstance(data["response"]["body"], list):
# Handle streaming responses
data["response"]["body"] = [_deserialize_response(chunk) for chunk in data["response"]["body"]]
else:
# Handle single response
data["response"]["body"] = _deserialize_response(data["response"]["body"])
return cast(dict[str, Any], data)
def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str:
def _extract_model_identifiers():
"""Extract a stable set of identifiers for model-list endpoints.
Supported endpoints:
- '/api/tags' (Ollama): response body has 'models': [ { name/model/digest/id/... }, ... ]
- '/v1/models' (OpenAI): response body has 'data': [ { id: ... }, ... ]
Returns a sorted list of unique identifiers for the models in the response.
"""
body = response["body"]
if endpoint == "/api/tags":
items = body.get("models")
idents = [m.model for m in items]
else:
items = body.get("data")
idents = [m.id for m in items]
return sorted(set(idents))
identifiers = _extract_model_identifiers()
return hashlib.sha1(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8]
def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None:
"""Return a single, unioned recording for supported model-list endpoints."""
seen: dict[str, dict[str, Any]] = {}
for rec in records:
body = rec["response"]["body"]
if endpoint == "/api/tags":
items = body.models
elif endpoint == "/v1/models":
items = body.data
else:
items = []
for m in items:
if endpoint == "/v1/models":
key = m.id
else: else:
# Handle single response key = m.model
data["response"]["body"] = _deserialize_response(data["response"]["body"]) seen[key] = m
return cast(dict[str, Any], data) ordered = [seen[k] for k in sorted(seen.keys())]
canonical = records[0]
canonical_req = canonical.get("request", {})
if isinstance(canonical_req, dict):
canonical_req["endpoint"] = endpoint
if endpoint == "/v1/models":
body = {"data": ordered, "object": "list"}
else:
from ollama import ListResponse
body = ListResponse(models=ordered)
return {"request": canonical_req, "response": {"body": body, "is_streaming": False}}
async def _patched_inference_method(original_method, self, client_type, endpoint, *args, **kwargs): async def _patched_inference_method(original_method, self, client_type, endpoint, *args, **kwargs):
@ -195,8 +266,6 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
raise ValueError(f"Unknown client type: {client_type}") raise ValueError(f"Unknown client type: {client_type}")
url = base_url.rstrip("/") + endpoint url = base_url.rstrip("/") + endpoint
# Normalize request for matching
method = "POST" method = "POST"
headers = {} headers = {}
body = kwargs body = kwargs
@ -204,7 +273,12 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
request_hash = normalize_request(method, url, headers, body) request_hash = normalize_request(method, url, headers, body)
if _current_mode == InferenceMode.REPLAY: if _current_mode == InferenceMode.REPLAY:
recording = _current_storage.find_recording(request_hash) # Special handling for model-list endpoints: return union of all responses
if endpoint in ("/api/tags", "/v1/models"):
records = _current_storage._model_list_responses(request_hash[:12])
recording = _combine_model_list_responses(endpoint, records)
else:
recording = _current_storage.find_recording(request_hash)
if recording: if recording:
response_body = recording["response"]["body"] response_body = recording["response"]["body"]
@ -222,7 +296,7 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
f"No recorded response found for request hash: {request_hash}\n" f"No recorded response found for request hash: {request_hash}\n"
f"Request: {method} {url} {body}\n" f"Request: {method} {url} {body}\n"
f"Model: {body.get('model', 'unknown')}\n" f"Model: {body.get('model', 'unknown')}\n"
f"To record this response, run with LLAMA_STACK_INFERENCE_MODE=record" f"To record this response, run with LLAMA_STACK_TEST_INFERENCE_MODE=record"
) )
elif _current_mode == InferenceMode.RECORD: elif _current_mode == InferenceMode.RECORD:
@ -274,12 +348,14 @@ def patch_inference_clients():
from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
from openai.resources.completions import AsyncCompletions from openai.resources.completions import AsyncCompletions
from openai.resources.embeddings import AsyncEmbeddings from openai.resources.embeddings import AsyncEmbeddings
from openai.resources.models import AsyncModels
# Store original methods for both OpenAI and Ollama clients # Store original methods for both OpenAI and Ollama clients
_original_methods = { _original_methods = {
"chat_completions_create": AsyncChatCompletions.create, "chat_completions_create": AsyncChatCompletions.create,
"completions_create": AsyncCompletions.create, "completions_create": AsyncCompletions.create,
"embeddings_create": AsyncEmbeddings.create, "embeddings_create": AsyncEmbeddings.create,
"models_list": AsyncModels.list,
"ollama_generate": OllamaAsyncClient.generate, "ollama_generate": OllamaAsyncClient.generate,
"ollama_chat": OllamaAsyncClient.chat, "ollama_chat": OllamaAsyncClient.chat,
"ollama_embed": OllamaAsyncClient.embed, "ollama_embed": OllamaAsyncClient.embed,
@ -304,10 +380,16 @@ def patch_inference_clients():
_original_methods["embeddings_create"], self, "openai", "/v1/embeddings", *args, **kwargs _original_methods["embeddings_create"], self, "openai", "/v1/embeddings", *args, **kwargs
) )
async def patched_models_list(self, *args, **kwargs):
return await _patched_inference_method(
_original_methods["models_list"], self, "openai", "/v1/models", *args, **kwargs
)
# Apply OpenAI patches # Apply OpenAI patches
AsyncChatCompletions.create = patched_chat_completions_create AsyncChatCompletions.create = patched_chat_completions_create
AsyncCompletions.create = patched_completions_create AsyncCompletions.create = patched_completions_create
AsyncEmbeddings.create = patched_embeddings_create AsyncEmbeddings.create = patched_embeddings_create
AsyncModels.list = patched_models_list
# Create patched methods for Ollama client # Create patched methods for Ollama client
async def patched_ollama_generate(self, *args, **kwargs): async def patched_ollama_generate(self, *args, **kwargs):
@ -361,11 +443,13 @@ def unpatch_inference_clients():
from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
from openai.resources.completions import AsyncCompletions from openai.resources.completions import AsyncCompletions
from openai.resources.embeddings import AsyncEmbeddings from openai.resources.embeddings import AsyncEmbeddings
from openai.resources.models import AsyncModels
# Restore OpenAI client methods # Restore OpenAI client methods
AsyncChatCompletions.create = _original_methods["chat_completions_create"] AsyncChatCompletions.create = _original_methods["chat_completions_create"]
AsyncCompletions.create = _original_methods["completions_create"] AsyncCompletions.create = _original_methods["completions_create"]
AsyncEmbeddings.create = _original_methods["embeddings_create"] AsyncEmbeddings.create = _original_methods["embeddings_create"]
AsyncModels.list = _original_methods["models_list"]
# Restore Ollama client methods if they were patched # Restore Ollama client methods if they were patched
OllamaAsyncClient.generate = _original_methods["ollama_generate"] OllamaAsyncClient.generate = _original_methods["ollama_generate"]
@ -379,16 +463,10 @@ def unpatch_inference_clients():
@contextmanager @contextmanager
def inference_recording(mode: str = "live", storage_dir: str | Path | None = None) -> Generator[None, None, None]: def inference_recording(mode: str, storage_dir: str | Path | None = None) -> Generator[None, None, None]:
"""Context manager for inference recording/replaying.""" """Context manager for inference recording/replaying."""
global _current_mode, _current_storage global _current_mode, _current_storage
# Set defaults
if storage_dir is None:
storage_dir_path = Path.home() / ".llama" / "recordings"
else:
storage_dir_path = Path(storage_dir)
# Store previous state # Store previous state
prev_mode = _current_mode prev_mode = _current_mode
prev_storage = _current_storage prev_storage = _current_storage
@ -397,7 +475,9 @@ def inference_recording(mode: str = "live", storage_dir: str | Path | None = Non
_current_mode = mode _current_mode = mode
if mode in ["record", "replay"]: if mode in ["record", "replay"]:
_current_storage = ResponseStorage(storage_dir_path) if storage_dir is None:
raise ValueError("storage_dir is required for record and replay modes")
_current_storage = ResponseStorage(Path(storage_dir))
patch_inference_clients() patch_inference_clients()
yield yield
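
The context manager no longer invents a fallback directory: callers using record or replay must pass storage_dir explicitly (setup_inference_recording above supplies the repo-relative default). A self-contained miniature of the new contract:

```python
from contextlib import contextmanager
from pathlib import Path

@contextmanager
def inference_recording(mode: str, storage_dir: str | Path | None = None):
    # Mirrors the validation above: no implicit ~/.llama/recordings any more;
    # record/replay refuse to start without an explicit storage_dir.
    if mode in ("record", "replay"):
        if storage_dir is None:
            raise ValueError("storage_dir is required for record and replay modes")
        print(f"{mode} mode, storing under {Path(storage_dir)}")
    yield

with inference_recording("replay", "tests/integration/recordings"):
    pass

try:
    with inference_recording("record"):
        pass
except ValueError as e:
    print(e)
```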

View file

@ -10,7 +10,7 @@
"dependencies": { "dependencies": {
"@radix-ui/react-collapsible": "^1.1.12", "@radix-ui/react-collapsible": "^1.1.12",
"@radix-ui/react-dialog": "^1.1.13", "@radix-ui/react-dialog": "^1.1.13",
"@radix-ui/react-dropdown-menu": "^2.1.14", "@radix-ui/react-dropdown-menu": "^2.1.16",
"@radix-ui/react-select": "^2.2.5", "@radix-ui/react-select": "^2.2.5",
"@radix-ui/react-separator": "^1.1.7", "@radix-ui/react-separator": "^1.1.7",
"@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-slot": "^1.2.3",
@ -18,18 +18,18 @@
"class-variance-authority": "^0.7.1", "class-variance-authority": "^0.7.1",
"clsx": "^2.1.1", "clsx": "^2.1.1",
"framer-motion": "^12.23.12", "framer-motion": "^12.23.12",
"llama-stack-client": "^0.2.20", "llama-stack-client": "^0.2.21",
"lucide-react": "^0.510.0", "lucide-react": "^0.542.0",
"next": "15.3.3", "next": "15.3.3",
"next-auth": "^4.24.11", "next-auth": "^4.24.11",
"next-themes": "^0.4.6", "next-themes": "^0.4.6",
"react": "^19.0.0", "react": "^19.0.0",
"react-dom": "^19.0.0", "react-dom": "^19.1.1",
"react-markdown": "^10.1.0", "react-markdown": "^10.1.0",
"remark-gfm": "^4.0.1", "remark-gfm": "^4.0.1",
"remeda": "^2.30.0", "remeda": "^2.30.0",
"shiki": "^1.29.2", "shiki": "^1.29.2",
"sonner": "^2.0.6", "sonner": "^2.0.7",
"tailwind-merge": "^3.3.1" "tailwind-merge": "^3.3.1"
}, },
"devDependencies": { "devDependencies": {
@ -2066,12 +2066,35 @@
"license": "MIT" "license": "MIT"
}, },
"node_modules/@radix-ui/react-arrow": { "node_modules/@radix-ui/react-arrow": {
"version": "1.1.6", "version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.6.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.7.tgz",
"integrity": "sha512-2JMfHJf/eVnwq+2dewT3C0acmCWD3XiVA1Da+jTDqo342UlU13WvXtqHhG+yJw5JeQmu4ue2eMy6gcEArLBlcw==", "integrity": "sha512-F+M1tLhO+mlQaOWspE8Wstg+z6PwxwRd8oQ8IXceWz92kfAmalTRf0EjrouQeo7QssEPfCn05B4Ihs1K9WQ/7w==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/react-primitive": "2.1.2" "@radix-ui/react-primitive": "2.1.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-arrow/node_modules/@radix-ui/react-primitive": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
"integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-slot": "1.2.3"
}, },
"peerDependencies": { "peerDependencies": {
"@types/react": "*", "@types/react": "*",
@ -2172,15 +2195,15 @@
} }
}, },
"node_modules/@radix-ui/react-collection": { "node_modules/@radix-ui/react-collection": {
"version": "1.1.6", "version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.6.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.7.tgz",
"integrity": "sha512-PbhRFK4lIEw9ADonj48tiYWzkllz81TM7KVYyyMMw2cwHO7D5h4XKEblL8NlaRisTK3QTe6tBEhDccFUryxHBQ==", "integrity": "sha512-Fh9rGN0MoI4ZFUNyfFVNU4y9LUz93u9/0K+yLgA2bwRojxM8JU1DyvvMBabnZPBgMWREAJvU2jjVzq+LrFUglw==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2", "@radix-ui/react-context": "1.1.2",
"@radix-ui/react-primitive": "2.1.2", "@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-slot": "1.2.2" "@radix-ui/react-slot": "1.2.3"
}, },
"peerDependencies": { "peerDependencies": {
"@types/react": "*", "@types/react": "*",
@ -2197,21 +2220,26 @@
} }
} }
}, },
"node_modules/@radix-ui/react-collection/node_modules/@radix-ui/react-slot": { "node_modules/@radix-ui/react-collection/node_modules/@radix-ui/react-primitive": {
"version": "1.2.2", "version": "2.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.2.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
"integrity": "sha512-y7TBO4xN4Y94FvcWIOIh18fM4R1A8S4q1jhoz4PNzOoHsFcN8pogcFmZrTYAm4F9VRUrWP/Mw7xSKybIeRI+CQ==", "integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/react-compose-refs": "1.1.2" "@radix-ui/react-slot": "1.2.3"
}, },
"peerDependencies": { "peerDependencies": {
"@types/react": "*", "@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" "@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
}, },
"peerDependenciesMeta": { "peerDependenciesMeta": {
"@types/react": { "@types/react": {
"optional": true "optional": true
},
"@types/react-dom": {
"optional": true
} }
} }
}, },
@ -2342,17 +2370,17 @@
} }
}, },
"node_modules/@radix-ui/react-dropdown-menu": { "node_modules/@radix-ui/react-dropdown-menu": {
"version": "2.1.14", "version": "2.1.16",
"resolved": "https://registry.npmjs.org/@radix-ui/react-dropdown-menu/-/react-dropdown-menu-2.1.14.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-dropdown-menu/-/react-dropdown-menu-2.1.16.tgz",
"integrity": "sha512-lzuyNjoWOoaMFE/VC5FnAAYM16JmQA8ZmucOXtlhm2kKR5TSU95YLAueQ4JYuRmUJmBvSqXaVFGIfuukybwZJQ==", "integrity": "sha512-1PLGQEynI/3OX/ftV54COn+3Sud/Mn8vALg2rWnBLnRaGtJDduNW/22XjlGgPdpcIbiQxjKtb7BkcjP00nqfJw==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/primitive": "1.1.2", "@radix-ui/primitive": "1.1.3",
"@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2", "@radix-ui/react-context": "1.1.2",
"@radix-ui/react-id": "1.1.1", "@radix-ui/react-id": "1.1.1",
"@radix-ui/react-menu": "2.1.14", "@radix-ui/react-menu": "2.1.16",
"@radix-ui/react-primitive": "2.1.2", "@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-controllable-state": "1.2.2" "@radix-ui/react-use-controllable-state": "1.2.2"
}, },
"peerDependencies": { "peerDependencies": {
@ -2370,6 +2398,35 @@
} }
} }
}, },
"node_modules/@radix-ui/react-dropdown-menu/node_modules/@radix-ui/primitive": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz",
"integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
"license": "MIT"
},
"node_modules/@radix-ui/react-dropdown-menu/node_modules/@radix-ui/react-primitive": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
"integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-slot": "1.2.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-focus-guards": { "node_modules/@radix-ui/react-focus-guards": {
"version": "1.1.2", "version": "1.1.2",
"resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.2.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.2.tgz",
@ -2429,26 +2486,26 @@
} }
}, },
"node_modules/@radix-ui/react-menu": { "node_modules/@radix-ui/react-menu": {
"version": "2.1.14", "version": "2.1.16",
"resolved": "https://registry.npmjs.org/@radix-ui/react-menu/-/react-menu-2.1.14.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-menu/-/react-menu-2.1.16.tgz",
"integrity": "sha512-0zSiBAIFq9GSKoSH5PdEaQeRB3RnEGxC+H2P0egtnKoKKLNBH8VBHyVO6/jskhjAezhOIplyRUj7U2lds9A+Yg==", "integrity": "sha512-72F2T+PLlphrqLcAotYPp0uJMr5SjP5SL01wfEspJbru5Zs5vQaSHb4VB3ZMJPimgHHCHG7gMOeOB9H3Hdmtxg==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/primitive": "1.1.2", "@radix-ui/primitive": "1.1.3",
"@radix-ui/react-collection": "1.1.6", "@radix-ui/react-collection": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2", "@radix-ui/react-context": "1.1.2",
"@radix-ui/react-direction": "1.1.1", "@radix-ui/react-direction": "1.1.1",
"@radix-ui/react-dismissable-layer": "1.1.9", "@radix-ui/react-dismissable-layer": "1.1.11",
"@radix-ui/react-focus-guards": "1.1.2", "@radix-ui/react-focus-guards": "1.1.3",
"@radix-ui/react-focus-scope": "1.1.6", "@radix-ui/react-focus-scope": "1.1.7",
"@radix-ui/react-id": "1.1.1", "@radix-ui/react-id": "1.1.1",
"@radix-ui/react-popper": "1.2.6", "@radix-ui/react-popper": "1.2.8",
"@radix-ui/react-portal": "1.1.8", "@radix-ui/react-portal": "1.1.9",
"@radix-ui/react-presence": "1.1.4", "@radix-ui/react-presence": "1.1.5",
"@radix-ui/react-primitive": "2.1.2", "@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-roving-focus": "1.1.9", "@radix-ui/react-roving-focus": "1.1.11",
"@radix-ui/react-slot": "1.2.2", "@radix-ui/react-slot": "1.2.3",
"@radix-ui/react-use-callback-ref": "1.1.1", "@radix-ui/react-use-callback-ref": "1.1.1",
"aria-hidden": "^1.2.4", "aria-hidden": "^1.2.4",
"react-remove-scroll": "^2.6.3" "react-remove-scroll": "^2.6.3"
@ -2468,14 +2525,44 @@
} }
} }
}, },
"node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-slot": { "node_modules/@radix-ui/react-menu/node_modules/@radix-ui/primitive": {
"version": "1.2.2", "version": "1.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.2.2.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz",
"integrity": "sha512-y7TBO4xN4Y94FvcWIOIh18fM4R1A8S4q1jhoz4PNzOoHsFcN8pogcFmZrTYAm4F9VRUrWP/Mw7xSKybIeRI+CQ==", "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
"license": "MIT"
},
"node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-dismissable-layer": {
"version": "1.1.11",
"resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz",
"integrity": "sha512-Nqcp+t5cTB8BinFkZgXiMJniQH0PsUt2k51FUhbdfeKvc4ACcG2uQniY/8+h1Yv6Kza4Q7lD7PQV0z0oicE0Mg==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/react-compose-refs": "1.1.2" "@radix-ui/primitive": "1.1.3",
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1",
"@radix-ui/react-use-escape-keydown": "1.1.1"
}, },
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-focus-guards": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.1.3.tgz",
"integrity": "sha512-0rFg/Rj2Q62NCm62jZw0QX7a3sz6QCQU0LpZdNrJX8byRGaGVTqbrW9jAoIAHyMQqsNpeZ81YgSizOt5WXq0Pw==",
"license": "MIT",
"peerDependencies": { "peerDependencies": {
"@types/react": "*", "@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
@ -2486,17 +2573,113 @@
} }
} }
}, },
"node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-focus-scope": {
"version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-focus-scope/-/react-focus-scope-1.1.7.tgz",
"integrity": "sha512-t2ODlkXBQyn7jkl6TNaw/MtVEVvIGelJDCG41Okq/KwUsJBwQ4XVZsHAVUkK4mBv3ewiAS3PGuUWuY2BoK4ZUw==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-portal": {
"version": "1.1.9",
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz",
"integrity": "sha512-bpIxvq03if6UNwXZ+HTK71JLh4APvnXntDc6XOX8UVq4XQOVl7lwok0AvIl+b8zgCw3fSaVTZMpAPPagXbKmHQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-layout-effect": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-presence": {
"version": "1.1.5",
"resolved": "https://registry.npmjs.org/@radix-ui/react-presence/-/react-presence-1.1.5.tgz",
"integrity": "sha512-/jfEwNDdQVBCNvjkGit4h6pMOzq8bHkopq458dPt2lMjx+eBQUohZNG9A7DtO/O5ukSbxuaNGXMjHicgwy6rQQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-use-layout-effect": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-menu/node_modules/@radix-ui/react-primitive": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
"integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-slot": "1.2.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-popper": { "node_modules/@radix-ui/react-popper": {
"version": "1.2.6", "version": "1.2.8",
"resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.6.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.8.tgz",
"integrity": "sha512-7iqXaOWIjDBfIG7aq8CUEeCSsQMLFdn7VEE8TaFz704DtEzpPHR7w/uuzRflvKgltqSAImgcmxQ7fFX3X7wasg==", "integrity": "sha512-0NJQ4LFFUuWkE7Oxf0htBKS6zLkkjBH+hM1uk7Ng705ReR8m/uelduy1DBo0PyBXPKVnBA6YBlU94MBGXrSBCw==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@floating-ui/react-dom": "^2.0.0", "@floating-ui/react-dom": "^2.0.0",
"@radix-ui/react-arrow": "1.1.6", "@radix-ui/react-arrow": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2", "@radix-ui/react-context": "1.1.2",
"@radix-ui/react-primitive": "2.1.2", "@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1", "@radix-ui/react-use-callback-ref": "1.1.1",
"@radix-ui/react-use-layout-effect": "1.1.1", "@radix-ui/react-use-layout-effect": "1.1.1",
"@radix-ui/react-use-rect": "1.1.1", "@radix-ui/react-use-rect": "1.1.1",
@ -2518,6 +2701,29 @@
} }
} }
}, },
"node_modules/@radix-ui/react-popper/node_modules/@radix-ui/react-primitive": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
"integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-slot": "1.2.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-portal": { "node_modules/@radix-ui/react-portal": {
"version": "1.1.8", "version": "1.1.8",
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.8.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.8.tgz",
@ -2608,18 +2814,18 @@
} }
}, },
"node_modules/@radix-ui/react-roving-focus": { "node_modules/@radix-ui/react-roving-focus": {
"version": "1.1.9", "version": "1.1.11",
"resolved": "https://registry.npmjs.org/@radix-ui/react-roving-focus/-/react-roving-focus-1.1.9.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-roving-focus/-/react-roving-focus-1.1.11.tgz",
"integrity": "sha512-ZzrIFnMYHHCNqSNCsuN6l7wlewBEq0O0BCSBkabJMFXVO51LRUTq71gLP1UxFvmrXElqmPjA5VX7IqC9VpazAQ==", "integrity": "sha512-7A6S9jSgm/S+7MdtNDSb+IU859vQqJ/QAtcYQcfFC6W8RS4IxIZDldLR0xqCFZ6DCyrQLjLPsxtTNch5jVA4lA==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@radix-ui/primitive": "1.1.2", "@radix-ui/primitive": "1.1.3",
"@radix-ui/react-collection": "1.1.6", "@radix-ui/react-collection": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2", "@radix-ui/react-context": "1.1.2",
"@radix-ui/react-direction": "1.1.1", "@radix-ui/react-direction": "1.1.1",
"@radix-ui/react-id": "1.1.1", "@radix-ui/react-id": "1.1.1",
"@radix-ui/react-primitive": "2.1.2", "@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1", "@radix-ui/react-use-callback-ref": "1.1.1",
"@radix-ui/react-use-controllable-state": "1.2.2" "@radix-ui/react-use-controllable-state": "1.2.2"
}, },
@ -2638,6 +2844,35 @@
} }
} }
}, },
"node_modules/@radix-ui/react-roving-focus/node_modules/@radix-ui/primitive": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.1.3.tgz",
"integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
"license": "MIT"
},
"node_modules/@radix-ui/react-roving-focus/node_modules/@radix-ui/react-primitive": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-2.1.3.tgz",
"integrity": "sha512-m9gTwRkhy2lvCPe6QJp4d3G1TYEUHn/FzJUtq9MjH46an1wJU+GdoGC5VLof8RX8Ft/DlpshApkhswDLZzHIcQ==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-slot": "1.2.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-select": { "node_modules/@radix-ui/react-select": {
"version": "2.2.5", "version": "2.2.5",
"resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.5.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-select/-/react-select-2.2.5.tgz",
@ -2681,55 +2916,6 @@
} }
} }
}, },
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-arrow": {
"version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.7.tgz",
"integrity": "sha512-F+M1tLhO+mlQaOWspE8Wstg+z6PwxwRd8oQ8IXceWz92kfAmalTRf0EjrouQeo7QssEPfCn05B4Ihs1K9WQ/7w==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-primitive": "2.1.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-collection": {
"version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.7.tgz",
"integrity": "sha512-Fh9rGN0MoI4ZFUNyfFVNU4y9LUz93u9/0K+yLgA2bwRojxM8JU1DyvvMBabnZPBgMWREAJvU2jjVzq+LrFUglw==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-slot": "1.2.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-dismissable-layer": { "node_modules/@radix-ui/react-select/node_modules/@radix-ui/react-dismissable-layer": {
"version": "1.1.10", "version": "1.1.10",
"resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.10.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.10.tgz",
@ -2965,29 +3151,6 @@
"integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==", "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-arrow": {
"version": "1.1.7",
"resolved": "https://registry.npmjs.org/@radix-ui/react-arrow/-/react-arrow-1.1.7.tgz",
"integrity": "sha512-F+M1tLhO+mlQaOWspE8Wstg+z6PwxwRd8oQ8IXceWz92kfAmalTRf0EjrouQeo7QssEPfCn05B4Ihs1K9WQ/7w==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-primitive": "2.1.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-dismissable-layer": { "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-dismissable-layer": {
"version": "1.1.11", "version": "1.1.11",
"resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.1.11.tgz",
@ -3015,38 +3178,6 @@
} }
} }
}, },
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-popper": {
"version": "1.2.8",
"resolved": "https://registry.npmjs.org/@radix-ui/react-popper/-/react-popper-1.2.8.tgz",
"integrity": "sha512-0NJQ4LFFUuWkE7Oxf0htBKS6zLkkjBH+hM1uk7Ng705ReR8m/uelduy1DBo0PyBXPKVnBA6YBlU94MBGXrSBCw==",
"license": "MIT",
"dependencies": {
"@floating-ui/react-dom": "^2.0.0",
"@radix-ui/react-arrow": "1.1.7",
"@radix-ui/react-compose-refs": "1.1.2",
"@radix-ui/react-context": "1.1.2",
"@radix-ui/react-primitive": "2.1.3",
"@radix-ui/react-use-callback-ref": "1.1.1",
"@radix-ui/react-use-layout-effect": "1.1.1",
"@radix-ui/react-use-rect": "1.1.1",
"@radix-ui/react-use-size": "1.1.1",
"@radix-ui/rect": "1.1.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc",
"react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-portal": { "node_modules/@radix-ui/react-tooltip/node_modules/@radix-ui/react-portal": {
"version": "1.1.9", "version": "1.1.9",
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz", "resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.1.9.tgz",
@ -3447,6 +3578,13 @@
"tailwindcss": "4.1.6" "tailwindcss": "4.1.6"
} }
}, },
"node_modules/@tailwindcss/node/node_modules/tailwindcss": {
"version": "4.1.6",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz",
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==",
"dev": true,
"license": "MIT"
},
"node_modules/@tailwindcss/oxide": { "node_modules/@tailwindcss/oxide": {
"version": "4.1.6", "version": "4.1.6",
"resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.6.tgz", "resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.6.tgz",
@ -3707,6 +3845,13 @@
"tailwindcss": "4.1.6" "tailwindcss": "4.1.6"
} }
}, },
"node_modules/@tailwindcss/postcss/node_modules/tailwindcss": {
"version": "4.1.6",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz",
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==",
"dev": true,
"license": "MIT"
},
"node_modules/@testing-library/dom": { "node_modules/@testing-library/dom": {
"version": "10.4.1", "version": "10.4.1",
"resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz", "resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz",
@ -4079,9 +4224,9 @@
} }
}, },
"node_modules/@types/react-dom": { "node_modules/@types/react-dom": {
"version": "19.1.5", "version": "19.1.9",
"resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.1.5.tgz", "resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.1.9.tgz",
"integrity": "sha512-CMCjrWucUBZvohgZxkjd6S9h0nZxXjzus6yDfUb+xLxYM7VvjKNH1tQrE9GWLql1XoOP4/Ds3bwFqShHUYraGg==", "integrity": "sha512-qXRuZaOsAdXKFyOhRBg6Lqqc0yay13vN7KrIg4L7N4aaHN68ma9OK3NE1BoDFgFOTfM7zg+3/8+2n8rLUH3OKQ==",
"devOptional": true, "devOptional": true,
"license": "MIT", "license": "MIT",
"peerDependencies": { "peerDependencies": {
@ -10147,9 +10292,9 @@
"license": "MIT" "license": "MIT"
}, },
"node_modules/llama-stack-client": { "node_modules/llama-stack-client": {
"version": "0.2.20", "version": "0.2.21",
"resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.20.tgz", "resolved": "https://registry.npmjs.org/llama-stack-client/-/llama-stack-client-0.2.21.tgz",
"integrity": "sha512-1vD5nizTX5JEW8TADxKgy/P1W8YZoPSpdnmfxbdYbWgpQ3BWtbvLS6jmDk7VwVA5fRC4895VfHsRDfS1liHarw==", "integrity": "sha512-rjU2Vx5xStxDYavU8K1An/SYXiQQjroLcK98B+p0Paz/a7OgRao2S0YwvThJjPUyChY4fO03UIXP9LpmHqlXWQ==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@types/node": "^18.11.18", "@types/node": "^18.11.18",
@ -10240,9 +10385,9 @@
"license": "ISC" "license": "ISC"
}, },
"node_modules/lucide-react": { "node_modules/lucide-react": {
"version": "0.510.0", "version": "0.542.0",
"resolved": "https://registry.npmjs.org/lucide-react/-/lucide-react-0.510.0.tgz", "resolved": "https://registry.npmjs.org/lucide-react/-/lucide-react-0.542.0.tgz",
"integrity": "sha512-p8SQRAMVh7NhsAIETokSqDrc5CHnDLbV29mMnzaXx+Vc/hnqQzwI2r0FMWCcoTXnbw2KEjy48xwpGdEL+ck06Q==", "integrity": "sha512-w3hD8/SQB7+lzU2r4VdFyzzOzKnUjTZIF/MQJGSSvni7Llewni4vuViRppfRAa2guOsY5k4jZyxw/i9DQHv+dw==",
"license": "ISC", "license": "ISC",
"peerDependencies": { "peerDependencies": {
"react": "^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0" "react": "^16.5.1 || ^17.0.0 || ^18.0.0 || ^19.0.0"
@ -12448,24 +12593,24 @@
} }
}, },
"node_modules/react": { "node_modules/react": {
"version": "19.1.0", "version": "19.1.1",
"resolved": "https://registry.npmjs.org/react/-/react-19.1.0.tgz", "resolved": "https://registry.npmjs.org/react/-/react-19.1.1.tgz",
"integrity": "sha512-FS+XFBNvn3GTAWq26joslQgWNoFu08F4kl0J4CgdNKADkdSGXQyTCnKteIAJy96Br6YbpEU1LSzV5dYtjMkMDg==", "integrity": "sha512-w8nqGImo45dmMIfljjMwOGtbmC/mk4CMYhWIicdSflH91J9TyCyczcPFXJzrZ/ZXcgGRFeP6BU0BEJTw6tZdfQ==",
"license": "MIT", "license": "MIT",
"engines": { "engines": {
"node": ">=0.10.0" "node": ">=0.10.0"
} }
}, },
"node_modules/react-dom": { "node_modules/react-dom": {
"version": "19.1.0", "version": "19.1.1",
"resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.1.0.tgz", "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.1.1.tgz",
"integrity": "sha512-Xs1hdnE+DyKgeHJeJznQmYMIBG3TKIHJJT95Q58nHLSrElKlGQqDTR2HQ9fx5CN/Gk6Vh/kupBTDLU11/nDk/g==", "integrity": "sha512-Dlq/5LAZgF0Gaz6yiqZCf6VCcZs1ghAJyrsu84Q/GT0gV+mCxbfmKNoGRKBYMJ8IEdGPqu49YWXD02GCknEDkw==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"scheduler": "^0.26.0" "scheduler": "^0.26.0"
}, },
"peerDependencies": { "peerDependencies": {
"react": "^19.1.0" "react": "^19.1.1"
} }
}, },
"node_modules/react-is": { "node_modules/react-is": {
@ -13285,9 +13430,9 @@
} }
}, },
"node_modules/sonner": { "node_modules/sonner": {
"version": "2.0.6", "version": "2.0.7",
"resolved": "https://registry.npmjs.org/sonner/-/sonner-2.0.6.tgz", "resolved": "https://registry.npmjs.org/sonner/-/sonner-2.0.7.tgz",
"integrity": "sha512-yHFhk8T/DK3YxjFQXIrcHT1rGEeTLliVzWbO0xN8GberVun2RiBnxAjXAYpZrqwEVHBG9asI/Li8TAAhN9m59Q==", "integrity": "sha512-W6ZN4p58k8aDKA4XPcx2hpIQXBRAgyiWVkYhT7CvK6D3iAu7xjvVyhQHg2/iaKJZ1XVJ4r7XuwGL+WGEK37i9w==",
"license": "MIT", "license": "MIT",
"peerDependencies": { "peerDependencies": {
"react": "^18.0.0 || ^19.0.0 || ^19.0.0-rc", "react": "^18.0.0 || ^19.0.0 || ^19.0.0-rc",
@ -13712,9 +13857,9 @@
} }
}, },
"node_modules/tailwindcss": { "node_modules/tailwindcss": {
"version": "4.1.6", "version": "4.1.13",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.6.tgz", "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.13.tgz",
"integrity": "sha512-j0cGLTreM6u4OWzBeLBpycK0WIh8w7kSwcUsQZoGLHZ7xDTdM69lN64AgoIEEwFi0tnhs4wSykUa5YWxAzgFYg==", "integrity": "sha512-i+zidfmTqtwquj4hMEwdjshYYgMbOrPzb9a0M3ZgNa0JMoZeFC6bxZvO8yr8ozS6ix2SDz0+mvryPeBs2TFE+w==",
"dev": true, "dev": true,
"license": "MIT" "license": "MIT"
}, },

View file

@ -15,7 +15,7 @@
"dependencies": { "dependencies": {
"@radix-ui/react-collapsible": "^1.1.12", "@radix-ui/react-collapsible": "^1.1.12",
"@radix-ui/react-dialog": "^1.1.13", "@radix-ui/react-dialog": "^1.1.13",
"@radix-ui/react-dropdown-menu": "^2.1.14", "@radix-ui/react-dropdown-menu": "^2.1.16",
"@radix-ui/react-select": "^2.2.5", "@radix-ui/react-select": "^2.2.5",
"@radix-ui/react-separator": "^1.1.7", "@radix-ui/react-separator": "^1.1.7",
"@radix-ui/react-slot": "^1.2.3", "@radix-ui/react-slot": "^1.2.3",
@ -23,18 +23,18 @@
"class-variance-authority": "^0.7.1", "class-variance-authority": "^0.7.1",
"clsx": "^2.1.1", "clsx": "^2.1.1",
"framer-motion": "^12.23.12", "framer-motion": "^12.23.12",
"llama-stack-client": "^0.2.20", "llama-stack-client": "^0.2.21",
"lucide-react": "^0.510.0", "lucide-react": "^0.542.0",
"next": "15.3.3", "next": "15.3.3",
"next-auth": "^4.24.11", "next-auth": "^4.24.11",
"next-themes": "^0.4.6", "next-themes": "^0.4.6",
"react": "^19.0.0", "react": "^19.0.0",
"react-dom": "^19.0.0", "react-dom": "^19.1.1",
"react-markdown": "^10.1.0", "react-markdown": "^10.1.0",
"remark-gfm": "^4.0.1", "remark-gfm": "^4.0.1",
"remeda": "^2.30.0", "remeda": "^2.30.0",
"shiki": "^1.29.2", "shiki": "^1.29.2",
"sonner": "^2.0.6", "sonner": "^2.0.7",
"tailwind-merge": "^3.3.1" "tailwind-merge": "^3.3.1"
}, },
"devDependencies": { "devDependencies": {

Some files were not shown because too many files have changed in this diff.