Merge branch 'main' into chroma

This commit is contained in:
Bwook (Byoungwook) Kim 2025-10-22 12:44:43 +09:00 committed by GitHub
commit 470adfc2df
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
750 changed files with 243399 additions and 28283 deletions

19
.dockerignore Normal file

@ -0,0 +1,19 @@
.venv
__pycache__
*.pyc
*.pyo
*.pyd
*.so
.git
.gitignore
htmlcov*
.coverage
coverage*
.cache
.mypy_cache
.pytest_cache
.ruff_cache
uv.lock
node_modules
build
/tmp

1
.gitattributes vendored Normal file

@ -0,0 +1 @@
tests/**/recordings/** linguist-generated=true

View file

@ -82,11 +82,13 @@ runs:
echo "No recording changes" echo "No recording changes"
fi fi
- name: Write inference logs to file - name: Write docker logs to file
if: ${{ always() }} if: ${{ always() }}
shell: bash shell: bash
run: | run: |
sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log || true # Ollama logs (if ollama container exists)
sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log 2>&1 || true
# Note: distro container logs are now dumped in integration-tests.sh before container is removed
- name: Upload logs - name: Upload logs
if: ${{ always() }} if: ${{ always() }}

View file

@ -57,7 +57,7 @@ runs:
echo "Building Llama Stack" echo "Building Llama Stack"
LLAMA_STACK_DIR=. \ LLAMA_STACK_DIR=. \
uv run --no-sync llama stack build --template ci-tests --image-type venv uv run --no-sync llama stack list-deps ci-tests | xargs -L1 uv pip install
- name: Configure git for commits - name: Configure git for commits
shell: bash shell: bash

View file

@ -14,6 +14,7 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
| Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks | | Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
| Pre-commit Bot | [precommit-trigger.yml](precommit-trigger.yml) | Pre-commit bot for PR | | Pre-commit Bot | [precommit-trigger.yml](precommit-trigger.yml) | Pre-commit bot for PR |
| Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build | | Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |
| Test llama stack list-deps | [providers-list-deps.yml](providers-list-deps.yml) | Test llama stack list-deps |
| Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project | | Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
| Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration | | Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |
| Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec | | Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec |

View file

@ -30,8 +30,11 @@ jobs:
- name: Build a single provider - name: Build a single provider
run: | run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync \ docker build . \
llama stack build --template starter --image-type container --image-name test -f containers/Containerfile \
--build-arg INSTALL_MODE=editable \
--build-arg DISTRO_NAME=starter \
--tag llama-stack:starter-ci
- name: Run installer end-to-end - name: Run installer end-to-end
run: | run: |

View file

@ -73,6 +73,24 @@ jobs:
image_name: kube image_name: kube
apis: [] apis: []
providers: {} providers: {}
storage:
backends:
kv_default:
type: kv_sqlite
db_path: $run_dir/kvstore.db
sql_default:
type: sql_sqlite
db_path: $run_dir/sql_store.db
stores:
metadata:
namespace: registry
backend: kv_default
inference:
table_name: inference_store
backend: sql_default
conversations:
table_name: openai_conversations
backend: sql_default
server: server:
port: 8321 port: 8321
EOF EOF

View file

@ -47,7 +47,7 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
client-type: [library, server] client-type: [library, server, docker]
# Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12 # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }} python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }} client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
@ -82,7 +82,7 @@ jobs:
env: env:
OPENAI_API_KEY: dummy OPENAI_API_KEY: dummy
with: with:
stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }} stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || matrix.client-type == 'server' && 'server:ci-tests' || 'docker:ci-tests' }}
setup: ${{ matrix.config.setup }} setup: ${{ matrix.config.setup }}
inference-mode: 'replay' inference-mode: 'replay'
suite: ${{ matrix.config.suite }} suite: ${{ matrix.config.suite }}

View file

@ -144,7 +144,7 @@ jobs:
- name: Build Llama Stack - name: Build Llama Stack
run: | run: |
uv run --no-sync llama stack build --template ci-tests --image-type venv uv run --no-sync llama stack list-deps ci-tests | xargs -L1 uv pip install
- name: Check Storage and Memory Available Before Tests - name: Check Storage and Memory Available Before Tests
if: ${{ always() }} if: ${{ always() }}
@ -169,9 +169,7 @@ jobs:
run: | run: |
uv run --no-sync \ uv run --no-sync \
pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \ pytest -sv --stack-config="files=inline::localfs,inference=inline::sentence-transformers,vector_io=${{ matrix.vector-io-provider }}" \
tests/integration/vector_io \ tests/integration/vector_io
--embedding-model nomic-ai/nomic-embed-text-v1.5 \
--embedding-dimension 768
- name: Check Storage and Memory Available After Tests - name: Check Storage and Memory Available After Tests
if: ${{ always() }} if: ${{ always() }}

View file

@ -37,7 +37,7 @@ jobs:
.pre-commit-config.yaml .pre-commit-config.yaml
- name: Set up Node.js - name: Set up Node.js
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0 uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
with: with:
node-version: '20' node-version: '20'
cache: 'npm' cache: 'npm'

View file

@ -99,7 +99,7 @@ jobs:
owner: context.repo.owner, owner: context.repo.owner,
repo: context.repo.repo, repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }}, issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `⏳ Running pre-commit hooks on PR #${{ steps.check_author.outputs.pr_number }}...` body: `⏳ Running [pre-commit hooks](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}) on PR #${{ steps.check_author.outputs.pr_number }}...`
}); });
- name: Checkout PR branch (same-repo) - name: Checkout PR branch (same-repo)
@ -141,7 +141,7 @@ jobs:
- name: Set up Node.js - name: Set up Node.js
if: steps.check_author.outputs.authorized == 'true' if: steps.check_author.outputs.authorized == 'true'
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0 uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
with: with:
node-version: '20' node-version: '20'
cache: 'npm' cache: 'npm'

View file

@ -14,6 +14,8 @@ on:
- '.github/workflows/providers-build.yml' - '.github/workflows/providers-build.yml'
- 'llama_stack/distributions/**' - 'llama_stack/distributions/**'
- 'pyproject.toml' - 'pyproject.toml'
- 'containers/Containerfile'
- '.dockerignore'
pull_request: pull_request:
paths: paths:
@ -24,6 +26,8 @@ on:
- '.github/workflows/providers-build.yml' - '.github/workflows/providers-build.yml'
- 'llama_stack/distributions/**' - 'llama_stack/distributions/**'
- 'pyproject.toml' - 'pyproject.toml'
- 'containers/Containerfile'
- '.dockerignore'
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }} group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
@ -60,15 +64,19 @@ jobs:
- name: Install dependencies - name: Install dependencies
uses: ./.github/actions/setup-runner uses: ./.github/actions/setup-runner
- name: Print build dependencies - name: Install distribution into venv
if: matrix.image-type == 'venv'
run: | run: |
uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only uv run llama stack list-deps ${{ matrix.distro }} | xargs -L1 uv pip install
- name: Run Llama Stack Build - name: Build container image
if: matrix.image-type == 'container'
run: | run: |
# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead docker build . \
# LLAMA_STACK_DIR is set to the current directory so we are building from the source -f containers/Containerfile \
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --distro ${{ matrix.distro }} --image-type ${{ matrix.image-type }} --image-name test --build-arg INSTALL_MODE=editable \
--build-arg DISTRO_NAME=${{ matrix.distro }} \
--tag llama-stack:${{ matrix.distro }}-ci
- name: Print dependencies in the image - name: Print dependencies in the image
if: matrix.image-type == 'venv' if: matrix.image-type == 'venv'
@ -86,8 +94,8 @@ jobs:
- name: Build a single provider - name: Build a single provider
run: | run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --image-type venv --image-name test --providers inference=remote::ollama uv pip install -e .
uv run --no-sync llama stack list-deps --providers inference=remote::ollama | xargs -L1 uv pip install
build-custom-container-distribution: build-custom-container-distribution:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
@ -97,11 +105,16 @@ jobs:
- name: Install dependencies - name: Install dependencies
uses: ./.github/actions/setup-runner uses: ./.github/actions/setup-runner
- name: Build a single provider - name: Build container image
run: | run: |
yq -i '.image_type = "container"' llama_stack/distributions/ci-tests/build.yaml BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "python:3.12-slim"' llama_stack/distributions/ci-tests/build.yaml)
yq -i '.image_name = "test"' llama_stack/distributions/ci-tests/build.yaml docker build . \
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml -f containers/Containerfile \
--build-arg INSTALL_MODE=editable \
--build-arg DISTRO_NAME=ci-tests \
--build-arg BASE_IMAGE="$BASE_IMAGE" \
--build-arg RUN_CONFIG_PATH=/workspace/llama_stack/distributions/ci-tests/run.yaml \
-t llama-stack:ci-tests
- name: Inspect the container image entrypoint - name: Inspect the container image entrypoint
run: | run: |
@ -112,7 +125,7 @@ jobs:
fi fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID) entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint" echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then if [ "$entrypoint" != "[/usr/local/bin/llama-stack-entrypoint.sh]" ]; then
echo "Entrypoint is not correct" echo "Entrypoint is not correct"
exit 1 exit 1
fi fi
@ -129,17 +142,19 @@ jobs:
- name: Pin distribution to UBI9 base - name: Pin distribution to UBI9 base
run: | run: |
yq -i ' yq -i '
.image_type = "container" |
.image_name = "ubi9-test" |
.distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest" .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
' llama_stack/distributions/ci-tests/build.yaml ' llama_stack/distributions/ci-tests/build.yaml
- name: Build dev container (UBI9) - name: Build UBI9 container image
env:
USE_COPY_NOT_MOUNT: "true"
LLAMA_STACK_DIR: "."
run: | run: |
uv run llama stack build --config llama_stack/distributions/ci-tests/build.yaml BASE_IMAGE=$(yq -r '.distribution_spec.container_image // "registry.access.redhat.com/ubi9:latest"' llama_stack/distributions/ci-tests/build.yaml)
docker build . \
-f containers/Containerfile \
--build-arg INSTALL_MODE=editable \
--build-arg DISTRO_NAME=ci-tests \
--build-arg BASE_IMAGE="$BASE_IMAGE" \
--build-arg RUN_CONFIG_PATH=/workspace/llama_stack/distributions/ci-tests/run.yaml \
-t llama-stack:ci-tests-ubi9
- name: Inspect UBI9 image - name: Inspect UBI9 image
run: | run: |
@ -150,7 +165,7 @@ jobs:
fi fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID) entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint" echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then if [ "$entrypoint" != "[/usr/local/bin/llama-stack-entrypoint.sh]" ]; then
echo "Entrypoint is not correct" echo "Entrypoint is not correct"
exit 1 exit 1
fi fi

View file

@ -0,0 +1,105 @@
name: Test llama stack list-deps
run-name: Test llama stack list-deps
on:
push:
branches:
- main
paths:
- 'llama_stack/cli/stack/list_deps.py'
- 'llama_stack/cli/stack/_list_deps.py'
- 'llama_stack/core/build.*'
- 'llama_stack/core/*.sh'
- '.github/workflows/providers-list-deps.yml'
- 'llama_stack/templates/**'
- 'pyproject.toml'
pull_request:
paths:
- 'llama_stack/cli/stack/list_deps.py'
- 'llama_stack/cli/stack/_list_deps.py'
- 'llama_stack/core/build.*'
- 'llama_stack/core/*.sh'
- '.github/workflows/providers-list-deps.yml'
- 'llama_stack/templates/**'
- 'pyproject.toml'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
generate-matrix:
runs-on: ubuntu-latest
outputs:
distros: ${{ steps.set-matrix.outputs.distros }}
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Generate Distribution List
id: set-matrix
run: |
distros=$(ls llama_stack/distributions/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
echo "distros=$distros" >> "$GITHUB_OUTPUT"
list-deps:
needs: generate-matrix
runs-on: ubuntu-latest
strategy:
matrix:
distro: ${{ fromJson(needs.generate-matrix.outputs.distros) }}
image-type: [venv, container]
fail-fast: false # We want to run all jobs even if some fail
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Print dependencies
run: |
uv run llama stack list-deps ${{ matrix.distro }}
- name: Install Distro using llama stack list-deps
run: |
# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
# LLAMA_STACK_DIR is set to the current directory so we are building from the source
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack list-deps ${{ matrix.distro }} | xargs -L1 uv pip install
- name: Print dependencies in the image
if: matrix.image-type == 'venv'
run: |
uv pip list
show-single-provider:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Show a single provider
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack list-deps --providers inference=remote::ollama
list-deps-from-config:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: list-deps from Config
env:
USE_COPY_NOT_MOUNT: "true"
LLAMA_STACK_DIR: "."
run: |
uv run llama stack list-deps llama_stack/distributions/ci-tests/build.yaml

View file

@ -24,7 +24,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install uv - name: Install uv
uses: astral-sh/setup-uv@eb1897b8dc4b5d5bfe39a428a8f2304605e0983c # v7.0.0 uses: astral-sh/setup-uv@3259c6206f993105e3a61b142c2d97bf4b9ef83d # v7.1.0
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
activate-environment: true activate-environment: true

View file

@ -46,9 +46,9 @@ jobs:
yq -i '.image_type = "${{ matrix.image-type }}"' tests/external/ramalama-stack/run.yaml yq -i '.image_type = "${{ matrix.image-type }}"' tests/external/ramalama-stack/run.yaml
cat tests/external/ramalama-stack/run.yaml cat tests/external/ramalama-stack/run.yaml
- name: Build distro from config file - name: Install distribution dependencies
run: | run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external/ramalama-stack/build.yaml uv run llama stack list-deps tests/external/ramalama-stack/build.yaml | xargs -L1 uv pip install
- name: Start Llama Stack server in background - name: Start Llama Stack server in background
if: ${{ matrix.image-type }} == 'venv' if: ${{ matrix.image-type }} == 'venv'

View file

@ -44,11 +44,14 @@ jobs:
- name: Print distro dependencies - name: Print distro dependencies
run: | run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml --print-deps-only uv run --no-sync llama stack list-deps tests/external/build.yaml
- name: Build distro from config file - name: Build distro from config file
run: | run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run --no-sync llama stack build --config tests/external/build.yaml uv venv ci-test
source ci-test/bin/activate
uv pip install -e .
LLAMA_STACK_LOGGING=all=CRITICAL llama stack list-deps tests/external/build.yaml | xargs -L1 uv pip install
- name: Start Llama Stack server in background - name: Start Llama Stack server in background
if: ${{ matrix.image-type }} == 'venv' if: ${{ matrix.image-type }} == 'venv'

View file

@ -29,7 +29,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Setup Node.js - name: Setup Node.js
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0 uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
with: with:
node-version: ${{ matrix.node-version }} node-version: ${{ matrix.node-version }}
cache: 'npm' cache: 'npm'

View file

@ -11,14 +11,17 @@ You can install the dependencies by running:
```bash ```bash
cd llama-stack cd llama-stack
uv venv --python 3.12
uv sync --group dev uv sync --group dev
uv pip install -e . uv pip install -e .
source .venv/bin/activate source .venv/bin/activate
``` ```
```{note} ```{note}
You can use a specific version of Python with `uv` by adding the `--python <version>` flag (e.g. `--python 3.12`). If you are making changes to Llama Stack, it is essential that you use Python 3.12 as shown above.
Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`. Llama Stack can work with Python 3.13 but the pre-commit hooks used to validate code changes only work with Python 3.12.
If you don't specify a Python version, `uv` will automatically select a Python version according to the `requires-python`
section of the `pyproject.toml`, which is fine for running Llama Stack but not for committing changes.
For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/). For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
``` ```
@ -42,17 +45,22 @@ uv run --env-file .env -- pytest -v tests/integration/inference/test_text_infere
We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running: We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
```bash ```bash
uv pip install pre-commit==4.3.0
uv run pre-commit install uv run pre-commit install
``` ```
After that, pre-commit hooks will run automatically before each commit. Note that the only version of pre-commit that works with the Llama Stack continuous integration is `4.3.0` so it is essential that you pull
that specific version as shown above. Once you have run these commands, pre-commit hooks will run automatically before each commit.
Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running: Alternatively, if you don't want to install the pre-commit hooks (or if you want to check if your changes are ready before committing),
you can run the checks manually by running:
```bash ```bash
uv run pre-commit run --all-files uv run pre-commit run --all-files -v
``` ```
The `-v` (verbose) parameter is optional but often helpful for getting more information about any issues that the pre-commit checks identify.
```{caution} ```{caution}
Before pushing your changes, make sure that the pre-commit hooks have passed successfully. Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
``` ```
@ -83,6 +91,7 @@ If you are new to the project, start by looking at the issues tagged with "good
leave a comment on the issue and a triager will assign it to you. leave a comment on the issue and a triager will assign it to you.
Please avoid picking up too many issues at once. This helps you stay focused and ensures that others in the community also have opportunities to contribute. Please avoid picking up too many issues at once. This helps you stay focused and ensures that others in the community also have opportunities to contribute.
- Try to work on only 1-2 issues at a time, especially if you're still getting familiar with the codebase.
- Before taking an issue, check if it's already assigned or being actively discussed. - Before taking an issue, check if it's already assigned or being actively discussed.
- If you're blocked or can't continue with an issue, feel free to unassign yourself or leave a comment so others can step in. - If you're blocked or can't continue with an issue, feel free to unassign yourself or leave a comment so others can step in.
@ -158,9 +167,9 @@ under the LICENSE file in the root directory of this source tree.
Some tips about common tasks you work on while contributing to Llama Stack: Some tips about common tasks you work on while contributing to Llama Stack:
### Using `llama stack build` ### Installing dependencies of distributions
Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands. When installing dependencies for a distribution, you can use `llama stack list-deps` to view and install the required packages.
Example: Example:
```bash ```bash
@ -168,7 +177,12 @@ cd work/
git clone https://github.com/llamastack/llama-stack.git git clone https://github.com/llamastack/llama-stack.git
git clone https://github.com/llamastack/llama-stack-client-python.git git clone https://github.com/llamastack/llama-stack-client-python.git
cd llama-stack cd llama-stack
LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...>
# Show dependencies for a distribution
llama stack list-deps <distro-name>
# Install dependencies
llama stack list-deps <distro-name> | xargs -L1 uv pip install
``` ```
### Updating distribution configurations ### Updating distribution configurations
@ -191,6 +205,7 @@ If you are making changes to the documentation at [https://llamastack.github.io/
```bash ```bash
# This rebuilds the documentation pages and the OpenAPI spec. # This rebuilds the documentation pages and the OpenAPI spec.
cd docs/
npm install npm install
npm run gen-api-docs all npm run gen-api-docs all
npm run build npm run build

View file

@ -27,8 +27,11 @@ MODEL="Llama-4-Scout-17B-16E-Instruct"
# get meta url from llama.com # get meta url from llama.com
huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL
# install dependencies for the distribution
llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
# start a llama stack server # start a llama stack server
INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu INFERENCE_MODEL=meta-llama/$MODEL llama stack run meta-reference-gpu
# install client to interact with the server # install client to interact with the server
pip install llama-stack-client pip install llama-stack-client
@ -89,7 +92,7 @@ As more providers start supporting Llama 4, you can use them in Llama Stack as w
To try Llama Stack locally, run: To try Llama Stack locally, run:
```bash ```bash
curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/scripts/install.sh | bash curl -LsSf https://github.com/llamastack/llama-stack/raw/main/scripts/install.sh | bash
``` ```
### Overview ### Overview

View file

@ -98,21 +98,30 @@ data:
- provider_id: model-context-protocol - provider_id: model-context-protocol
provider_type: remote::model-context-protocol provider_type: remote::model-context-protocol
config: {} config: {}
metadata_store: storage:
type: postgres backends:
host: ${env.POSTGRES_HOST:=localhost} kv_default:
port: ${env.POSTGRES_PORT:=5432} type: kv_postgres
db: ${env.POSTGRES_DB:=llamastack} host: ${env.POSTGRES_HOST:=localhost}
user: ${env.POSTGRES_USER:=llamastack} port: ${env.POSTGRES_PORT:=5432}
password: ${env.POSTGRES_PASSWORD:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
table_name: llamastack_kvstore user: ${env.POSTGRES_USER:=llamastack}
inference_store: password: ${env.POSTGRES_PASSWORD:=llamastack}
type: postgres table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
host: ${env.POSTGRES_HOST:=localhost} sql_default:
port: ${env.POSTGRES_PORT:=5432} type: sql_postgres
db: ${env.POSTGRES_DB:=llamastack} host: ${env.POSTGRES_HOST:=localhost}
user: ${env.POSTGRES_USER:=llamastack} port: ${env.POSTGRES_PORT:=5432}
password: ${env.POSTGRES_PASSWORD:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
references:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
models: models:
- metadata: - metadata:
embedding_dimension: 768 embedding_dimension: 768
@ -137,5 +146,4 @@ data:
port: 8323 port: 8323
kind: ConfigMap kind: ConfigMap
metadata: metadata:
creationTimestamp: null
name: llama-stack-config name: llama-stack-config

View file

@ -95,21 +95,30 @@ providers:
- provider_id: model-context-protocol - provider_id: model-context-protocol
provider_type: remote::model-context-protocol provider_type: remote::model-context-protocol
config: {} config: {}
metadata_store: storage:
type: postgres backends:
host: ${env.POSTGRES_HOST:=localhost} kv_default:
port: ${env.POSTGRES_PORT:=5432} type: kv_postgres
db: ${env.POSTGRES_DB:=llamastack} host: ${env.POSTGRES_HOST:=localhost}
user: ${env.POSTGRES_USER:=llamastack} port: ${env.POSTGRES_PORT:=5432}
password: ${env.POSTGRES_PASSWORD:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
table_name: llamastack_kvstore user: ${env.POSTGRES_USER:=llamastack}
inference_store: password: ${env.POSTGRES_PASSWORD:=llamastack}
type: postgres table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
host: ${env.POSTGRES_HOST:=localhost} sql_default:
port: ${env.POSTGRES_PORT:=5432} type: sql_postgres
db: ${env.POSTGRES_DB:=llamastack} host: ${env.POSTGRES_HOST:=localhost}
user: ${env.POSTGRES_USER:=llamastack} port: ${env.POSTGRES_PORT:=5432}
password: ${env.POSTGRES_PASSWORD:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
references:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
models: models:
- metadata: - metadata:
embedding_dimension: 768 embedding_dimension: 768

View file

@ -0,0 +1,8 @@
These are the source-of-truth configuration files used by Stainless to generate the client SDKs.
- `openapi.yml`: this is the OpenAPI specification for the Llama Stack API.
- `openapi.stainless.yml`: this is the Stainless _configuration_ which instructs Stainless how to generate the client SDKs.
A small side note: notice the `.yml` suffixes; Stainless typically uses that suffix for its configuration files.
These files go hand-in-hand. As of now, only the `openapi.yml` file is automatically generated using the `run_openapi_generator.sh` script.
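
As a hedged sketch of the regeneration step described above (the script's location in the repository is an assumption, not stated here), regenerating the spec might look like:

```bash
# Regenerates openapi.yml only; openapi.stainless.yml is maintained by hand.
# The path to run_openapi_generator.sh is assumed -- adjust to wherever it lives in the repo.
./run_openapi_generator.sh
```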

View file

@ -0,0 +1,610 @@
# yaml-language-server: $schema=https://app.stainlessapi.com/config-internal.schema.json
organization:
# Name of your organization or company, used to determine the name of the client
# and headings.
name: llama-stack-client
docs: https://llama-stack.readthedocs.io/en/latest/
contact: llamastack@meta.com
security:
- {}
- BearerAuth: []
security_schemes:
BearerAuth:
type: http
scheme: bearer
# `targets` define the output targets and their customization options, such as
# whether to emit the Node SDK and what its package name should be.
targets:
node:
package_name: llama-stack-client
production_repo: llamastack/llama-stack-client-typescript
publish:
npm: false
python:
package_name: llama_stack_client
production_repo: llamastack/llama-stack-client-python
options:
use_uv: true
publish:
pypi: true
project_name: llama_stack_client
kotlin:
reverse_domain: com.llama_stack_client.api
production_repo: null
publish:
maven: false
go:
package_name: llama-stack-client
production_repo: llamastack/llama-stack-client-go
options:
enable_v2: true
back_compat_use_shared_package: false
# `client_settings` define settings for the API client, such as extra constructor
# arguments (used for authentication), retry behavior, idempotency, etc.
client_settings:
default_env_prefix: LLAMA_STACK_CLIENT
opts:
api_key:
type: string
read_env: LLAMA_STACK_CLIENT_API_KEY
auth: { security_scheme: BearerAuth }
nullable: true
# `environments` are a map of the name of the environment (e.g. "sandbox",
# "production") to the corresponding url to use.
environments:
production: http://any-hosted-llama-stack.com
# `pagination` defines [pagination schemes] which provides a template to match
# endpoints and generate next-page and auto-pagination helpers in the SDKs.
pagination:
- name: datasets_iterrows
type: offset
request:
dataset_id:
type: string
start_index:
type: integer
x-stainless-pagination-property:
purpose: offset_count_param
limit:
type: integer
response:
data:
type: array
items:
type: object
next_index:
type: integer
x-stainless-pagination-property:
purpose: offset_count_start_field
- name: openai_cursor_page
type: cursor
request:
limit:
type: integer
after:
type: string
x-stainless-pagination-property:
purpose: next_cursor_param
response:
data:
type: array
items: {}
has_more:
type: boolean
last_id:
type: string
x-stainless-pagination-property:
purpose: next_cursor_field
# `resources` define the structure and organization for your API, such as how
# methods and models are grouped together and accessed. See the [configuration
# guide] for more information.
#
# [configuration guide]:
# https://app.stainlessapi.com/docs/guides/configure#resources
resources:
$shared:
models:
agent_config: AgentConfig
interleaved_content_item: InterleavedContentItem
interleaved_content: InterleavedContent
param_type: ParamType
safety_violation: SafetyViolation
sampling_params: SamplingParams
scoring_result: ScoringResult
message: Message
user_message: UserMessage
completion_message: CompletionMessage
tool_response_message: ToolResponseMessage
system_message: SystemMessage
tool_call: ToolCall
query_result: RAGQueryResult
document: RAGDocument
query_config: RAGQueryConfig
response_format: ResponseFormat
toolgroups:
models:
tool_group: ToolGroup
list_tool_groups_response: ListToolGroupsResponse
methods:
register: post /v1/toolgroups
get: get /v1/toolgroups/{toolgroup_id}
list: get /v1/toolgroups
unregister: delete /v1/toolgroups/{toolgroup_id}
tools:
methods:
get: get /v1/tools/{tool_name}
list:
endpoint: get /v1/tools
paginated: false
tool_runtime:
models:
tool_def: ToolDef
tool_invocation_result: ToolInvocationResult
methods:
list_tools:
endpoint: get /v1/tool-runtime/list-tools
paginated: false
invoke_tool: post /v1/tool-runtime/invoke
subresources:
rag_tool:
methods:
insert: post /v1/tool-runtime/rag-tool/insert
query: post /v1/tool-runtime/rag-tool/query
responses:
models:
response_object_stream: OpenAIResponseObjectStream
response_object: OpenAIResponseObject
methods:
create:
type: http
endpoint: post /v1/responses
streaming:
stream_event_model: responses.response_object_stream
param_discriminator: stream
retrieve: get /v1/responses/{response_id}
list:
type: http
endpoint: get /v1/responses
delete:
type: http
endpoint: delete /v1/responses/{response_id}
subresources:
input_items:
methods:
list:
type: http
endpoint: get /v1/responses/{response_id}/input_items
conversations:
models:
conversation_object: Conversation
methods:
create:
type: http
endpoint: post /v1/conversations
retrieve: get /v1/conversations/{conversation_id}
update:
type: http
endpoint: post /v1/conversations/{conversation_id}
delete:
type: http
endpoint: delete /v1/conversations/{conversation_id}
subresources:
items:
methods:
get:
type: http
endpoint: get /v1/conversations/{conversation_id}/items/{item_id}
list:
type: http
endpoint: get /v1/conversations/{conversation_id}/items
create:
type: http
endpoint: post /v1/conversations/{conversation_id}/items
inspect:
models:
healthInfo: HealthInfo
providerInfo: ProviderInfo
routeInfo: RouteInfo
versionInfo: VersionInfo
methods:
health: get /v1/health
version: get /v1/version
embeddings:
models:
create_embeddings_response: OpenAIEmbeddingsResponse
methods:
create: post /v1/embeddings
chat:
models:
chat_completion_chunk: OpenAIChatCompletionChunk
subresources:
completions:
methods:
create:
type: http
endpoint: post /v1/chat/completions
streaming:
stream_event_model: chat.chat_completion_chunk
param_discriminator: stream
list:
type: http
endpoint: get /v1/chat/completions
retrieve:
type: http
endpoint: get /v1/chat/completions/{completion_id}
completions:
methods:
create:
type: http
endpoint: post /v1/completions
streaming:
param_discriminator: stream
vector_io:
models:
queryChunksResponse: QueryChunksResponse
methods:
insert: post /v1/vector-io/insert
query: post /v1/vector-io/query
vector_stores:
models:
vector_store: VectorStoreObject
list_vector_stores_response: VectorStoreListResponse
vector_store_delete_response: VectorStoreDeleteResponse
vector_store_search_response: VectorStoreSearchResponsePage
methods:
create: post /v1/vector_stores
list:
endpoint: get /v1/vector_stores
retrieve: get /v1/vector_stores/{vector_store_id}
update: post /v1/vector_stores/{vector_store_id}
delete: delete /v1/vector_stores/{vector_store_id}
search: post /v1/vector_stores/{vector_store_id}/search
subresources:
files:
models:
vector_store_file: VectorStoreFileObject
methods:
list: get /v1/vector_stores/{vector_store_id}/files
retrieve: get /v1/vector_stores/{vector_store_id}/files/{file_id}
update: post /v1/vector_stores/{vector_store_id}/files/{file_id}
delete: delete /v1/vector_stores/{vector_store_id}/files/{file_id}
create: post /v1/vector_stores/{vector_store_id}/files
content: get /v1/vector_stores/{vector_store_id}/files/{file_id}/content
file_batches:
models:
vector_store_file_batches: VectorStoreFileBatchObject
list_vector_store_files_in_batch_response: VectorStoreFilesListInBatchResponse
methods:
create: post /v1/vector_stores/{vector_store_id}/file_batches
retrieve: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}
list_files: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files
cancel: post /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel
models:
models:
model: Model
list_models_response: ListModelsResponse
methods:
retrieve: get /v1/models/{model_id}
list:
endpoint: get /v1/models
paginated: false
register: post /v1/models
unregister: delete /v1/models/{model_id}
subresources:
openai:
methods:
list:
endpoint: get /v1/models
paginated: false
providers:
models:
list_providers_response: ListProvidersResponse
methods:
list:
endpoint: get /v1/providers
paginated: false
retrieve: get /v1/providers/{provider_id}
routes:
models:
list_routes_response: ListRoutesResponse
methods:
list:
endpoint: get /v1/inspect/routes
paginated: false
moderations:
models:
create_response: ModerationObject
methods:
create: post /v1/moderations
safety:
models:
run_shield_response: RunShieldResponse
methods:
run_shield: post /v1/safety/run-shield
shields:
models:
shield: Shield
list_shields_response: ListShieldsResponse
methods:
retrieve: get /v1/shields/{identifier}
list:
endpoint: get /v1/shields
paginated: false
register: post /v1/shields
delete: delete /v1/shields/{identifier}
synthetic_data_generation:
models:
syntheticDataGenerationResponse: SyntheticDataGenerationResponse
methods:
generate: post /v1/synthetic-data-generation/generate
telemetry:
models:
span_with_status: SpanWithStatus
trace: Trace
query_spans_response: QuerySpansResponse
event: Event
query_condition: QueryCondition
methods:
query_traces:
endpoint: post /v1alpha/telemetry/traces
skip_test_reason: 'unsupported query params in java / kotlin'
get_span_tree: post /v1alpha/telemetry/spans/{span_id}/tree
query_spans:
endpoint: post /v1alpha/telemetry/spans
skip_test_reason: 'unsupported query params in java / kotlin'
query_metrics:
endpoint: post /v1alpha/telemetry/metrics/{metric_name}
skip_test_reason: 'unsupported query params in java / kotlin'
# log_event: post /v1alpha/telemetry/events
save_spans_to_dataset: post /v1alpha/telemetry/spans/export
get_span: get /v1alpha/telemetry/traces/{trace_id}/spans/{span_id}
get_trace: get /v1alpha/telemetry/traces/{trace_id}
scoring:
methods:
score: post /v1/scoring/score
score_batch: post /v1/scoring/score-batch
scoring_functions:
methods:
retrieve: get /v1/scoring-functions/{scoring_fn_id}
list:
endpoint: get /v1/scoring-functions
paginated: false
register: post /v1/scoring-functions
models:
scoring_fn: ScoringFn
scoring_fn_params: ScoringFnParams
list_scoring_functions_response: ListScoringFunctionsResponse
benchmarks:
methods:
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}
list:
endpoint: get /v1alpha/eval/benchmarks
paginated: false
register: post /v1alpha/eval/benchmarks
models:
benchmark: Benchmark
list_benchmarks_response: ListBenchmarksResponse
files:
methods:
create: post /v1/files
list: get /v1/files
retrieve: get /v1/files/{file_id}
delete: delete /v1/files/{file_id}
content: get /v1/files/{file_id}/content
models:
file: OpenAIFileObject
list_files_response: ListOpenAIFileResponse
delete_file_response: OpenAIFileDeleteResponse
alpha:
subresources:
inference:
methods:
rerank: post /v1alpha/inference/rerank
post_training:
models:
algorithm_config: AlgorithmConfig
post_training_job: PostTrainingJob
list_post_training_jobs_response: ListPostTrainingJobsResponse
methods:
preference_optimize: post /v1alpha/post-training/preference-optimize
supervised_fine_tune: post /v1alpha/post-training/supervised-fine-tune
subresources:
job:
methods:
artifacts: get /v1alpha/post-training/job/artifacts
cancel: post /v1alpha/post-training/job/cancel
status: get /v1alpha/post-training/job/status
list:
endpoint: get /v1alpha/post-training/jobs
paginated: false
eval:
methods:
evaluate_rows: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
run_eval: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
evaluate_rows_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
run_eval_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
subresources:
jobs:
methods:
cancel: delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
status: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result
models:
evaluate_response: EvaluateResponse
benchmark_config: BenchmarkConfig
job: Job
agents:
methods:
create: post /v1alpha/agents
list: get /v1alpha/agents
retrieve: get /v1alpha/agents/{agent_id}
delete: delete /v1alpha/agents/{agent_id}
models:
inference_step: InferenceStep
tool_execution_step: ToolExecutionStep
tool_response: ToolResponse
shield_call_step: ShieldCallStep
memory_retrieval_step: MemoryRetrievalStep
subresources:
session:
models:
session: Session
methods:
list: get /v1alpha/agents/{agent_id}/sessions
create: post /v1alpha/agents/{agent_id}/session
delete: delete /v1alpha/agents/{agent_id}/session/{session_id}
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}
steps:
methods:
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}
turn:
models:
turn: Turn
turn_response_event: AgentTurnResponseEvent
agent_turn_response_stream_chunk: AgentTurnResponseStreamChunk
methods:
create:
type: http
endpoint: post /v1alpha/agents/{agent_id}/session/{session_id}/turn
streaming:
stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
param_discriminator: stream
retrieve: get /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}
resume:
type: http
endpoint: post /v1alpha/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume
streaming:
stream_event_model: alpha.agents.turn.agent_turn_response_stream_chunk
param_discriminator: stream
beta:
subresources:
datasets:
models:
list_datasets_response: ListDatasetsResponse
methods:
register: post /v1beta/datasets
retrieve: get /v1beta/datasets/{dataset_id}
list:
endpoint: get /v1beta/datasets
paginated: false
unregister: delete /v1beta/datasets/{dataset_id}
iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
appendrows: post /v1beta/datasetio/append-rows/{dataset_id}
settings:
license: MIT
unwrap_response_fields: [ data ]
openapi:
transformations:
- command: renameValue
reason: pydantic reserved name
args:
filter:
only:
- '$.components.schemas.InferenceStep.properties.model_response'
rename:
python:
property_name: 'inference_model_response'
# - command: renameValue
# reason: pydantic reserved name
# args:
# filter:
# only:
# - '$.components.schemas.Model.properties.model_type'
# rename:
# python:
# property_name: 'type'
- command: mergeObject
reason: Better return_type using enum
args:
target:
- '$.components.schemas'
object:
ReturnType:
additionalProperties: false
properties:
type:
enum:
- string
- number
- boolean
- array
- object
- json
- union
- chat_completion_input
- completion_input
- agent_turn_input
required:
- type
type: object
- command: replaceProperties
reason: Replace return type properties with better model (see above)
args:
filter:
only:
- '$.components.schemas.ScoringFn.properties.return_type'
- '$.components.schemas.RegisterScoringFunctionRequest.properties.return_type'
value:
$ref: '#/components/schemas/ReturnType'
- command: oneOfToAnyOf
reason: Prism (mock server) doesn't like one of our requests as it technically matches multiple variants
- reason: For better names
command: extractToRefs
args:
ref:
target: '$.components.schemas.ToolCallDelta.properties.tool_call'
name: '#/components/schemas/ToolCallOrString'
# `readme` is used to configure the code snippets that will be rendered in the
# README.md of various SDKs. In particular, you can change the `headline`
# snippet's endpoint and the arguments to call it with.
readme:
example_requests:
default:
type: request
endpoint: post /v1/chat/completions
params: &ref_0 {}
headline:
type: request
endpoint: post /v1/models
params: *ref_0
pagination:
type: request
endpoint: post /v1/chat/completions
params: {}

File diff suppressed because it is too large

137
containers/Containerfile Normal file

@ -0,0 +1,137 @@
# syntax=docker/dockerfile:1.6
#
# This Dockerfile is used to build the Llama Stack container image.
# Example:
# docker build \
# -f containers/Containerfile \
# --build-arg DISTRO_NAME=starter \
# --tag llama-stack:starter .
ARG BASE_IMAGE=python:3.12-slim
FROM ${BASE_IMAGE}
ARG INSTALL_MODE="pypi"
ARG LLAMA_STACK_DIR="/workspace"
ARG LLAMA_STACK_CLIENT_DIR=""
ARG PYPI_VERSION=""
ARG TEST_PYPI_VERSION=""
ARG KEEP_WORKSPACE=""
ARG DISTRO_NAME="starter"
ARG RUN_CONFIG_PATH=""
ARG UV_HTTP_TIMEOUT=500
ENV UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT}
ENV PYTHONDONTWRITEBYTECODE=1
ENV PIP_DISABLE_PIP_VERSION_CHECK=1
WORKDIR /app
RUN set -eux; \
if command -v dnf >/dev/null 2>&1; then \
dnf -y update && \
dnf install -y iputils git net-tools wget \
vim-minimal python3.12 python3.12-pip python3.12-wheel \
python3.12-setuptools python3.12-devel gcc gcc-c++ make && \
ln -sf /usr/bin/pip3.12 /usr/local/bin/pip && \
ln -sf /usr/bin/python3.12 /usr/local/bin/python && \
dnf clean all; \
elif command -v apt-get >/dev/null 2>&1; then \
apt-get update && \
apt-get install -y --no-install-recommends \
iputils-ping net-tools iproute2 dnsutils telnet \
curl wget git procps psmisc lsof traceroute bubblewrap \
gcc g++ && \
rm -rf /var/lib/apt/lists/*; \
else \
echo "Unsupported base image: expected dnf or apt-get" >&2; \
exit 1; \
fi
RUN pip install --no-cache-dir uv
ENV UV_SYSTEM_PYTHON=1
ENV INSTALL_MODE=${INSTALL_MODE}
ENV LLAMA_STACK_DIR=${LLAMA_STACK_DIR}
ENV LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR}
ENV PYPI_VERSION=${PYPI_VERSION}
ENV TEST_PYPI_VERSION=${TEST_PYPI_VERSION}
ENV KEEP_WORKSPACE=${KEEP_WORKSPACE}
ENV DISTRO_NAME=${DISTRO_NAME}
ENV RUN_CONFIG_PATH=${RUN_CONFIG_PATH}
# Copy the repository so editable installs and run configurations are available.
COPY . /workspace
# Install the client package if it is provided
# NOTE: this is installed before llama-stack since llama-stack depends on llama-stack-client-python
RUN set -eux; \
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then \
if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then \
echo "LLAMA_STACK_CLIENT_DIR is set but $LLAMA_STACK_CLIENT_DIR does not exist" >&2; \
exit 1; \
fi; \
uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"; \
fi;
# Install llama-stack
RUN set -eux; \
if [ "$INSTALL_MODE" = "editable" ]; then \
if [ ! -d "$LLAMA_STACK_DIR" ]; then \
echo "INSTALL_MODE=editable requires LLAMA_STACK_DIR to point to a directory inside the build context" >&2; \
exit 1; \
fi; \
uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"; \
elif [ "$INSTALL_MODE" = "test-pypi" ]; then \
uv pip install --no-cache-dir fastapi libcst; \
if [ -n "$TEST_PYPI_VERSION" ]; then \
uv pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match "llama-stack==$TEST_PYPI_VERSION"; \
else \
uv pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match llama-stack; \
fi; \
else \
if [ -n "$PYPI_VERSION" ]; then \
uv pip install --no-cache-dir "llama-stack==$PYPI_VERSION"; \
else \
uv pip install --no-cache-dir llama-stack; \
fi; \
fi;
# Install the dependencies for the distribution
RUN set -eux; \
if [ -z "$DISTRO_NAME" ]; then \
echo "DISTRO_NAME must be provided" >&2; \
exit 1; \
fi; \
deps="$(llama stack list-deps "$DISTRO_NAME")"; \
if [ -n "$deps" ]; then \
printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache-dir; \
fi
# Cleanup
RUN set -eux; \
pip uninstall -y uv; \
should_remove=1; \
if [ -n "$KEEP_WORKSPACE" ]; then should_remove=0; fi; \
if [ "$INSTALL_MODE" = "editable" ]; then should_remove=0; fi; \
case "$RUN_CONFIG_PATH" in \
/workspace*) should_remove=0 ;; \
esac; \
if [ "$should_remove" -eq 1 ] && [ -d /workspace ]; then rm -rf /workspace; fi
RUN cat <<'EOF' >/usr/local/bin/llama-stack-entrypoint.sh
#!/bin/sh
set -e
if [ -n "$RUN_CONFIG_PATH" ] && [ -f "$RUN_CONFIG_PATH" ]; then
exec llama stack run "$RUN_CONFIG_PATH" "$@"
fi
if [ -n "$DISTRO_NAME" ]; then
exec llama stack run "$DISTRO_NAME" "$@"
fi
exec llama stack run "$@"
EOF
RUN chmod +x /usr/local/bin/llama-stack-entrypoint.sh
RUN mkdir -p /.llama /.cache && chmod -R g+rw /app /.llama /.cache
ENTRYPOINT ["/usr/local/bin/llama-stack-entrypoint.sh"]
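
As a usage sketch (not part of the Containerfile itself): the header comment shows the build invocation, and running the resulting image with the default entrypoint might look like the following. The image tag and the 8321 port mapping are assumptions taken from the examples elsewhere in this diff.

```bash
# Build the starter image, as shown in the Containerfile's header comment.
docker build . \
  -f containers/Containerfile \
  --build-arg DISTRO_NAME=starter \
  --tag llama-stack:starter

# Run it; the entrypoint script resolves to `llama stack run starter`.
# 8321 is the server port used by the run configurations in this diff (assumed default).
docker run --rm -p 8321:8321 llama-stack:starter
```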

View file

@ -51,8 +51,8 @@ device: cpu
You can access the HuggingFace trainer via the `starter` distribution: You can access the HuggingFace trainer via the `starter` distribution:
```bash ```bash
llama stack build --distro starter --image-type venv llama stack list-deps starter | xargs -L1 uv pip install
llama stack run ~/.llama/distributions/starter/starter-run.yaml llama stack run starter
``` ```
### Usage Example ### Usage Example

View file

@ -175,8 +175,7 @@ llama-stack-client benchmarks register \
**1. Start the Llama Stack API Server** **1. Start the Llama Stack API Server**
```bash ```bash
# Build and run a distribution (example: together) llama stack list-deps together | xargs -L1 uv pip install
llama stack build --distro together --image-type venv
llama stack run together llama stack run together
``` ```
@ -209,7 +208,7 @@ The playground works with any Llama Stack distribution. Popular options include:
<TabItem value="together" label="Together AI"> <TabItem value="together" label="Together AI">
```bash ```bash
llama stack build --distro together --image-type venv llama stack list-deps together | xargs -L1 uv pip install
llama stack run together llama stack run together
``` ```
@ -222,7 +221,7 @@ llama stack run together
<TabItem value="ollama" label="Ollama (Local)"> <TabItem value="ollama" label="Ollama (Local)">
```bash ```bash
llama stack build --distro ollama --image-type venv llama stack list-deps ollama | xargs -L1 uv pip install
llama stack run ollama llama stack run ollama
``` ```
@ -235,7 +234,7 @@ llama stack run ollama
<TabItem value="meta-reference" label="Meta Reference"> <TabItem value="meta-reference" label="Meta Reference">
```bash ```bash
llama stack build --distro meta-reference --image-type venv llama stack list-deps meta-reference | xargs -L1 uv pip install
llama stack run meta-reference llama stack run meta-reference
``` ```

View file

@ -10,358 +10,114 @@ import TabItem from '@theme/TabItem';
# Retrieval Augmented Generation (RAG) # Retrieval Augmented Generation (RAG)
RAG enables your applications to reference and recall information from previous interactions or external documents.
RAG enables your applications to reference and recall information from external documents. Llama Stack makes Agentic RAG available through OpenAI's Responses API.
## Quick Start
### 1. Start the Server
In one terminal, start the Llama Stack server:
```bash
llama stack list-deps starter | xargs -L1 uv pip install
llama stack run starter
```
### 2. Connect with OpenAI Client
In another terminal, use the standard OpenAI client with the Responses API:
```python
import io, requests
from openai import OpenAI
url = "https://www.paulgraham.com/greatwork.html"
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
# Create vector store - auto-detects default embedding model
vs = client.vector_stores.create()
response = requests.get(url)
pseudo_file = io.BytesIO(str(response.content).encode('utf-8'))
file_id = client.files.create(file=(url, pseudo_file, "text/html"), purpose="assistants").id
client.vector_stores.files.create(vector_store_id=vs.id, file_id=file_id)
resp = client.responses.create(
model="gpt-4o",
input="How do you do great work? Use the existing knowledge_search tool.",
tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
include=["file_search_call.results"],
)
print(resp.output[-1].content[-1].text)
```
Which should give output like:
```
Doing great work is about more than just hard work and ambition; it involves combining several elements:
1. **Pursue What Excites You**: Engage in projects that are both ambitious and exciting to you. It's important to work on something you have a natural aptitude for and a deep interest in.
2. **Explore and Discover**: Great work often feels like a blend of discovery and creation. Focus on seeing possibilities and let ideas take their natural shape, rather than just executing a plan.
3. **Be Bold Yet Flexible**: Take bold steps in your work without over-planning. An adaptable approach that evolves with new ideas can often lead to breakthroughs.
4. **Work on Your Own Projects**: Develop a habit of working on projects of your own choosing, as these often lead to great achievements. These should be projects you find exciting and that challenge you intellectually.
5. **Be Earnest and Authentic**: Approach your work with earnestness and authenticity. Trying to impress others with affectation can be counterproductive, as genuine effort and intellectual honesty lead to better work outcomes.
6. **Build a Supportive Environment**: Work alongside great colleagues who inspire you and enhance your work. Surrounding yourself with motivating individuals creates a fertile environment for great work.
7. **Maintain High Morale**: High morale significantly impacts your ability to do great work. Stay optimistic and protect your mental well-being to maintain progress and momentum.
8. **Balance**: While hard work is essential, overworking can lead to diminishing returns. Balance periods of intensive work with rest to sustain productivity over time.
This approach shows that great work is less about following a strict formula and more about aligning your interests, ambition, and environment to foster creativity and innovation.
```
## Architecture Overview ## Architecture Overview
Llama Stack organizes the APIs that enable RAG into three layers: Llama Stack provides OpenAI-compatible RAG capabilities through:
1. **Lower-Level APIs**: Deal with raw storage and retrieval. These include Vector IO, KeyValue IO (coming soon) and Relational IO (also coming soon) - **Vector Stores API**: OpenAI-compatible vector storage with automatic embedding model detection
2. **RAG Tool**: A first-class tool as part of the [Tools API](./tools) that allows you to ingest documents (from URLs, files, etc) with various chunking strategies and query them smartly - **Files API**: Document upload and processing using OpenAI's file format
3. **Agents API**: The top-level [Agents API](./agent) that allows you to create agents that can use the tools to answer questions, perform tasks, and more - **Responses API**: Enhanced chat completions with agentic tool calling via file search
![RAG System Architecture](/img/rag.png) ## Configuring Default Embedding Models
The RAG system uses lower-level storage for different types of data: To enable automatic vector store creation without specifying embedding models, configure a default embedding model in your run.yaml like so:
- **Vector IO**: For semantic search and retrieval
- **Key-Value and Relational IO**: For structured data storage
:::info[Future Storage Types] ```yaml
We may add more storage types like Graph IO in the future. vector_stores:
::: default_provider_id: faiss
default_embedding_model:
## Setting up Vector Databases provider_id: sentence-transformers
model_id: nomic-ai/nomic-embed-text-v1.5
For this guide, we will use [Ollama](https://ollama.com/) as the inference provider. Ollama is an LLM runtime that allows you to run Llama models locally.
Here's how to set up a vector database for RAG:
```python
# Create HTTP client
import os
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
# Register a vector database
vector_db_id = "my_documents"
response = client.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model="nomic-embed-text-v1.5",
embedding_dimension=768,
provider_id="faiss",
)
``` ```
## Document Ingestion With this configuration:
- `client.vector_stores.create()` works without requiring embedding model or provider parameters
- The system automatically uses the default vector store provider (`faiss`) when multiple providers are available
- The system automatically uses the default embedding model (`sentence-transformers/nomic-ai/nomic-embed-text-v1.5`) for any newly created vector store
- The `default_provider_id` specifies which vector storage backend to use
- The `default_embedding_model` specifies both the inference provider and model for embeddings
You can ingest documents into the vector database using two methods: directly inserting pre-chunked documents or using the RAG Tool. ## Vector Store Operations
### Direct Document Insertion ### Creating Vector Stores
<Tabs> You can create vector stores with automatic or explicit embedding model selection:
<TabItem value="basic" label="Basic Insertion">
```python ```python
# You can insert a pre-chunked document directly into the vector db # Automatic - uses default configured embedding model and vector store provider
chunks = [ vs = client.vector_stores.create()
{
"content": "Your document text here",
"mime_type": "text/plain",
"metadata": {
"document_id": "doc1",
"author": "Jane Doe",
},
},
]
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
```
</TabItem> # Explicit - specify embedding model and/or provider when you need specific ones
<TabItem value="embeddings" label="With Precomputed Embeddings"> vs = client.vector_stores.create(
extra_body={
If you decide to precompute embeddings for your documents, you can insert them directly into the vector database by including the embedding vectors in the chunk data. This is useful if you have a separate embedding service or if you want to customize the ingestion process. "provider_id": "faiss", # Optional: specify vector store provider
"embedding_model": "sentence-transformers/nomic-ai/nomic-embed-text-v1.5",
```python "embedding_dimension": 768 # Optional: will be auto-detected if not provided
chunks_with_embeddings = [ }
{
"content": "First chunk of text",
"mime_type": "text/plain",
"embedding": [0.1, 0.2, 0.3, ...], # Your precomputed embedding vector
"metadata": {"document_id": "doc1", "section": "introduction"},
},
{
"content": "Second chunk of text",
"mime_type": "text/plain",
"embedding": [0.2, 0.3, 0.4, ...], # Your precomputed embedding vector
"metadata": {"document_id": "doc1", "section": "methodology"},
},
]
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks_with_embeddings)
```
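If you generate the vectors yourself, a minimal sketch using the OpenAI-compatible embeddings endpoint might look like the following; it assumes your client exposes `client.embeddings.create` and that the model name matches the embedding model registered for this vector database:

```python
# Sketch: compute embeddings with the OpenAI-compatible endpoint, then insert them.
# "nomic-embed-text-v1.5" is assumed to be the registered embedding model.
texts = ["First chunk of text", "Second chunk of text"]

embedding_response = client.embeddings.create(
    model="nomic-embed-text-v1.5",
    input=texts,
)

chunks_with_embeddings = [
    {
        "content": text,
        "mime_type": "text/plain",
        "embedding": item.embedding,
        "metadata": {"document_id": "doc1"},
    }
    for text, item in zip(texts, embedding_response.data)
]

client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks_with_embeddings)
```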
:::warning[Embedding Dimensions]
When providing precomputed embeddings, ensure the embedding dimension matches the `embedding_dimension` specified when registering the vector database.
:::
</TabItem>
</Tabs>
### Document Retrieval
You can query the vector database to retrieve documents based on their embeddings.
```python
# You can then query for these chunks
chunks_response = client.vector_io.query(
vector_db_id=vector_db_id,
query="What do you know about..."
) )
``` ```
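The response contains the chunks that matched the query. A minimal sketch of consuming it, assuming the response exposes a `chunks` list shaped like the dictionaries inserted above:

```python
# Sketch: iterate over the retrieved chunks (attribute names assumed).
for chunk in chunks_response.chunks:
    print(chunk.metadata)
    print(chunk.content)
```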
## Using the RAG Tool
:::danger[Deprecation Notice]
The RAG Tool is being deprecated in favor of directly using the OpenAI-compatible Search API. We recommend migrating to the OpenAI APIs for better compatibility and future support.
:::
A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, and other sources, and automatically chunks them into smaller pieces. More examples for how to format a RAGDocument can be found in the [appendix](#more-ragdocument-examples).
### OpenAI API Integration & Migration
The RAG tool has been updated to use OpenAI-compatible APIs. This provides several benefits:
- **Files API Integration**: Documents are now uploaded using OpenAI's file upload endpoints
- **Vector Stores API**: Vector storage operations use OpenAI's vector store format with configurable chunking strategies
- **Error Resilience**: When processing multiple documents, individual failures are logged but don't crash the operation. Failed documents are skipped while successful ones continue processing.
### Migration Path
We recommend migrating to the OpenAI-compatible Search API for:
1. **Better OpenAI Ecosystem Integration**: Direct compatibility with OpenAI tools and workflows including the Responses API
2. **Future-Proof**: Continued support and feature development
3. **Full OpenAI Compatibility**: Vector Stores, Files, and Search APIs are fully compatible with OpenAI's Responses API
The OpenAI APIs are used under the hood, so you can continue to use your existing RAG Tool code with minimal changes. However, we recommend updating your code to use the new OpenAI-compatible APIs for better long-term support. If any documents fail to process, they will be logged in the response but will not cause the entire operation to fail.
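As a rough migration sketch, the OpenAI-compatible flow uploads a document through the Files API, attaches it to a vector store, and searches that store directly; the method names follow the OpenAI-style client surface and may vary slightly with your client version:

```python
# Sketch of the OpenAI-compatible flow (Files + Vector Stores + Search).
# The file name and query below are illustrative.
vector_store = client.vector_stores.create(name="my_documents")

with open("memory_optimizations.rst", "rb") as f:
    uploaded = client.files.create(file=f, purpose="assistants")

client.vector_stores.files.create(
    vector_store_id=vector_store.id,
    file_id=uploaded.id,
)

results = client.vector_stores.search(
    vector_store_id=vector_store.id,
    query="How do I optimize memory usage?",
)
```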
### RAG Tool Example
```python
from llama_stack_client import RAGDocument
urls = ["memory_optimizations.rst", "chat.rst", "llama3.rst"]
documents = [
RAGDocument(
document_id=f"num-{i}",
content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
mime_type="text/plain",
metadata={},
)
for i, url in enumerate(urls)
]
client.tool_runtime.rag_tool.insert(
documents=documents,
vector_db_id=vector_db_id,
chunk_size_in_tokens=512,
)
# Query documents
results = client.tool_runtime.rag_tool.query(
vector_db_ids=[vector_db_id],
content="What do you know about...",
)
```
### Custom Context Configuration
You can configure how the RAG tool adds metadata to the context if you find it useful for your application:
```python
# Query documents with custom template
results = client.tool_runtime.rag_tool.query(
vector_db_ids=[vector_db_id],
content="What do you know about...",
query_config={
"chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
},
)
```
## Building RAG-Enhanced Agents
One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
### Agent with Knowledge Search
```python
from llama_stack_client import Agent
# Create agent with memory
agent = Agent(
client,
model="meta-llama/Llama-3.3-70B-Instruct",
instructions="You are a helpful assistant",
tools=[
{
"name": "builtin::rag/knowledge_search",
"args": {
"vector_db_ids": [vector_db_id],
# Defaults
"query_config": {
"chunk_size_in_tokens": 512,
"chunk_overlap_in_tokens": 0,
"chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
},
},
}
],
)
session_id = agent.create_session("rag_session")
# Ask questions about documents in the vector db, and the agent will query the db to answer the question.
response = agent.create_turn(
messages=[{"role": "user", "content": "How to optimize memory in PyTorch?"}],
session_id=session_id,
)
```
:::tip[Agent Instructions]
The `instructions` field in the `AgentConfig` can be used to guide the agent's behavior. It is important to experiment with different instructions to see what works best for your use case.
:::
### Document-Aware Conversations
You can also pass documents along with the user's message and ask questions about them:
```python
# Initial document ingestion
response = agent.create_turn(
messages=[
{"role": "user", "content": "I am providing some documents for reference."}
],
documents=[
{
"content": "https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/memory_optimizations.rst",
"mime_type": "text/plain",
}
],
session_id=session_id,
)
# Query with RAG
response = agent.create_turn(
messages=[{"role": "user", "content": "What are the key topics in the documents?"}],
session_id=session_id,
)
```
### Viewing Agent Responses
You can print the response with the following:
```python
from llama_stack_client import AgentEventLogger
for log in AgentEventLogger().log(response):
log.print()
```
## Vector Database Management
### Unregistering Vector DBs
If you need to clean up and unregister vector databases, you can do so as follows:
<Tabs>
<TabItem value="single" label="Single Database">
```python
# Unregister a specified vector database
vector_db_id = "my_vector_db_id"
print(f"Unregistering vector database: {vector_db_id}")
client.vector_dbs.unregister(vector_db_id=vector_db_id)
```
</TabItem>
<TabItem value="all" label="All Databases">
```python
# Unregister all vector databases
for vector_db_id in client.vector_dbs.list():
print(f"Unregistering vector database: {vector_db_id.identifier}")
client.vector_dbs.unregister(vector_db_id=vector_db_id.identifier)
```
</TabItem>
</Tabs>
## Best Practices
### 🎯 **Document Chunking**
- Use appropriate chunk sizes (512 tokens is often a good starting point)
- Consider overlap between chunks for better context preservation
- Experiment with different chunking strategies for your content type
### 🔍 **Embedding Strategy**
- Choose embedding models that match your domain
- Consider the trade-off between embedding dimension and performance
- Test different embedding models for your specific use case
### 📊 **Query Optimization**
- Use specific, well-formed queries for better retrieval
- Experiment with different search strategies
- Consider hybrid approaches (keyword + semantic search)
### 🛡️ **Error Handling**
- Implement proper error handling for failed document processing
- Monitor ingestion success rates
- Have fallback strategies for retrieval failures
## Appendix
### More RAGDocument Examples
Here are various ways to create RAGDocument objects for different content types:
```python
from llama_stack_client import RAGDocument
import base64
import requests
# File URI
RAGDocument(document_id="num-0", content={"uri": "file://path/to/file"})
# Plain text
RAGDocument(document_id="num-1", content="plain text")
# Explicit text input
RAGDocument(
document_id="num-2",
content={
"type": "text",
"text": "plain text input",
}, # for inputs that should be treated as text explicitly
)
# Image from URL
RAGDocument(
document_id="num-3",
content={
"type": "image",
"image": {"url": {"uri": "https://mywebsite.com/image.jpg"}},
},
)
# Base64 encoded image
B64_ENCODED_IMAGE = base64.b64encode(
requests.get(
"https://raw.githubusercontent.com/meta-llama/llama-stack/refs/heads/main/docs/_static/llama-stack.png"
).content
)
RAGDocument(
document_id="num-4",
content={"type": "image", "image": {"data": B64_ENCODED_IMAGE}},
)
```
For more strongly typed interaction use the typed dicts found [here](https://github.com/meta-llama/llama-stack-client-python/blob/38cd91c9e396f2be0bec1ee96a19771582ba6f17/src/llama_stack_client/types/shared_params/document.py).

@ -10,58 +10,8 @@ import TabItem from '@theme/TabItem';
# Telemetry # Telemetry
The Llama Stack telemetry system provides comprehensive tracing, metrics, and logging capabilities. It supports multiple sink types including OpenTelemetry, SQLite, and Console output for complete observability of your AI applications. The Llama Stack uses OpenTelemetry to provide comprehensive tracing, metrics, and logging capabilities.
## Event Types
The telemetry system supports three main types of events:
<Tabs>
<TabItem value="unstructured" label="Unstructured Logs">
Free-form log messages with severity levels for general application logging:
```python
unstructured_log_event = UnstructuredLogEvent(
message="This is a log message",
severity=LogSeverity.INFO
)
```
</TabItem>
<TabItem value="metrics" label="Metric Events">
Numerical measurements with units for tracking performance and usage:
```python
metric_event = MetricEvent(
metric="my_metric",
value=10,
unit="count"
)
```
</TabItem>
<TabItem value="structured" label="Structured Logs">
System events like span start/end that provide structured operation tracking:
```python
structured_log_event = SpanStartPayload(
name="my_span",
parent_span_id="parent_span_id"
)
```
</TabItem>
</Tabs>
## Spans and Traces
- **Spans**: Represent individual operations with timing information and hierarchical relationships
- **Traces**: Collections of related spans that form a complete request flow across your application
This hierarchical structure allows you to understand the complete execution path of requests through your Llama Stack application.
## Automatic Metrics Generation ## Automatic Metrics Generation
@ -129,21 +79,6 @@ Send events to an OpenTelemetry Collector for integration with observability pla
- Compatible with all OpenTelemetry collectors - Compatible with all OpenTelemetry collectors
- Supports both traces and metrics - Supports both traces and metrics
</TabItem>
<TabItem value="sqlite" label="SQLite">
Store events in a local SQLite database for direct querying:
**Use Cases:**
- Local development and debugging
- Custom analytics and reporting
- Offline analysis of application behavior
**Features:**
- Direct SQL querying capabilities
- Persistent local storage
- No external dependencies
</TabItem> </TabItem>
<TabItem value="console" label="Console"> <TabItem value="console" label="Console">
@ -174,9 +109,8 @@ telemetry:
provider_type: inline::meta-reference provider_type: inline::meta-reference
config: config:
service_name: "llama-stack-service" service_name: "llama-stack-service"
sinks: ['console', 'sqlite', 'otel_trace', 'otel_metric'] sinks: ['console', 'otel_trace', 'otel_metric']
otel_exporter_otlp_endpoint: "http://localhost:4318" otel_exporter_otlp_endpoint: "http://localhost:4318"
sqlite_db_path: "/path/to/telemetry.db"
``` ```
### Environment Variables ### Environment Variables
@ -185,7 +119,7 @@ Configure telemetry behavior using environment variables:
- **`OTEL_EXPORTER_OTLP_ENDPOINT`**: OpenTelemetry Collector endpoint (default: `http://localhost:4318`) - **`OTEL_EXPORTER_OTLP_ENDPOINT`**: OpenTelemetry Collector endpoint (default: `http://localhost:4318`)
- **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string) - **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string)
- **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `console,sqlite`) - **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `[]`)
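For example, a shell setup that sends traces and metrics to a local collector (values taken from the defaults above; adjust for your environment) might be:

```bash
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
export OTEL_SERVICE_NAME=llama-stack-service
export TELEMETRY_SINKS=otel_trace,otel_metric
```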
### Quick Setup: Complete Telemetry Stack ### Quick Setup: Complete Telemetry Stack
@ -248,37 +182,10 @@ Forward metrics to other observability systems:
</TabItem> </TabItem>
</Tabs> </Tabs>
## SQLite Querying
The `sqlite` sink allows you to query traces without an external system. This is particularly useful for development and custom analytics.
### Example Queries
```sql
-- Query recent traces
SELECT * FROM traces WHERE timestamp > datetime('now', '-1 hour');
-- Analyze span durations
SELECT name, AVG(duration_ms) as avg_duration
FROM spans
GROUP BY name
ORDER BY avg_duration DESC;
-- Find slow operations
SELECT * FROM spans
WHERE duration_ms > 1000
ORDER BY duration_ms DESC;
```
:::tip[Advanced Analytics]
Refer to the [Getting Started notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) for more examples on querying traces and spans programmatically.
:::
## Best Practices ## Best Practices
### 🔍 **Monitoring Strategy** ### 🔍 **Monitoring Strategy**
- Use OpenTelemetry for production environments - Use OpenTelemetry for production environments
- Combine multiple sinks for development (console + SQLite)
- Set up alerts on key metrics like token usage and error rates - Set up alerts on key metrics like token usage and error rates
### 📊 **Metrics Analysis** ### 📊 **Metrics Analysis**
@ -293,45 +200,8 @@ Refer to the [Getting Started notebook](https://github.com/meta-llama/llama-stac
### 🔧 **Configuration Management** ### 🔧 **Configuration Management**
- Use environment variables for flexible deployment - Use environment variables for flexible deployment
- Configure appropriate retention policies for SQLite
- Ensure proper network access to OpenTelemetry collectors - Ensure proper network access to OpenTelemetry collectors
## Integration Examples
### Basic Telemetry Setup
```python
from llama_stack_client import LlamaStackClient
# Client with telemetry headers
client = LlamaStackClient(
base_url="http://localhost:8000",
extra_headers={
"X-Telemetry-Service": "my-ai-app",
"X-Telemetry-Version": "1.0.0"
}
)
# All API calls will be automatically traced
response = client.chat.completions.create(
model="meta-llama/Llama-3.2-3B-Instruct",
messages=[{"role": "user", "content": "Hello!"}]
)
```
### Custom Telemetry Context
```python
# Add custom span attributes for better tracking
from opentelemetry import trace

tracer = trace.get_tracer(__name__)

with tracer.start_as_current_span("custom_operation") as span:
span.set_attribute("user_id", "user123")
span.set_attribute("operation_type", "chat_completion")
response = client.chat.completions.create(
model="meta-llama/Llama-3.2-3B-Instruct",
messages=[{"role": "user", "content": "Hello!"}]
)
```
## Related Resources ## Related Resources

@ -62,6 +62,10 @@ The new `/v2` API must be introduced alongside the existing `/v1` API and run in
When a `/v2` API is introduced, a clear and generous deprecation policy for the `/v1` API must be published simultaneously. This policy must outline the timeline for the eventual removal of the `/v1` API, giving users ample time to migrate. When a `/v2` API is introduced, a clear and generous deprecation policy for the `/v1` API must be published simultaneously. This policy must outline the timeline for the eventual removal of the `/v1` API, giving users ample time to migrate.
### Deprecated APIs
Deprecated APIs are those that are no longer actively maintained or supported. They are marked with the flag `deprecated = True` in the OpenAPI spec and will be removed in a future release.
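As an illustration, a deprecated endpoint shows up in the generated OpenAPI document roughly like this (the path and summary are illustrative, not taken from the actual spec):

```yaml
paths:
  /v1/example/old-endpoint:
    post:
      deprecated: true
      summary: Old endpoint kept for backwards compatibility
```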
### API Stability vs. Provider Stability ### API Stability vs. Provider Stability
The leveling introduced in this document relates to the stability of the API and not specifically the providers within the API. The leveling introduced in this document relates to the stability of the API and not specifically the providers within the API.

@ -158,17 +158,16 @@ under the LICENSE file in the root directory of this source tree.
Some tips about common tasks you work on while contributing to Llama Stack: Some tips about common tasks you work on while contributing to Llama Stack:
### Using `llama stack build` ### Setup for development
Building a stack image will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
Example:
```bash ```bash
cd work/
git clone https://github.com/meta-llama/llama-stack.git git clone https://github.com/meta-llama/llama-stack.git
git clone https://github.com/meta-llama/llama-stack-client-python.git
cd llama-stack cd llama-stack
LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --distro <...> uv run llama stack list-deps <distro-name> | xargs -L1 uv pip install
# (Optional) If you are developing the llama-stack-client-python package, you can add it as an editable package.
git clone https://github.com/meta-llama/llama-stack-client-python.git
uv add --editable ../llama-stack-client-python
``` ```
### Updating distribution configurations ### Updating distribution configurations

@ -67,7 +67,7 @@ def get_base_url(self) -> str:
## Testing the Provider ## Testing the Provider
Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --distro together`. Before running tests, you must have required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, install its dependencies with `llama stack list-deps together | xargs -L1 uv pip install`.
### 1. Integration Testing ### 1. Integration Testing

@ -5,225 +5,80 @@ sidebar_label: Build your own Distribution
sidebar_position: 3 sidebar_position: 3
--- ---
This guide will walk you through the steps to get started with building a Llama Stack distribution from scratch with your choice of API providers. This guide walks you through inspecting existing distributions, customizing their configuration, and building runnable artifacts for your own deployment.
### Explore existing distributions
### Setting your log level All first-party distributions live under `llama_stack/distributions/`. Each directory contains:
In order to specify the proper logging level, users can apply the following environment variable `LLAMA_STACK_LOGGING` with the following format: - `build.yaml`: the distribution specification (providers, additional dependencies, optional external provider directories).
- `run.yaml`: sample run configuration (when provided).
- Documentation fragments that power this site.
`LLAMA_STACK_LOGGING=server=debug;core=info` Browse that folder to understand available providers and copy a distribution to use as a starting point. When creating a new stack, duplicate an existing directory, rename it, and adjust the `build.yaml` file to match your requirements.
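For example, a minimal way to seed a new distribution from the starter template might be (the target directory name is illustrative):

```bash
# Copy an existing distribution as a starting point, then edit its build.yaml
cp -R llama_stack/distributions/starter llama_stack/distributions/my-distro
$EDITOR llama_stack/distributions/my-distro/build.yaml
```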
Where each category in the following list:
- all
- core
- server
- router
- inference
- agents
- safety
- eval
- tools
- client
Can be set to any of the following log levels:
- debug
- info
- warning
- error
- critical
The default global log level is `info`. `all` sets the log level for all components.
A user can also set `LLAMA_STACK_LOG_FILE` which will pipe the logs to the specified path as well as to the terminal. An example would be: `export LLAMA_STACK_LOG_FILE=server.log`
### Llama Stack Build
In order to build your own distribution, we recommend you clone the `llama-stack` repository.
```
git clone git@github.com:meta-llama/llama-stack.git
cd llama-stack
pip install -e .
```
Use the CLI to build your distribution.
The main points to consider are:
1. **Image Type** - Do you want a venv environment or a Container (eg. Docker)
2. **Template** - Do you want to use a template to build your distribution? or start from scratch ?
3. **Config** - Do you want to use a pre-existing config file to build your distribution?
```
llama stack build -h
usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--distro DISTRIBUTION] [--list-distros] [--image-type {container,venv}] [--image-name IMAGE_NAME] [--print-deps-only]
[--run] [--providers PROVIDERS]
Build a Llama stack container
options:
-h, --help show this help message and exit
--config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack.cores/**/build.yaml. If this argument is not provided, you will be prompted to
enter information interactively (default: None)
--template TEMPLATE (deprecated) Name of the example template config to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default:
None)
--distro DISTRIBUTION, --distribution DISTRIBUTION
Name of the distribution to use for build. You may use `llama stack build --list-distros` to check out the available distributions (default: None)
--list-distros, --list-distributions
Show the available distributions for building a Llama Stack distribution (default: False)
--image-type {container,venv}
Image Type to use for the build. If not specified, will use the image type from the template config. (default: None)
--image-name IMAGE_NAME
[for image-type=container|venv] Name of the virtual environment to use for the build. If not specified, currently active environment will be used if found. (default:
None)
--print-deps-only Print the dependencies for the stack only, without building the stack (default: False)
--run Run the stack after building using the same image type, name, and other applicable arguments (default: False)
--providers PROVIDERS
Build a config for a list of providers and only those providers. This list is formatted like: api1=provider1,api2=provider2. Where there can be multiple providers per
API. (default: None)
```
After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.
import Tabs from '@theme/Tabs'; import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem'; import TabItem from '@theme/TabItem';
<Tabs> <Tabs>
<TabItem value="template" label="Building from a template"> <TabItem value="container" label="Building a container">
To build from alternative API providers, we provide distribution templates for users to get started building a distribution backed by different providers.
The following command will allow you to see the available templates and their corresponding providers. Use the Containerfile at `containers/Containerfile`, which installs `llama-stack`, resolves distribution dependencies via `llama stack list-deps`, and sets the entrypoint to `llama stack run`.
```
llama stack build --list-templates ```bash
docker build . \
-f containers/Containerfile \
--build-arg DISTRO_NAME=starter \
--tag llama-stack:starter
``` ```
``` Handy build arguments:
------------------------------+-----------------------------------------------------------------------------+
| Template Name | Description |
+------------------------------+-----------------------------------------------------------------------------+
| watsonx | Use watsonx for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| vllm-gpu | Use a built-in vLLM engine for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| together | Use Together.AI for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| tgi | Use (an external) TGI server for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| starter | Quick start template for running Llama Stack with several popular providers |
+------------------------------+-----------------------------------------------------------------------------+
| sambanova | Use SambaNova for running LLM inference and safety |
+------------------------------+-----------------------------------------------------------------------------+
| remote-vllm | Use (an external) vLLM server for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| postgres-demo | Quick start template for running Llama Stack with several popular providers |
+------------------------------+-----------------------------------------------------------------------------+
| passthrough | Use Passthrough hosted llama-stack endpoint for LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| open-benchmark | Distribution for running open benchmarks |
+------------------------------+-----------------------------------------------------------------------------+
| ollama | Use (an external) Ollama server for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| nvidia | Use NVIDIA NIM for running LLM inference, evaluation and safety |
+------------------------------+-----------------------------------------------------------------------------+
| meta-reference-gpu | Use Meta Reference for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| llama_api | Distribution for running e2e tests in CI |
+------------------------------+-----------------------------------------------------------------------------+
| hf-serverless | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| hf-endpoint | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| groq | Use Groq for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| fireworks | Use Fireworks.AI for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| experimental-post-training | Experimental template for post training |
+------------------------------+-----------------------------------------------------------------------------+
| dell | Dell's distribution of Llama Stack. TGI inference via Dell's custom |
| | container |
+------------------------------+-----------------------------------------------------------------------------+
| ci-tests | Distribution for running e2e tests in CI |
+------------------------------+-----------------------------------------------------------------------------+
| cerebras | Use Cerebras for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| bedrock | Use AWS Bedrock for running LLM inference and safety |
+------------------------------+-----------------------------------------------------------------------------+
```
You may then pick a template to build your distribution with providers fitted to your liking. - `DISTRO_NAME`: distribution directory name (defaults to `starter`).
- `RUN_CONFIG_PATH`: absolute path inside the build context for a run config that should be baked into the image (e.g. `/workspace/run.yaml`).
- `INSTALL_MODE=editable`: install the repository copied into `/workspace` with `uv pip install -e`. Pair it with `--build-arg LLAMA_STACK_DIR=/workspace`.
- `LLAMA_STACK_CLIENT_DIR`: optional editable install of the Python client.
- `PYPI_VERSION` / `TEST_PYPI_VERSION`: pin specific releases when not using editable installs.
- `KEEP_WORKSPACE=1`: retain `/workspace` in the final image if you need to access additional files (such as sample configs or provider bundles).
For example, to build a distribution with TGI as the inference provider, you can run: Make sure any custom `build.yaml`, run configs, or provider directories you reference are included in the Docker build context so the Containerfile can read them.
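Putting several of these arguments together, an editable-install build that also bakes in a run config might look like this (it assumes your checkout and `run.yaml` are inside the Docker build context):

```bash
docker build . \
  -f containers/Containerfile \
  --build-arg DISTRO_NAME=starter \
  --build-arg INSTALL_MODE=editable \
  --build-arg LLAMA_STACK_DIR=/workspace \
  --build-arg RUN_CONFIG_PATH=/workspace/run.yaml \
  --tag llama-stack:starter-dev
```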
```
$ llama stack build --distro starter
...
You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml`
```
```{tip}
The generated `run.yaml` file is a starting point for your configuration. For comprehensive guidance on customizing it for your specific needs, infrastructure, and deployment scenarios, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
```
</TabItem> </TabItem>
<TabItem value="scratch" label="Building from Scratch"> <TabItem value="external" label="Building with external providers">
If the provided templates do not fit your use case, you could start off by running `llama stack build`, which launches an interactive wizard where you will be prompted to enter build configurations. External providers live outside the main repository but can be bundled by pointing `external_providers_dir` to a directory that contains your provider packages.
It would be best to start with a template and understand the structure of the config file and the various concepts (APIs, providers, resources, etc.) before starting from scratch. 1. Copy providers into the build context, for example `cp -R path/to/providers providers.d`.
``` 2. Update `build.yaml` with the directory and provider entries.
llama stack build 3. Adjust run configs to use the in-container path (usually `/.llama/providers.d`). Pass `--build-arg RUN_CONFIG_PATH=/workspace/run.yaml` if you want to bake the config.
> Enter a name for your Llama Stack (e.g. my-local-stack): my-stack Example `build.yaml` excerpt for a custom Ollama provider:
> Enter the image type you want your Llama Stack to be built as (container or venv): venv
Llama Stack is composed of several APIs working together. Let's select
the provider types (implementations) you want to use for these APIs.
Tip: use <TAB> to see options for the providers.
> Enter provider for API inference: inline::meta-reference
> Enter provider for API safety: inline::llama-guard
> Enter provider for API agents: inline::meta-reference
> Enter provider for API memory: inline::faiss
> Enter provider for API datasetio: inline::meta-reference
> Enter provider for API scoring: inline::meta-reference
> Enter provider for API eval: inline::meta-reference
> Enter provider for API telemetry: inline::meta-reference
> (Optional) Enter a short description for your Llama Stack:
You can now edit ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml`
```
</TabItem>
<TabItem value="config" label="Building from a pre-existing build config file">
- In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command.
- The config file will be of contents like the ones in `llama_stack/distributions/*build.yaml`.
```
llama stack build --config llama_stack/distributions/starter/build.yaml
```
</TabItem>
<TabItem value="external" label="Building with External Providers">
Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently or use community-provided providers.
To build a distribution with external providers, you need to:
1. Configure the `external_providers_dir` in your build configuration file:
```yaml ```yaml
# Example my-external-stack.yaml with external providers
version: '2'
distribution_spec: distribution_spec:
description: Custom distro for CI tests
providers: providers:
inference: inference:
- remote::custom_ollama - remote::custom_ollama
# Add more providers as needed external_providers_dir: /workspace/providers.d
image_type: container ```
image_name: ci-test
# Path to external provider implementations Inside `providers.d/custom_ollama/provider.py`, define `get_provider_spec()` so the CLI can discover dependencies:
external_providers_dir: ~/.llama/providers.d
```python
from llama_stack.providers.datatypes import ProviderSpec
def get_provider_spec() -> ProviderSpec:
return ProviderSpec(
provider_type="remote::custom_ollama",
module="llama_stack_ollama_provider",
config_class="llama_stack_ollama_provider.config.OllamaImplConfig",
pip_packages=[
"ollama",
"aiohttp",
"llama-stack-provider-ollama",
],
)
``` ```
Here's an example for a custom Ollama provider: Here's an example for a custom Ollama provider:
@ -232,9 +87,9 @@ Here's an example for a custom Ollama provider:
adapter: adapter:
adapter_type: custom_ollama adapter_type: custom_ollama
pip_packages: pip_packages:
- ollama - ollama
- aiohttp - aiohttp
- llama-stack-provider-ollama # This is the provider package - llama-stack-provider-ollama # This is the provider package
config_class: llama_stack_ollama_provider.config.OllamaImplConfig config_class: llama_stack_ollama_provider.config.OllamaImplConfig
module: llama_stack_ollama_provider module: llama_stack_ollama_provider
api_dependencies: [] api_dependencies: []
@ -245,53 +100,22 @@ The `pip_packages` section lists the Python packages required by the provider, a
provider package itself. The package must be available on PyPI or can be provided from a local provider package itself. The package must be available on PyPI or can be provided from a local
directory or a git repository (git must be installed on the build environment). directory or a git repository (git must be installed on the build environment).
2. Build your distribution using the config file: For deeper guidance, see the [External Providers documentation](../providers/external/).
```
llama stack build --config my-external-stack.yaml
```
For more information on external providers, including directory structure, provider types, and implementation requirements, see the [External Providers documentation](../providers/external/).
</TabItem> </TabItem>
<TabItem value="container" label="Building Container"> </Tabs>
:::tip Podman Alternative ### Run your stack server
Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podman` in your environment to use Podman.
:::
To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type. After building the image, launch it directly with Docker or Podman—the entrypoint calls `llama stack run` using the baked distribution or the bundled run config:
```
llama stack build --distro starter --image-type container
```
```
$ llama stack build --distro starter --image-type container
...
Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/ContainerfileFROM python:3.10-slim
...
```
You can now edit ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml and run `llama stack run ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml`
```
Now set some environment variables for the inference model ID and Llama Stack Port and create a local directory to mount into the container's file system.
```bash ```bash
export INFERENCE_MODEL="llama3.2:3b"
export LLAMA_STACK_PORT=8321
mkdir -p ~/.llama
```
After this step is successful, you should be able to find the built container image and test it with the below Docker command:
```
docker run -d \ docker run -d \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \ -v ~/.llama:/root/.llama \
-e INFERENCE_MODEL=$INFERENCE_MODEL \ -e INFERENCE_MODEL=$INFERENCE_MODEL \
-e OLLAMA_URL=http://host.docker.internal:11434 \ -e OLLAMA_URL=http://host.docker.internal:11434 \
localhost/distribution-ollama:dev \ llama-stack:starter \
--port $LLAMA_STACK_PORT --port $LLAMA_STACK_PORT
``` ```
@ -311,131 +135,14 @@ Here are the docker flags and their uses:
* `--port $LLAMA_STACK_PORT`: Port number for the server to listen on * `--port $LLAMA_STACK_PORT`: Port number for the server to listen on
</TabItem>
</Tabs>
### Running your Stack server If you prepared a custom run config, mount it into the container and reference it explicitly:
Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end by the `llama stack build` step.
```bash
docker run \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v $(pwd)/run.yaml:/app/run.yaml \
llama-stack:starter \
/app/run.yaml
``` ```
llama stack run -h
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME]
[--image-type {venv}] [--enable-ui]
[config | distro]
Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
positional arguments:
config | distro Path to config file to use for the run or name of known distro (`llama stack list` for a list). (default: None)
options:
-h, --help show this help message and exit
--port PORT Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321)
--image-name IMAGE_NAME
[DEPRECATED] This flag is no longer supported. Please activate your virtual environment before running. (default: None)
--image-type {venv}
[DEPRECATED] This flag is no longer supported. Please activate your virtual environment before running. (default: None)
--enable-ui Start the UI server (default: False)
```
**Note:** Container images built with `llama stack build --image-type container` cannot be run using `llama stack run`. Instead, they must be run directly using Docker or Podman commands as shown in the container building section above.
```
# Start using template name
llama stack run tgi
# Start using config file
llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
```
```
$ llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
Serving API inspect
GET /health
GET /providers/list
GET /routes/list
Serving API inference
POST /inference/chat_completion
POST /inference/completion
POST /inference/embeddings
...
Serving API agents
POST /agents/create
POST /agents/session/create
POST /agents/turn/create
POST /agents/delete
POST /agents/session/delete
POST /agents/session/get
POST /agents/step/get
POST /agents/turn/get
Listening on ['::', '0.0.0.0']:8321
INFO: Started server process [2935911]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://['::', '0.0.0.0']:8321 (Press CTRL+C to quit)
INFO: 2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200 OK
```
### Listing Distributions
Using the list command, you can view all existing Llama Stack distributions, including stacks built from templates, from scratch, or using custom configuration files.
```
llama stack list -h
usage: llama stack list [-h]
list the build stacks
options:
-h, --help show this help message and exit
```
Example Usage
```
llama stack list
```
```
------------------------------+-----------------------------------------------------------------+--------------+------------+
| Stack Name | Path | Build Config | Run Config |
+------------------------------+-----------------------------------------------------------------------------+--------------+
| together | ~/.llama/distributions/together | Yes | No |
+------------------------------+-----------------------------------------------------------------------------+--------------+
| bedrock | ~/.llama/distributions/bedrock | Yes | No |
+------------------------------+-----------------------------------------------------------------------------+--------------+
| starter | ~/.llama/distributions/starter | Yes | Yes |
+------------------------------+-----------------------------------------------------------------------------+--------------+
| remote-vllm | ~/.llama/distributions/remote-vllm | Yes | Yes |
+------------------------------+-----------------------------------------------------------------------------+--------------+
```
### Removing a Distribution
Use the remove command to delete a distribution you've previously built.
```
llama stack rm -h
usage: llama stack rm [-h] [--all] [name]
Remove the build stack
positional arguments:
name Name of the stack to delete (default: None)
options:
-h, --help show this help message and exit
--all, -a Delete all stacks (use with caution) (default: False)
```
Example
```
llama stack rm llamastack-test
```
To keep your environment organized and avoid clutter, consider using `llama stack list` to review old or unused distributions and `llama stack rm <name>` to delete them when they're no longer needed.
### Troubleshooting
If you encounter any issues, ask questions on our Discord, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file a new issue.

@ -44,18 +44,32 @@ providers:
- provider_id: meta-reference - provider_id: meta-reference
provider_type: inline::meta-reference provider_type: inline::meta-reference
config: config:
persistence_store: persistence:
type: sqlite agent_state:
namespace: null backend: kv_default
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/agents_store.db namespace: agents
responses:
backend: sql_default
table_name: responses
telemetry: telemetry:
- provider_id: meta-reference - provider_id: meta-reference
provider_type: inline::meta-reference provider_type: inline::meta-reference
config: {} config: {}
metadata_store: storage:
namespace: null backends:
type: sqlite kv_default:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/registry.db type: kv_sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/kvstore.db
sql_default:
type: sql_sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/sqlstore.db
references:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
models: models:
- metadata: {} - metadata: {}
model_id: ${env.INFERENCE_MODEL} model_id: ${env.INFERENCE_MODEL}

@ -12,7 +12,7 @@ This avoids the overhead of setting up a server.
```bash ```bash
# setup # setup
uv pip install llama-stack uv pip install llama-stack
llama stack build --distro starter --image-type venv llama stack list-deps starter | xargs -L1 uv pip install
``` ```
```python ```python

@ -1,56 +1,155 @@
apiVersion: v1 apiVersion: v1
data: data:
stack_run_config.yaml: "version: '2'\nimage_name: kubernetes-demo\napis:\n- agents\n- stack_run_config.yaml: |
inference\n- files\n- safety\n- telemetry\n- tool_runtime\n- vector_io\nproviders:\n version: '2'
\ inference:\n - provider_id: vllm-inference\n provider_type: remote::vllm\n image_name: kubernetes-demo
\ config:\n url: ${env.VLLM_URL:=http://localhost:8000/v1}\n max_tokens: apis:
${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n tls_verify: - agents
${env.VLLM_TLS_VERIFY:=true}\n - provider_id: vllm-safety\n provider_type: - inference
remote::vllm\n config:\n url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}\n - files
\ max_tokens: ${env.VLLM_MAX_TOKENS:=4096}\n api_token: ${env.VLLM_API_TOKEN:=fake}\n - safety
\ tls_verify: ${env.VLLM_TLS_VERIFY:=true}\n - provider_id: sentence-transformers\n - telemetry
\ provider_type: inline::sentence-transformers\n config: {}\n vector_io:\n - tool_runtime
\ - provider_id: ${env.ENABLE_CHROMADB:+chromadb}\n provider_type: remote::chromadb\n - vector_io
\ config:\n url: ${env.CHROMADB_URL:=}\n kvstore:\n type: postgres\n providers:
\ host: ${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n inference:
\ db: ${env.POSTGRES_DB:=llamastack}\n user: ${env.POSTGRES_USER:=llamastack}\n - provider_id: vllm-inference
\ password: ${env.POSTGRES_PASSWORD:=llamastack}\n files:\n - provider_id: provider_type: remote::vllm
meta-reference-files\n provider_type: inline::localfs\n config:\n storage_dir: config:
${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}\n metadata_store:\n url: ${env.VLLM_URL:=http://localhost:8000/v1}
\ type: sqlite\n db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
\ \n safety:\n - provider_id: llama-guard\n provider_type: inline::llama-guard\n api_token: ${env.VLLM_API_TOKEN:=fake}
\ config:\n excluded_categories: []\n agents:\n - provider_id: meta-reference\n tls_verify: ${env.VLLM_TLS_VERIFY:=true}
\ provider_type: inline::meta-reference\n config:\n persistence_store:\n - provider_id: vllm-safety
\ type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n port: provider_type: remote::vllm
${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user: config:
${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
\ responses_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
\ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n api_token: ${env.VLLM_API_TOKEN:=fake}
\ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n tls_verify: ${env.VLLM_TLS_VERIFY:=true}
\ telemetry:\n - provider_id: meta-reference\n provider_type: inline::meta-reference\n - provider_id: sentence-transformers
\ config:\n service_name: \"${env.OTEL_SERVICE_NAME:=\\u200B}\"\n sinks: provider_type: inline::sentence-transformers
${env.TELEMETRY_SINKS:=console}\n tool_runtime:\n - provider_id: brave-search\n config: {}
\ provider_type: remote::brave-search\n config:\n api_key: ${env.BRAVE_SEARCH_API_KEY:+}\n vector_io:
\ max_results: 3\n - provider_id: tavily-search\n provider_type: remote::tavily-search\n - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
\ config:\n api_key: ${env.TAVILY_SEARCH_API_KEY:+}\n max_results: provider_type: remote::chromadb
3\n - provider_id: rag-runtime\n provider_type: inline::rag-runtime\n config: config:
{}\n - provider_id: model-context-protocol\n provider_type: remote::model-context-protocol\n url: ${env.CHROMADB_URL:=}
\ config: {}\nmetadata_store:\n type: postgres\n host: ${env.POSTGRES_HOST:=localhost}\n kvstore:
\ port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n user: type: postgres
${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\n host: ${env.POSTGRES_HOST:=localhost}
\ table_name: llamastack_kvstore\ninference_store:\n type: postgres\n host: port: ${env.POSTGRES_PORT:=5432}
${env.POSTGRES_HOST:=localhost}\n port: ${env.POSTGRES_PORT:=5432}\n db: ${env.POSTGRES_DB:=llamastack}\n db: ${env.POSTGRES_DB:=llamastack}
\ user: ${env.POSTGRES_USER:=llamastack}\n password: ${env.POSTGRES_PASSWORD:=llamastack}\nmodels:\n- user: ${env.POSTGRES_USER:=llamastack}
metadata:\n embedding_dimension: 384\n model_id: all-MiniLM-L6-v2\n provider_id: password: ${env.POSTGRES_PASSWORD:=llamastack}
sentence-transformers\n model_type: embedding\n- metadata: {}\n model_id: ${env.INFERENCE_MODEL}\n files:
\ provider_id: vllm-inference\n model_type: llm\n- metadata: {}\n model_id: - provider_id: meta-reference-files
${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\n provider_id: vllm-safety\n provider_type: inline::localfs
\ model_type: llm\nshields:\n- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}\nvector_dbs: config:
[]\ndatasets: []\nscoring_fns: []\nbenchmarks: []\ntool_groups:\n- toolgroup_id: storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
builtin::websearch\n provider_id: tavily-search\n- toolgroup_id: builtin::rag\n metadata_store:
\ provider_id: rag-runtime\nserver:\n port: 8321\n auth:\n provider_config:\n type: sqlite
\ type: github_token\n" db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:+}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:+}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
config: {}
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
storage:
backends:
kv_default:
type: kv_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
sql_default:
type: sql_postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
references:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
models:
- metadata:
embedding_dimension: 768
model_id: nomic-embed-text-v1.5
provider_id: sentence-transformers
model_type: embedding
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
- metadata: {}
model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
provider_id: vllm-safety
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
server:
port: 8321
auth:
provider_config:
type: github_token
kind: ConfigMap kind: ConfigMap
metadata: metadata:
creationTimestamp: null
name: llama-stack-config name: llama-stack-config

View file

@ -93,21 +93,30 @@ providers:
- provider_id: model-context-protocol - provider_id: model-context-protocol
provider_type: remote::model-context-protocol provider_type: remote::model-context-protocol
config: {} config: {}
metadata_store: storage:
type: postgres backends:
host: ${env.POSTGRES_HOST:=localhost} kv_default:
port: ${env.POSTGRES_PORT:=5432} type: kv_postgres
db: ${env.POSTGRES_DB:=llamastack} host: ${env.POSTGRES_HOST:=localhost}
user: ${env.POSTGRES_USER:=llamastack} port: ${env.POSTGRES_PORT:=5432}
password: ${env.POSTGRES_PASSWORD:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
table_name: llamastack_kvstore user: ${env.POSTGRES_USER:=llamastack}
inference_store: password: ${env.POSTGRES_PASSWORD:=llamastack}
type: postgres table_name: ${env.POSTGRES_TABLE_NAME:=llamastack_kvstore}
host: ${env.POSTGRES_HOST:=localhost} sql_default:
port: ${env.POSTGRES_PORT:=5432} type: sql_postgres
db: ${env.POSTGRES_DB:=llamastack} host: ${env.POSTGRES_HOST:=localhost}
user: ${env.POSTGRES_USER:=llamastack} port: ${env.POSTGRES_PORT:=5432}
password: ${env.POSTGRES_PASSWORD:=llamastack} db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
references:
metadata:
backend: kv_default
namespace: registry
inference:
backend: sql_default
table_name: inference_store
models: models:
- metadata: - metadata:
embedding_dimension: 768 embedding_dimension: 768

@ -59,7 +59,7 @@ Start a Llama Stack server on localhost. Here is an example of how you can do th
uv venv starter --python 3.12 uv venv starter --python 3.12
source starter/bin/activate # On Windows: starter\Scripts\activate source starter/bin/activate # On Windows: starter\Scripts\activate
pip install --no-cache llama-stack==0.2.2 pip install --no-cache llama-stack==0.2.2
llama stack build --distro starter --image-type venv llama stack list-deps starter | xargs -L1 uv pip install
export FIREWORKS_API_KEY=<SOME_KEY> export FIREWORKS_API_KEY=<SOME_KEY>
llama stack run starter --port 5050 llama stack run starter --port 5050
``` ```

@ -166,10 +166,10 @@ docker run \
### Via venv ### Via venv
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. Install the distribution dependencies before launching:
```bash ```bash
llama stack build --distro dell --image-type venv llama stack list-deps dell | xargs -L1 uv pip install
INFERENCE_MODEL=$INFERENCE_MODEL \ INFERENCE_MODEL=$INFERENCE_MODEL \
DEH_URL=$DEH_URL \ DEH_URL=$DEH_URL \
CHROMA_URL=$CHROMA_URL \ CHROMA_URL=$CHROMA_URL \

@ -21,7 +21,6 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo
| inference | `inline::meta-reference` | | inference | `inline::meta-reference` |
| safety | `inline::llama-guard` | | safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
@ -82,10 +81,10 @@ docker run \
### Via venv ### Via venv
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available. Make sure you have the Llama Stack CLI available.
```bash ```bash
llama stack build --distro meta-reference-gpu --image-type venv llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
llama stack run distributions/meta-reference-gpu/run.yaml \ llama stack run distributions/meta-reference-gpu/run.yaml \
--port 8321 --port 8321

@ -16,7 +16,6 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
| post_training | `remote::nvidia` | | post_training | `remote::nvidia` |
| safety | `remote::nvidia` | | safety | `remote::nvidia` |
| scoring | `inline::basic` | | scoring | `inline::basic` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `inline::rag-runtime` | | tool_runtime | `inline::rag-runtime` |
| vector_io | `inline::faiss` | | vector_io | `inline::faiss` |
@ -137,11 +136,11 @@ docker run \
### Via venv
If you've set up your local development environment, you can also install the distribution dependencies using your local virtual environment.
```bash ```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
llama stack build --distro nvidia --image-type venv llama stack list-deps nvidia | xargs -L1 uv pip install
NVIDIA_API_KEY=$NVIDIA_API_KEY \ NVIDIA_API_KEY=$NVIDIA_API_KEY \
INFERENCE_MODEL=$INFERENCE_MODEL \ INFERENCE_MODEL=$INFERENCE_MODEL \
llama stack run ./run.yaml \ llama stack run ./run.yaml \

View file

@ -119,7 +119,7 @@ The following environment variables can be configured:
### Telemetry Configuration
- `OTEL_SERVICE_NAME`: OpenTelemetry service name
- `TELEMETRY_SINKS`: Telemetry sinks (default: `[]`)
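For example, to export traces and metrics to a local OTLP collector (illustrative values; the sink names and the `OTEL_EXPORTER_OTLP_ENDPOINT` variable come from the telemetry provider documentation further below):

```bash
export OTEL_SERVICE_NAME=llama-stack
export TELEMETRY_SINKS=otel_trace,otel_metric
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
```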
## Enabling Providers
@ -169,7 +169,11 @@ docker run \
Ensure you have configured the starter distribution using the environment variables explained above.
```bash ```bash
uv run --with llama-stack llama stack build --distro starter --image-type venv --run # Install dependencies for the starter distribution
uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
# Run the server
uv run --with llama-stack llama stack run starter
``` ```
## Example Usage ## Example Usage
@ -216,7 +220,6 @@ The starter distribution uses SQLite for local storage of various components:
- **Files metadata**: `~/.llama/distributions/starter/files_metadata.db` - **Files metadata**: `~/.llama/distributions/starter/files_metadata.db`
- **Agents store**: `~/.llama/distributions/starter/agents_store.db` - **Agents store**: `~/.llama/distributions/starter/agents_store.db`
- **Responses store**: `~/.llama/distributions/starter/responses_store.db` - **Responses store**: `~/.llama/distributions/starter/responses_store.db`
- **Trace store**: `~/.llama/distributions/starter/trace_store.db`
- **Evaluation store**: `~/.llama/distributions/starter/meta_reference_eval.db` - **Evaluation store**: `~/.llama/distributions/starter/meta_reference_eval.db`
- **Dataset I/O stores**: Various HuggingFace and local filesystem stores - **Dataset I/O stores**: Various HuggingFace and local filesystem stores

View file

@ -23,6 +23,17 @@ Another simple way to start interacting with Llama Stack is to just spin up a co
If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally, see the [Kubernetes Deployment Guide](../deploying/kubernetes_deployment) for more details.
## Configure logging
Control log output via environment variables before starting the server.
- `LLAMA_STACK_LOGGING` sets per-component levels, e.g. `LLAMA_STACK_LOGGING=server=debug;core=info`.
- Supported categories: `all`, `core`, `server`, `router`, `inference`, `agents`, `safety`, `eval`, `tools`, `client`.
- Levels: `debug`, `info`, `warning`, `error`, `critical` (default is `info`). Use `all=<level>` to apply globally.
- `LLAMA_STACK_LOG_FILE=/path/to/log` mirrors logs to a file while still printing to stdout.
Export these variables prior to running `llama stack run`, launching a container, or starting the server through any other pathway.
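For example, a minimal setup that turns on debug logging for the server component and mirrors output to a file (the log file path is illustrative):

```bash
export LLAMA_STACK_LOGGING="server=debug;core=info"
export LLAMA_STACK_LOG_FILE=/tmp/llama-stack.log
llama stack run starter
```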
```{toctree} ```{toctree}
:maxdepth: 1 :maxdepth: 1
:hidden: :hidden:

View file

@ -4,65 +4,24 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
vector_db_id = "my_demo_vector_db" import io, requests
client = LlamaStackClient(base_url="http://localhost:8321") from openai import OpenAI
models = client.models.list() url="https://www.paulgraham.com/greatwork.html"
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
# Select the first LLM and first embedding models vs = client.vector_stores.create()
model_id = next(m for m in models if m.model_type == "llm").identifier response = requests.get(url)
embedding_model_id = ( pseudo_file = io.BytesIO(str(response.content).encode('utf-8'))
em := next(m for m in models if m.model_type == "embedding") uploaded_file = client.files.create(file=(url, pseudo_file, "text/html"), purpose="assistants")
).identifier client.vector_stores.files.create(vector_store_id=vs.id, file_id=uploaded_file.id)
embedding_dimension = em.metadata["embedding_dimension"]
vector_db = client.vector_dbs.register( resp = client.responses.create(
vector_db_id=vector_db_id, model="openai/gpt-4o",
embedding_model=embedding_model_id, input="How do you do great work? Use the existing knowledge_search tool.",
embedding_dimension=embedding_dimension, tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
provider_id="faiss", include=["file_search_call.results"],
)
vector_db_id = vector_db.identifier
source = "https://www.paulgraham.com/greatwork.html"
print("rag_tool> Ingesting document:", source)
document = RAGDocument(
document_id="document_1",
content=source,
mime_type="text/html",
metadata={},
)
client.tool_runtime.rag_tool.insert(
documents=[document],
vector_db_id=vector_db_id,
chunk_size_in_tokens=100,
)
agent = Agent(
client,
model=model_id,
instructions="You are a helpful assistant",
tools=[
{
"name": "builtin::rag/knowledge_search",
"args": {"vector_db_ids": [vector_db_id]},
}
],
) )
prompt = "How do you do great work?" print(resp)
print("prompt>", prompt)
use_stream = True
response = agent.create_turn(
messages=[{"role": "user", "content": prompt}],
session_id=agent.create_session("rag_session"),
stream=use_stream,
)
# Only call `AgentEventLogger().log(response)` for streaming responses.
if use_stream:
for log in AgentEventLogger().log(response):
log.print()
else:
print(response)

View file

@ -58,15 +58,19 @@ Llama Stack is a server that exposes multiple APIs, you connect with it using th
<Tabs>
<TabItem value="venv" label="Using venv">
You can use Python to install dependencies and run the Llama Stack server, which is useful for testing and development.
Llama Stack uses a [YAML configuration file](../distributions/configuration) to specify the stack setup,
which defines the providers and their settings. The generated configuration serves as a starting point that you can [customize for your specific needs](../distributions/customizing_run_yaml).
Now let's install dependencies and run the Llama Stack config for Ollama.
We use `starter` as the template. By default all providers are disabled, so you need to enable Ollama by passing environment variables.
```bash ```bash
llama stack build --distro starter --image-type venv --run # Install dependencies for the starter distribution
uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
# Run the server
llama stack run starter
``` ```
</TabItem>
<TabItem value="container" label="Using a Container">
@ -304,7 +308,7 @@ stream = agent.create_turn(
for event in AgentEventLogger().log(stream): for event in AgentEventLogger().log(stream):
event.print() event.print()
``` ```
#### ii. Run the Script
Let's run the script using `uv`
```bash ```bash
uv run python agent.py uv run python agent.py

View file

@ -24,111 +24,62 @@ ollama run llama3.2:3b --keepalive 60m
#### Step 2: Run the Llama Stack server
We will use `uv` to install dependencies and run the Llama Stack server.
```bash ```bash
OLLAMA_URL=http://localhost:11434 \ # Install dependencies for the starter distribution
uv run --with llama-stack llama stack build --distro starter --image-type venv --run uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
# Run the server
OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter
``` ```
#### Step 3: Run the demo
Now open up a new terminal and copy the following script into a file named `demo_script.py`.
```python title="demo_script.py" ```python
# Copyright (c) Meta Platforms, Inc. and affiliates. import io, requests
# All rights reserved. from openai import OpenAI
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient url="https://www.paulgraham.com/greatwork.html"
client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")
vector_db_id = "my_demo_vector_db" vs = client.vector_stores.create()
client = LlamaStackClient(base_url="http://localhost:8321") response = requests.get(url)
pseudo_file = io.BytesIO(str(response.content).encode('utf-8'))
uploaded_file = client.files.create(file=(url, pseudo_file, "text/html"), purpose="assistants")
client.vector_stores.files.create(vector_store_id=vs.id, file_id=uploaded_file.id)
models = client.models.list() resp = client.responses.create(
model="openai/gpt-4o",
# Select the first LLM and first embedding models input="How do you do great work? Use the existing knowledge_search tool.",
model_id = next(m for m in models if m.model_type == "llm").identifier tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
embedding_model_id = ( include=["file_search_call.results"],
em := next(m for m in models if m.model_type == "embedding")
).identifier
embedding_dimension = em.metadata["embedding_dimension"]
vector_db = client.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
provider_id="faiss",
)
vector_db_id = vector_db.identifier
source = "https://www.paulgraham.com/greatwork.html"
print("rag_tool> Ingesting document:", source)
document = RAGDocument(
document_id="document_1",
content=source,
mime_type="text/html",
metadata={},
)
client.tool_runtime.rag_tool.insert(
documents=[document],
vector_db_id=vector_db_id,
chunk_size_in_tokens=100,
)
agent = Agent(
client,
model=model_id,
instructions="You are a helpful assistant",
tools=[
{
"name": "builtin::rag/knowledge_search",
"args": {"vector_db_ids": [vector_db_id]},
}
],
) )
prompt = "How do you do great work?"
print("prompt>", prompt)
use_stream = True
response = agent.create_turn(
messages=[{"role": "user", "content": prompt}],
session_id=agent.create_session("rag_session"),
stream=use_stream,
)
# Only call `AgentEventLogger().log(response)` for streaming responses.
if use_stream:
for log in AgentEventLogger().log(response):
log.print()
else:
print(response)
```
We will use `uv` to run the script
``` ```
uv run --with llama-stack-client,fire,requests demo_script.py uv run --with llama-stack-client,fire,requests demo_script.py
``` ```
And you should see output like below.
```python
>print(resp.output[1].content[0].text)
To do great work, consider the following principles:
1. **Follow Your Interests**: Engage in work that genuinely excites you. If you find an area intriguing, pursue it without being overly concerned about external pressures or norms. You should create things that you would want for yourself, as this often aligns with what others in your circle might want too.
2. **Work Hard on Ambitious Projects**: Ambition is vital, but it should be tempered by genuine interest. Instead of detailed planning for the future, focus on exciting projects that keep your options open. This approach, known as "staying upwind," allows for adaptability and can lead to unforeseen achievements.
3. **Choose Quality Colleagues**: Collaborating with talented colleagues can significantly affect your own work. Seek out individuals who offer surprising insights and whom you admire. The presence of good colleagues can elevate the quality of your work and inspire you.
4. **Maintain High Morale**: Your attitude towards work and life affects your performance. Cultivating optimism and viewing yourself as lucky rather than victimized can boost your productivity. Its essential to care for your physical health as well since it directly impacts your mental faculties and morale.
5. **Be Consistent**: Great work often comes from cumulative effort. Daily progress, even in small amounts, can result in substantial achievements over time. Emphasize consistency and make the work engaging, as this reduces the perceived burden of hard labor.
6. **Embrace Curiosity**: Curiosity is a driving force that can guide you in selecting fields of interest, pushing you to explore uncharted territories. Allow it to shape your work and continually seek knowledge and insights.
By focusing on these aspects, you can create an environment conducive to great work and personal fulfillment.
``` ```
rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html
prompt> How do you do great work?
inference> [knowledge_search(query="What is the key to doing great work")]
tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}
tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.
To further clarify, I would suggest that doing great work involves:
* Completing tasks with high quality and attention to detail
* Expanding on existing knowledge or ideas
* Making a positive impact on others through your work
* Striving for excellence and continuous improvement
Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
```
Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳
:::tip HuggingFace access

View file

@ -14,16 +14,18 @@ Meta's reference implementation of an agent system that can use tools, access ve
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `persistence_store` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `persistence` | `<class 'inline.agents.meta_reference.config.AgentPersistenceConfig'>` | No | | |
| `responses_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
persistence_store: persistence:
type: sqlite agent_state:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/agents_store.db namespace: agents
responses_store: backend: kv_default
type: sqlite responses:
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/responses_store.db table_name: responses
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
``` ```

View file

@ -14,7 +14,7 @@ Reference implementation of batches API with KVStore persistence.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Configuration for the key-value store backend. | | `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Configuration for the key-value store backend. |
| `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. | | `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
| `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. | | `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. |
@ -22,6 +22,6 @@ Reference implementation of batches API with KVStore persistence.
```yaml ```yaml
kvstore: kvstore:
type: sqlite namespace: batches
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/batches.db backend: kv_default
``` ```

View file

@ -14,12 +14,12 @@ Local filesystem-based dataset I/O provider for reading and writing datasets to
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
kvstore: kvstore:
type: sqlite namespace: datasetio::localfs
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/localfs_datasetio.db backend: kv_default
``` ```

View file

@ -14,12 +14,12 @@ HuggingFace datasets provider for accessing and managing datasets from the Huggi
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
kvstore: kvstore:
type: sqlite namespace: datasetio::huggingface
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/huggingface_datasetio.db backend: kv_default
``` ```

View file

@ -1,5 +1,7 @@
--- ---
description: "Llama Stack Evaluation API for running evaluations on model and agent candidates." description: "Evaluations
Llama Stack Evaluation API for running evaluations on model and agent candidates."
sidebar_label: Eval sidebar_label: Eval
title: Eval title: Eval
--- ---
@ -8,6 +10,8 @@ title: Eval
## Overview ## Overview
Llama Stack Evaluation API for running evaluations on model and agent candidates. Evaluations
Llama Stack Evaluation API for running evaluations on model and agent candidates.
This section contains documentation for all available providers for the **eval** API. This section contains documentation for all available providers for the **eval** API.

View file

@ -14,12 +14,12 @@ Meta's reference implementation of evaluation tasks with support for multiple la
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
kvstore: kvstore:
type: sqlite namespace: eval
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/meta_reference_eval.db backend: kv_default
``` ```

View file

@ -240,6 +240,6 @@ additional_pip_packages:
- sqlalchemy[asyncio] - sqlalchemy[asyncio]
``` ```
No other steps are required other than `llama stack build` and `llama stack run`. The build process will use `module` to install all of the provider dependencies, retrieve the spec, etc. No other steps are required beyond installing dependencies with `llama stack list-deps <distro> | xargs -L1 uv pip install` and then running `llama stack run`. The CLI will use `module` to install the provider dependencies, retrieve the spec, etc.
The provider will now be available in Llama Stack with the type `remote::ramalama`. The provider will now be available in Llama Stack with the type `remote::ramalama`.

View file

@ -15,7 +15,7 @@ Local filesystem-based file storage provider for managing files and documents lo
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `storage_dir` | `<class 'str'>` | No | | Directory to store uploaded files | | `storage_dir` | `<class 'str'>` | No | | Directory to store uploaded files |
| `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata | | `metadata_store` | `<class 'llama_stack.core.storage.datatypes.SqlStoreReference'>` | No | | SQL store configuration for file metadata |
| `ttl_secs` | `<class 'int'>` | No | 31536000 | | | `ttl_secs` | `<class 'int'>` | No | 31536000 | |
## Sample Configuration ## Sample Configuration
@ -23,6 +23,6 @@ Local filesystem-based file storage provider for managing files and documents lo
```yaml ```yaml
storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/dummy/files} storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/dummy/files}
metadata_store: metadata_store:
type: sqlite table_name: files_metadata
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/files_metadata.db backend: sql_default
``` ```

View file

@ -20,7 +20,7 @@ AWS S3-based file storage provider for scalable cloud file management with metad
| `aws_secret_access_key` | `str \| None` | No | | AWS secret access key (optional if using IAM roles) | | `aws_secret_access_key` | `str \| None` | No | | AWS secret access key (optional if using IAM roles) |
| `endpoint_url` | `str \| None` | No | | Custom S3 endpoint URL (for MinIO, LocalStack, etc.) | | `endpoint_url` | `str \| None` | No | | Custom S3 endpoint URL (for MinIO, LocalStack, etc.) |
| `auto_create_bucket` | `<class 'bool'>` | No | False | Automatically create the S3 bucket if it doesn't exist | | `auto_create_bucket` | `<class 'bool'>` | No | False | Automatically create the S3 bucket if it doesn't exist |
| `metadata_store` | `utils.sqlstore.sqlstore.SqliteSqlStoreConfig \| utils.sqlstore.sqlstore.PostgresSqlStoreConfig` | No | sqlite | SQL store configuration for file metadata | | `metadata_store` | `<class 'llama_stack.core.storage.datatypes.SqlStoreReference'>` | No | | SQL store configuration for file metadata |
## Sample Configuration ## Sample Configuration
@ -32,6 +32,6 @@ aws_secret_access_key: ${env.AWS_SECRET_ACCESS_KEY:=}
endpoint_url: ${env.S3_ENDPOINT_URL:=} endpoint_url: ${env.S3_ENDPOINT_URL:=}
auto_create_bucket: ${env.S3_AUTO_CREATE_BUCKET:=false} auto_create_bucket: ${env.S3_AUTO_CREATE_BUCKET:=false}
metadata_store: metadata_store:
type: sqlite table_name: s3_files_metadata
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/s3_files_metadata.db backend: sql_default
``` ```

View file

@ -22,7 +22,6 @@ Importantly, Llama Stack always strives to provide at least one fully inline pro
## Provider Categories ## Provider Categories
- **[External Providers](external/index.mdx)** - Guide for building and using external providers - **[External Providers](external/index.mdx)** - Guide for building and using external providers
- **[OpenAI Compatibility](./openai.mdx)** - OpenAI API compatibility layer
- **[Inference](inference/index.mdx)** - LLM and embedding model providers - **[Inference](inference/index.mdx)** - LLM and embedding model providers
- **[Agents](agents/index.mdx)** - Agentic system providers - **[Agents](agents/index.mdx)** - Agentic system providers
- **[DatasetIO](datasetio/index.mdx)** - Dataset and data loader providers - **[DatasetIO](datasetio/index.mdx)** - Dataset and data loader providers
@ -31,3 +30,7 @@ Importantly, Llama Stack always strives to provide at least one fully inline pro
- **[Vector IO](vector_io/index.mdx)** - Vector database providers - **[Vector IO](vector_io/index.mdx)** - Vector database providers
- **[Tool Runtime](tool_runtime/index.mdx)** - Tool and protocol providers - **[Tool Runtime](tool_runtime/index.mdx)** - Tool and protocol providers
- **[Files](files/index.mdx)** - File system and storage providers - **[Files](files/index.mdx)** - File system and storage providers
## Other information about Providers
- **[OpenAI Compatibility](./openai.mdx)** - OpenAI API compatibility layer
- **[OpenAI-Compatible Responses Limitations](./openai_responses_limitations.mdx)** - Known limitations of the Responses API in Llama Stack

View file

@ -1,3 +1,4 @@
---
title: OpenAI Compatibility title: OpenAI Compatibility
description: OpenAI API Compatibility description: OpenAI API Compatibility
sidebar_label: OpenAI Compatibility sidebar_label: OpenAI Compatibility
@ -47,7 +48,7 @@ models = client.models.list()
#### Responses #### Responses
> **Note:** The Responses API implementation is still in active development. While it is quite usable, there are still unimplemented parts of the API. We'd love feedback on any use-cases you try that do not work to help prioritize the pieces left to implement. Please open issues in the [meta-llama/llama-stack](https://github.com/meta-llama/llama-stack) GitHub repository with details of anything that does not work. > **Note:** The Responses API implementation is still in active development. While it is quite usable, there are still unimplemented parts of the API. See [Known Limitations of the OpenAI-compatible Responses API in Llama Stack](./openai_responses_limitations.mdx) for more details.
##### Simple inference ##### Simple inference

View file

@ -0,0 +1,301 @@
---
title: Known Limitations of the OpenAI-compatible Responses API in Llama Stack
description: Limitations of Responses API
sidebar_label: Limitations of Responses API
sidebar_position: 1
---
## Unresolved Issues
This document outlines known limitations and inconsistencies between Llama Stack's Responses API and OpenAI's Responses API, based on the OpenAI APIs as of October 6, 2025 (OpenAI client version `openai==1.107`).
See the OpenAI [changelog](https://platform.openai.com/docs/changelog) for any functionality added since that date. Links to issues are included so readers can check status, post comments, and/or subscribe for updates on the limitations that interest them. We would also love feedback on any use-cases you try that do not work, to help prioritize the pieces left to implement.
Please open new issues in the [meta-llama/llama-stack](https://github.com/meta-llama/llama-stack) GitHub repository with details of anything that does not work and does not already have an open issue.
### Instructions
**Status:** Partial Implementation + Work in Progress
**Issue:** [#3566](https://github.com/llamastack/llama-stack/issues/3566)
In Llama Stack, the instructions parameter is already implemented for creating a response, but it is not yet included in the output response object.
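A minimal sketch of the relevant request shape, using the OpenAI client pointed at Llama Stack as in the quick-start examples (the model name and prompt are placeholders). Per the status above, the `instructions` value shapes generation but is not yet echoed back on the response object:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",  # placeholder model
    instructions="Answer in one short sentence.",
    input="What is the capital of France?",
)
# Works today: the instructions influence the generated text.
# Known gap: the returned response object does not include the instructions field yet.
print(resp)
```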
---
### Streaming
**Status:** Partial Implementation
**Issue:** [#2364](https://github.com/llamastack/llama-stack/issues/2364)
Streaming functionality for the Responses API is partially implemented and does work to some extent, but some streaming response objects that would be needed for full compatibility are still missing.
---
### Prompt Templates
**Status:** Partial Implementation
**Issue:** [#3321](https://github.com/llamastack/llama-stack/issues/3321)
OpenAI's platform supports [templated prompts using a structured language](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts). These templates can be stored server-side for organizational sharing. This feature is under development for Llama Stack.
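For reference, OpenAI's reusable prompts are referenced by a stored prompt id rather than inlined text; a hedged sketch of the request shape (the prompt id and variables are placeholders, and this is not yet supported by Llama Stack):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",  # placeholder model
    prompt={
        "id": "pmpt_example_123",               # placeholder: id of a server-side stored prompt
        "variables": {"customer_name": "Ada"},  # values substituted into the template
    },
)
```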
---
### Web-search tool compatibility
**Status:** Partial Implementation
Both OpenAI and Llama Stack support a web-search built-in tool. The [OpenAI documentation](https://platform.openai.com/docs/api-reference/responses/create) for web search tool in a Responses tool list says:
> The type of the web search tool. One of `web_search` or `web_search_2025_08_26`.
In contrast, the [Llama Stack documentation](https://llamastack.github.io/docs/api/create-a-new-open-ai-response) says that the allowed values for `type` for web search are `MOD1`, `MOD2` and `MOD3`.
Is that correct? If so, what are the meanings of each of them? It might make sense for the allowed OpenAI values to map to corresponding Llama Stack values so that code written to the OpenAI specification also works with Llama Stack.
The OpenAI web search tool also has fields for `filters` and `user_location` which are not documented as options for Llama Stack. If feasible, it would be good to support these too.
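For context, an OpenAI-style request using the built-in web search tool looks like the sketch below (the model name is a placeholder; as noted above, the `type` values accepted by Llama Stack may differ):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",  # placeholder model
    input="What is the latest Llama Stack release?",
    tools=[{"type": "web_search"}],  # OpenAI also accepts "web_search_2025_08_26"
)
print(resp)
```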
---
### Other built-in Tools
**Status:** Partial Implementation
OpenAI's Responses API includes an ecosystem of built-in tools (e.g., code interpreter) that lower the barrier to entry for agentic workflows. These tools are typically aligned with specific model training.
**Current Status in Llama Stack:**
- Some built-in tools exist (file search, web search)
- Missing tools include code interpreter, computer use, and image generation
- Some built-in tools may require additional APIs (e.g., [containers API](https://platform.openai.com/docs/api-reference/containers) for code interpreter)
It's unclear whether there is demand for additional built-in tools in Llama Stack. No upstream issues have been filed for adding more built-in tools.
---
### Response Branching
**Status:** Not Working
Response branching, as discussed in the [Agents vs OpenAI Responses API documentation](https://llamastack.github.io/docs/building_applications/responses_vs_agents), is not currently functional.
---
### Include
**Status:** Not Implemented
The `include` parameter allows you to provide a list of values that indicate additional information for the system to include in the model response. The [OpenAI API](https://platform.openai.com/docs/api-reference/responses/create) specifies the following allowed values for this parameter.
- `web_search_call.action.sources`
- `code_interpreter_call.outputs`
- `computer_call_output.output.image_url`
- `file_search_call.results`
- `message.input_image.image_url`
- `message.output_text.logprobs`
- `reasoning.encrypted_content`
Some of these are not relevant to Llama Stack in its current form. For example, code interpreter is not implemented (see "Other built-in Tools" above), so `code_interpreter_call.outputs` would not be a useful directive to Llama Stack.
However, others might be useful. For example, `message.output_text.logprobs` can be useful for assessing how confident a model is in each token of its output.
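As an illustration of one of the potentially useful values, this is how an OpenAI-style request would ask for per-token log probabilities via `include` (a sketch only; per the status above, Llama Stack does not honor this yet):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",  # placeholder model
    input="Summarize the benefits of vector databases in two sentences.",
    include=["message.output_text.logprobs"],  # not yet implemented in Llama Stack
)
```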
---
### Tool Choice
**Status:** Not Implemented
**Issue:** [#3548](https://github.com/llamastack/llama-stack/issues/3548)
In OpenAI's API, the `tool_choice` parameter allows you to set restrictions or requirements for which tools should be used when generating a response. This feature is not implemented in Llama Stack.
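For context, a hedged sketch of how `tool_choice` is used against OpenAI's API (the vector store id and model are placeholders; Llama Stack does not honor this parameter yet):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",  # placeholder model
    input="Find the relevant passage in my files.",
    tools=[{"type": "file_search", "vector_store_ids": ["vs_123"]}],  # placeholder id
    tool_choice="required",  # force tool use; "auto" and "none" are also accepted by OpenAI
)
```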
---
### Safety Identification and Tracking
**Status:** Not Implemented
OpenAI's platform allows users to track agentic users using a safety identifier passed with each response. When requests violate moderation or safety rules, account holders are alerted and automated actions can be taken. This capability is not currently available in Llama Stack.
---
### Connectors
**Status:** Not Implemented
Connectors are MCP servers maintained and managed by the Responses API provider. OpenAI has documented their connectors at [https://platform.openai.com/docs/guides/tools-connectors-mcp](https://platform.openai.com/docs/guides/tools-connectors-mcp).
**Open Questions:**
- Should Llama Stack include built-in support for some, all, or none of OpenAI's connectors?
- Should there be a mechanism for administrators to add custom connectors via `run.yaml` or an API?
---
### Reasoning
**Status:** Partially Implemented
The `reasoning` object in the output of Responses works for inference providers such as vLLM that output reasoning traces in chat completions requests. It does not work for other providers such as OpenAI's hosted service. See [#3551](https://github.com/llamastack/llama-stack/issues/3551) for more details.
---
### Service Tier
**Status:** Not Implemented
**Issue:** [#3550](https://github.com/llamastack/llama-stack/issues/3550)
Responses has a field `service_tier` that can be used to prioritize access to inference resources. Not all inference providers have such a concept, but Llama Stack should pass this value through to those providers that do. Currently it does not.
---
### Top Logprobs
**Status:** Not Implemented
**Issue:** [#3552](https://github.com/llamastack/llama-stack/issues/3552)
The `top_logprobs` parameter from OpenAI's Responses API extends the functionality obtained by including `message.output_text.logprobs` in the `include` parameter list (as discussed in the Include section above).
It enables users to also get logprobs for alternative tokens.
---
### Max Tool Calls
**Status:** Not Implemented
**Issue:** [#3563](https://github.com/llamastack/llama-stack/issues/3563)
The Responses API can accept a `max_tool_calls` parameter that limits the number of tool calls allowed to be executed for a given response. This feature needs full implementation and documentation.
---
### Max Output Tokens
**Status:** Not Implemented
**Issue:** [#3562](https://github.com/llamastack/llama-stack/issues/3562)
The `max_output_tokens` field limits how many tokens the model is allowed to generate (for both reasoning and output combined). It is not implemented in Llama Stack.
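A sketch of the OpenAI-side usage that is not yet enforced by Llama Stack (model and prompt are placeholders):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",  # placeholder model
    input="Explain retrieval-augmented generation.",
    max_output_tokens=200,  # cap on reasoning + output tokens; ignored by Llama Stack today
)
```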
---
### Incomplete Details
**Status:** Not Implemented
**Issue:** [#3567](https://github.com/llamastack/llama-stack/issues/3567)
The return object from a call to Responses includes a field indicating why a response is incomplete, if it is. For example, if the model stops generating because it has reached the specified max output tokens (see above), this field should be set to `IncompleteDetails(reason='max_output_tokens')`. This is not implemented in Llama Stack.
---
### Metadata
**Status:** Not Implemented
**Issue:** [#3564](https://github.com/llamastack/llama-stack/issues/3564)
Metadata allows you to attach additional information to a response for your own reference and tracking. It is not implemented in Llama Stack.
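A sketch of how metadata is attached in the OpenAI API (the keys and values are arbitrary illustrative tags; Llama Stack does not store them yet):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",  # placeholder model
    input="Hello!",
    metadata={"session": "demo-42", "team": "docs"},  # free-form key/value tags
)
```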
---
### Background
**Status:** Not Implemented
**Issue:** [#3568](https://github.com/llamastack/llama-stack/issues/3568)
[Background mode](https://platform.openai.com/docs/guides/background) in OpenAI Responses lets you start a response generation job and then check back in on it later. This is useful if you might lose a connection during a generation and want to reconnect later and get the response back (for example if the client is running in a mobile app). It is not implemented in Llama Stack.
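A hedged sketch of the OpenAI-side workflow this section describes (start the job, then poll for it later); Llama Stack does not support it yet:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/", api_key="none")

resp = client.responses.create(
    model="openai/gpt-4o",  # placeholder model
    input="Write a long report on retrieval-augmented generation.",
    background=True,  # return immediately; generation continues server-side (OpenAI behavior)
)

# Later, possibly from a different process or after reconnecting:
final = client.responses.retrieve(resp.id)
print(final.status)  # e.g. "in_progress" or "completed"
```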
---
### Global Guardrails
**Status:** Feature Request
When calling the OpenAI Responses API, model outputs go through safety models configured by OpenAI administrators. Perhaps Llama Stack should provide a mechanism to configure safety models (or non-model logic) for all Responses requests, either through `run.yaml` or an administrative API.
---
### User-Controlled Guardrails
**Status:** Feature Request
**Issue:** [#3325](https://github.com/llamastack/llama-stack/issues/3325)
OpenAI has not released a way for users to configure their own guardrails. However, Llama Stack users may want this capability to complement or replace global guardrails. This could be implemented as a non-breaking, additive difference from the OpenAI API.
---
### MCP Elicitations
**Status:** Unknown
Elicitations allow MCP servers to request additional information from users through the client during interactions (e.g., a tool requesting a username before proceeding).
See the [MCP specification](https://modelcontextprotocol.io/specification/draft/client/elicitation) for details.
**Open Questions:**
- Does this work in OpenAI's Responses API reference implementation?
- If not, is there a reasonable way to make that work within the API as is? Or would the API need to change?
- Does this work in Llama Stack?
---
### MCP Sampling
**Status:** Unknown
Sampling allows MCP tools to query the generative AI model. See the [MCP specification](https://modelcontextprotocol.io/specification/draft/client/sampling) for details.
**Open Questions:**
- Does this work in OpenAI's Responses API reference implementation?
- If not, is there a reasonable way to make that work within the API as is? Or would the API need to change?
- Does this work in Llama Stack?
### Prompt Caching
**Status:** Unknown
OpenAI provides a [prompt caching](https://platform.openai.com/docs/guides/prompt-caching) mechanism in Responses that is enabled for its most recent models.
**Open Questions:**
- Does this work in Llama Stack?
- If not, is there a reasonable way to make that work for those inference providers that have this capability by passing through the provided `prompt_cache_key` to the inference provider?
- Is there a reasonable way to make that work for inference providers that don't build in this capability by doing some sort of caching at the Llama Stack layer?
---
### Parallel Tool Calls
**Status:** Rumored Issue
There are reports that `parallel_tool_calls` may not work correctly. This needs verification and a ticket should be opened if confirmed.
---
## Resolved Issues
The following limitations have been addressed in recent releases:
### MCP and Function Tools with No Arguments
**Status:** ✅ Resolved
MCP and function tools now work correctly even when they have no arguments.
---
### `require_approval` Parameter for MCP Tools
**Status:** ✅ Resolved
The `require_approval` parameter for MCP tools in the Responses API now works correctly.
---
### MCP Tools with Array-Type Arguments
**Status:** ✅ Resolved
**Fixed in:** [#3003](https://github.com/llamastack/llama-stack/pull/3003) (Agent API), [#3602](https://github.com/llamastack/llama-stack/pull/3602) (Responses API)
MCP tools now correctly handle array-type arguments in both the Agent API and Responses API.

View file

@ -16,14 +16,12 @@ Meta's reference implementation of telemetry and observability using OpenTelemet
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `otel_exporter_otlp_endpoint` | `str \| None` | No | | The OpenTelemetry collector endpoint URL (base URL for traces, metrics, and logs). If not set, the SDK will use OTEL_EXPORTER_OTLP_ENDPOINT environment variable. | | `otel_exporter_otlp_endpoint` | `str \| None` | No | | The OpenTelemetry collector endpoint URL (base URL for traces, metrics, and logs). If not set, the SDK will use OTEL_EXPORTER_OTLP_ENDPOINT environment variable. |
| `service_name` | `<class 'str'>` | No | | The service name to use for telemetry | | `service_name` | `<class 'str'>` | No | | The service name to use for telemetry |
| `sinks` | `list[inline.telemetry.meta_reference.config.TelemetrySink` | No | [&lt;TelemetrySink.SQLITE: 'sqlite'&gt;] | List of telemetry sinks to enable (possible values: otel_trace, otel_metric, sqlite, console) | | `sinks` | `list[inline.telemetry.meta_reference.config.TelemetrySink` | No | [] | List of telemetry sinks to enable (possible values: otel_trace, otel_metric, console) |
| `sqlite_db_path` | `<class 'str'>` | No | ~/.llama/runtime/trace_store.db | The path to the SQLite database to use for storing traces |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=sqlite} sinks: ${env.TELEMETRY_SINKS:=}
sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/trace_store.db
otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=} otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
``` ```

View file

@ -79,13 +79,13 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | | | `db_path` | `<class 'str'>` | No | | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
db_path: ${env.CHROMADB_PATH} db_path: ${env.CHROMADB_PATH}
kvstore: persistence:
type: sqlite namespace: vector_io::chroma
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/chroma_inline_registry.db backend: kv_default
``` ```

View file

@ -95,12 +95,12 @@ more details about Faiss in general.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
kvstore: persistence:
type: sqlite namespace: vector_io::faiss
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/faiss_store.db backend: kv_default
``` ```

View file

@ -14,14 +14,14 @@ Meta's reference implementation of a vector database.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
kvstore: persistence:
type: sqlite namespace: vector_io::faiss
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/faiss_store.db backend: kv_default
``` ```
## Deprecation Notice ## Deprecation Notice

View file

@ -17,14 +17,14 @@ Please refer to the remote provider documentation.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | | | `db_path` | `<class 'str'>` | No | | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server | | `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
db_path: ${env.MILVUS_DB_PATH:=~/.llama/dummy}/milvus.db db_path: ${env.MILVUS_DB_PATH:=~/.llama/dummy}/milvus.db
kvstore: persistence:
type: sqlite namespace: vector_io::milvus
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/milvus_registry.db backend: kv_default
``` ```

View file

@ -98,13 +98,13 @@ See the [Qdrant documentation](https://qdrant.tech/documentation/) for more deta
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `path` | `<class 'str'>` | No | | | | `path` | `<class 'str'>` | No | | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
path: ${env.QDRANT_PATH:=~/.llama/~/.llama/dummy}/qdrant.db path: ${env.QDRANT_PATH:=~/.llama/~/.llama/dummy}/qdrant.db
kvstore: persistence:
type: sqlite namespace: vector_io::qdrant
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/qdrant_registry.db backend: kv_default
``` ```

View file

@ -408,13 +408,13 @@ See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) f
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file | | `db_path` | `<class 'str'>` | No | | Path to the SQLite database file |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db
kvstore: persistence:
type: sqlite namespace: vector_io::sqlite_vec
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec_registry.db backend: kv_default
``` ```

View file

@ -17,15 +17,15 @@ Please refer to the sqlite-vec provider documentation.
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file | | `db_path` | `<class 'str'>` | No | | Path to the SQLite database file |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db
kvstore: persistence:
type: sqlite namespace: vector_io::sqlite_vec
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec_registry.db backend: kv_default
``` ```
## Deprecation Notice ## Deprecation Notice

View file

@ -78,13 +78,13 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti
| Field | Type | Required | Default | Description | | Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `url` | `str \| None` | No | | | | `url` | `str \| None` | No | | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
url: ${env.CHROMADB_URL} url: ${env.CHROMADB_URL}
kvstore: persistence:
type: sqlite namespace: vector_io::chroma_remote
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/chroma_remote_registry.db backend: kv_default
``` ```

View file

@ -408,7 +408,7 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi
| `uri` | `<class 'str'>` | No | | The URI of the Milvus server | | `uri` | `<class 'str'>` | No | | The URI of the Milvus server |
| `token` | `str \| None` | No | | The token of the Milvus server | | `token` | `str \| None` | No | | The token of the Milvus server |
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server | | `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend |
| `config` | `dict` | No | `{}` | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. | | `config` | `dict` | No | `{}` | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |
:::note :::note
@ -420,7 +420,7 @@ This configuration class accepts additional fields beyond those listed above. Yo
```yaml ```yaml
uri: ${env.MILVUS_ENDPOINT} uri: ${env.MILVUS_ENDPOINT}
token: ${env.MILVUS_TOKEN} token: ${env.MILVUS_TOKEN}
kvstore: persistence:
type: sqlite namespace: vector_io::milvus_remote
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/milvus_remote_registry.db backend: kv_default
``` ```

View file

@ -218,7 +218,7 @@ See [PGVector's documentation](https://github.com/pgvector/pgvector) for more de
| `db` | `str \| None` | No | postgres | | | `db` | `str \| None` | No | postgres | |
| `user` | `str \| None` | No | postgres | | | `user` | `str \| None` | No | postgres | |
| `password` | `str \| None` | No | mysecretpassword | | | `password` | `str \| None` | No | mysecretpassword | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig, annotation=NoneType, required=False, default='sqlite', discriminator='type'` | No | | Config for KV store backend (SQLite only for now) | | `persistence` | `llama_stack.core.storage.datatypes.KVStoreReference \| None` | No | | Config for KV store backend (SQLite only for now) |
## Sample Configuration ## Sample Configuration
@ -228,7 +228,7 @@ port: ${env.PGVECTOR_PORT:=5432}
db: ${env.PGVECTOR_DB} db: ${env.PGVECTOR_DB}
user: ${env.PGVECTOR_USER} user: ${env.PGVECTOR_USER}
password: ${env.PGVECTOR_PASSWORD} password: ${env.PGVECTOR_PASSWORD}
kvstore: persistence:
type: sqlite namespace: vector_io::pgvector
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/pgvector_registry.db backend: kv_default
``` ```

View file

@ -26,13 +26,13 @@ Please refer to the inline provider documentation.
| `prefix` | `str \| None` | No | | | | `prefix` | `str \| None` | No | | |
| `timeout` | `int \| None` | No | | | | `timeout` | `int \| None` | No | | |
| `host` | `str \| None` | No | | | | `host` | `str \| None` | No | | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | | | `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
api_key: ${env.QDRANT_API_KEY:=} api_key: ${env.QDRANT_API_KEY:=}
kvstore: persistence:
type: sqlite namespace: vector_io::qdrant_remote
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/qdrant_registry.db backend: kv_default
``` ```

View file

@@ -75,14 +75,14 @@ See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more
|-------|------|----------|---------|-------------|
| `weaviate_api_key` | `str \| None` | No | | The API key for the Weaviate instance |
| `weaviate_cluster_url` | `str \| None` | No | localhost:8080 | The URL of the Weaviate cluster |
-| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig, annotation=NoneType, required=False, default='sqlite', discriminator='type'` | No | | Config for KV store backend (SQLite only for now) |
+| `persistence` | `llama_stack.core.storage.datatypes.KVStoreReference \| None` | No | | Config for KV store backend (SQLite only for now) |
## Sample Configuration
```yaml
weaviate_api_key: null
weaviate_cluster_url: ${env.WEAVIATE_CLUSTER_URL:=localhost:8080}
-kvstore:
-  type: sqlite
-  db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/weaviate_registry.db
+persistence:
+  namespace: vector_io::weaviate
+  backend: kv_default
```
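Across the Milvus, PGVector, Qdrant, and Weaviate diffs above, the pattern is the same: the per-provider `kvstore` block is replaced by a `persistence` reference that names a namespace plus a shared KV backend (`kv_default`). As a rough illustration only, mirroring the sample YAML above (the `kv_default` backend itself is assumed to be declared elsewhere in the run config's storage section), the same settings expressed as a Python provider-config dict would look like:

```python
# Illustrative sketch, not a canonical config: field names are taken from the
# sample YAML above. Only the namespace differs per provider
# (e.g. vector_io::qdrant_remote, vector_io::pgvector).
weaviate_provider_config = {
    "weaviate_api_key": None,
    "weaviate_cluster_url": "localhost:8080",
    "persistence": {
        "namespace": "vector_io::weaviate",
        "backend": "kv_default",  # assumed to be defined centrally in the run config
    },
}
```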

View file

@@ -32,7 +32,6 @@ Commands:
scoring_functions Manage scoring functions.
shields Manage safety shield services.
toolgroups Manage available tool groups.
-vector_dbs Manage vector databases.
```
### `llama-stack-client configure`
@@ -211,53 +210,6 @@ Unregister a model from distribution endpoint
llama-stack-client models unregister <model_id>
```
## Vector DB Management
Manage vector databases.
### `llama-stack-client vector_dbs list`
Show available vector dbs on distribution endpoint
```bash
llama-stack-client vector_dbs list
```
```
┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ identifier ┃ provider_id ┃ provider_resource_id ┃ vector_db_type ┃ params ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ my_demo_vector_db │ faiss │ my_demo_vector_db │ │ embedding_dimension: 768 │
│ │ │ │ │ embedding_model: nomic-embed-text-v1.5 │
│ │ │ │ │ type: vector_db │
│ │ │ │ │ │
└──────────────────────────┴─────────────┴──────────────────────────┴────────────────┴───────────────────────────────────┘
```
### `llama-stack-client vector_dbs register`
Create a new vector db
```bash
llama-stack-client vector_dbs register <vector-db-id> [--provider-id <provider-id>] [--provider-vector-db-id <provider-vector-db-id>] [--embedding-model <embedding-model>] [--embedding-dimension <embedding-dimension>]
```
Required arguments:
- `VECTOR_DB_ID`: Vector DB ID
Optional arguments:
- `--provider-id`: Provider ID for the vector db
- `--provider-vector-db-id`: Provider's vector db ID
- `--embedding-model`: Embedding model to use. Default: `nomic-embed-text-v1.5`
- `--embedding-dimension`: Dimension of embeddings. Default: 768
### `llama-stack-client vector_dbs unregister`
Delete a vector db
```bash
llama-stack-client vector_dbs unregister <vector-db-id>
```
Required arguments:
- `VECTOR_DB_ID`: Vector DB ID
## Shield Management
Manage safety shield services.
### `llama-stack-client shields list`
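With the `vector_dbs` subcommands removed above, the equivalent operations go through the OpenAI-compatible vector stores API instead. A minimal sketch using the Python client follows; the method names mirror the OpenAI-compatible surface used later in this change, so verify them against your installed `llama-stack-client`:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# roughly replaces `llama-stack-client vector_dbs register`
store = client.vector_stores.create(name="my_demo_vector_db")

# roughly replaces `llama-stack-client vector_dbs list`
for vs in client.vector_stores.list():
    print(vs.id, vs.name)

# roughly replaces `llama-stack-client vector_dbs unregister`
client.vector_stores.delete(vector_store_id=store.id)
```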

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@@ -2864,7 +2864,7 @@
}
],
"source": [
-"!llama stack build --distro experimental-post-training --image-type venv --image-name __system__"
+"!llama stack list-deps experimental-post-training | xargs -L1 uv pip install"
]
},
{

View file

@@ -38,7 +38,7 @@
"source": [
"# NBVAL_SKIP\n",
"!pip install -U llama-stack\n",
-"!UV_SYSTEM_PYTHON=1 llama stack build --distro fireworks --image-type venv"
+"llama stack list-deps fireworks | xargs -L1 uv pip install\n"
]
},
{
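The notebook edits above all follow the same pattern: `llama stack build` is replaced by installing the distribution's dependencies with `llama stack list-deps ... | xargs -L1 uv pip install`, with the server started separately by `llama stack run`. A minimal sketch of that two-step flow, using the starter distribution and the Ollama URL that appear in the other examples in this change:

```python
import subprocess

# Step 1: install the distribution's dependencies (replaces `llama stack build`).
subprocess.run(
    "uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install",
    shell=True,
    check=True,
)

# Step 2: start the server as its own step.
server = subprocess.Popen(
    "OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter",
    shell=True,
)
```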

File diff suppressed because it is too large

View file

@@ -136,7 +136,8 @@
" \"\"\"Build and run LlamaStack server in one step using --run flag\"\"\"\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
-" \"uv run --with llama-stack llama stack build --distro starter --image-type venv --run\",\n",
+" \"uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install\",\n",
+" \"uv run --with llama-stack llama stack run starter\",\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",
@@ -172,7 +173,7 @@
"\n",
"def kill_llama_stack_server():\n",
" # Kill any existing llama stack server processes using pkill command\n",
-" os.system(\"pkill -f llama_stack.core.server.server\")"
+" os.system(\"pkill -f llama_stack.core.server.server\")\n"
]
},
{

View file

@@ -105,7 +105,8 @@
" \"\"\"Build and run LlamaStack server in one step using --run flag\"\"\"\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
-" \"uv run --with llama-stack llama stack build --distro starter --image-type venv --run\",\n",
+" \"uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install\",\n",
+" \"uv run --with llama-stack llama stack run starter\",\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",

View file

@@ -92,7 +92,7 @@
"metadata": {},
"source": [
"```bash\n",
-"LLAMA_STACK_DIR=$(pwd) llama stack build --distro nvidia --image-type venv\n",
+"uv run --with llama-stack llama stack list-deps nvidia | xargs -L1 uv pip install\n",
"```"
]
},

View file

@@ -81,7 +81,7 @@
"metadata": {},
"source": [
"```bash\n",
-"LLAMA_STACK_DIR=$(pwd) llama stack build --distro nvidia --image-type venv\n",
+"uv run --with llama-stack llama stack list-deps nvidia | xargs -L1 uv pip install\n",
"```"
]
},

View file

@@ -30,3 +30,5 @@ fi
stack_dir=$(dirname $(dirname $THIS_DIR))
PYTHONPATH=$PYTHONPATH:$stack_dir \
python -m docs.openapi_generator.generate $(dirname $THIS_DIR)/static
cp $stack_dir/docs/static/stainless-llama-stack-spec.yaml $stack_dir/client-sdks/stainless/openapi.yml

View file

@@ -1,366 +1,399 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "c1e7571c",
"metadata": {
"id": "c1e7571c"
},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)\n",
"\n",
"# Llama Stack - Building AI Applications\n",
"\n",
"<img src=\"https://llamastack.github.io/latest/_images/llama-stack.png\" alt=\"drawing\" width=\"500\"/>\n",
"\n",
"Get started with Llama Stack in minutes!\n",
"\n",
"[Llama Stack](https://github.com/meta-llama/llama-stack) is a stateful service with REST APIs to support the seamless transition of AI applications across different environments. You can build and test using a local server first and deploy to a hosted endpoint for production.\n",
"\n",
"In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)\n",
"as the inference [provider](docs/source/providers/index.md#inference) for a Llama Model.\n"
]
},
{
"cell_type": "markdown",
"id": "4CV1Q19BDMVw",
"metadata": {
"id": "4CV1Q19BDMVw"
},
"source": [
"## Step 1: Install and setup"
]
},
{
"cell_type": "markdown",
"id": "K4AvfUAJZOeS",
"metadata": {
"id": "K4AvfUAJZOeS"
},
"source": [
"### 1.1. Install uv and test inference with Ollama\n",
"\n",
"We'll install [uv](https://docs.astral.sh/uv/) to setup the Python virtual environment, along with [colab-xterm](https://github.com/InfuseAI/colab-xterm) for running command-line tools, and [Ollama](https://ollama.com/download) as the inference provider."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a2d7b85",
"metadata": {},
"outputs": [],
"source": [
"%pip install uv llama_stack llama-stack-client\n",
"\n",
"## If running on Collab:\n",
"# !pip install colab-xterm\n",
"# %load_ext colabxterm\n",
"\n",
"!curl https://ollama.ai/install.sh | sh"
]
},
{
"cell_type": "markdown",
"id": "39fa584b",
"metadata": {},
"source": [
"### 1.2. Test inference with Ollama"
]
},
{
"cell_type": "markdown",
"id": "3bf81522",
"metadata": {},
"source": [
"Well now launch a terminal and run inference on a Llama model with Ollama to verify that the model is working correctly."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7e8e0f1",
"metadata": {},
"outputs": [],
"source": [
"## If running on Colab:\n",
"# %xterm\n",
"\n",
"## To be ran in the terminal:\n",
"# ollama serve &\n",
"# ollama run llama3.2:3b --keepalive 60m"
]
},
{
"cell_type": "markdown",
"id": "f3c5f243",
"metadata": {},
"source": [
"If successful, you should see the model respond to a prompt.\n",
"\n",
"...\n",
"```\n",
">>> hi\n",
"Hello! How can I assist you today?\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "oDUB7M_qe-Gs",
"metadata": {
"id": "oDUB7M_qe-Gs"
},
"source": [
"## Step 2: Run the Llama Stack server\n",
"\n",
"In this showcase, we will start a Llama Stack server that is running locally."
]
},
{
"cell_type": "markdown",
"id": "732eadc6",
"metadata": {},
"source": [
"### 2.1. Setup the Llama Stack Server"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "J2kGed0R5PSf",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"collapsed": true,
"id": "J2kGed0R5PSf",
"outputId": "2478ea60-8d35-48a1-b011-f233831740c5"
},
"outputs": [],
"source": [
"import os\n",
"import subprocess\n",
"\n",
"if \"UV_SYSTEM_PYTHON\" in os.environ:\n",
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
"\n",
"# this command installs all the dependencies needed for the llama stack server with the ollama inference provider\n",
"!uv run --with llama-stack llama stack build --distro starter\n",
"\n",
"def run_llama_stack_server_background():\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
" f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",
" text=True\n",
" )\n",
"\n",
" print(f\"Starting Llama Stack server with PID: {process.pid}\")\n",
" return process\n",
"\n",
"def wait_for_server_to_start():\n",
" import requests\n",
" from requests.exceptions import ConnectionError\n",
" import time\n",
"\n",
" url = \"http://0.0.0.0:8321/v1/health\"\n",
" max_retries = 30\n",
" retry_interval = 1\n",
"\n",
" print(\"Waiting for server to start\", end=\"\")\n",
" for _ in range(max_retries):\n",
" try:\n",
" response = requests.get(url)\n",
" if response.status_code == 200:\n",
" print(\"\\nServer is ready!\")\n",
" return True\n",
" except ConnectionError:\n",
" print(\".\", end=\"\", flush=True)\n",
" time.sleep(retry_interval)\n",
"\n",
" print(\"\\nServer failed to start after\", max_retries * retry_interval, \"seconds\")\n",
" return False\n",
"\n",
"\n",
"# use this helper if needed to kill the server\n",
"def kill_llama_stack_server():\n",
" # Kill any existing llama stack server processes\n",
" os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
]
},
{
"cell_type": "markdown",
"id": "c40e9efd",
"metadata": {},
"source": [
"### 2.2. Start the Llama Stack Server"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f779283d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting Llama Stack server with PID: 787100\n",
"Waiting for server to start\n",
"Server is ready!\n"
]
}
],
"source": [
"server_process = run_llama_stack_server_background()\n",
"assert wait_for_server_to_start()"
]
},
{
"cell_type": "markdown",
"id": "28477c03",
"metadata": {},
"source": [
"## Step 3: Run the demo"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7da71011",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html\n",
"prompt> How do you do great work?\n",
"\u001b[33minference> \u001b[0m\u001b[33m[k\u001b[0m\u001b[33mnowledge\u001b[0m\u001b[33m_search\u001b[0m\u001b[33m(query\u001b[0m\u001b[33m=\"\u001b[0m\u001b[33mWhat\u001b[0m\u001b[33m is\u001b[0m\u001b[33m the\u001b[0m\u001b[33m key\u001b[0m\u001b[33m to\u001b[0m\u001b[33m doing\u001b[0m\u001b[33m great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m\")]\u001b[0m\u001b[97m\u001b[0m\n",
"\u001b[32mtool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}\u001b[0m\n",
"\u001b[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\\nBEGIN of knowledge_search tool results.\\n', type='text'), TextContentItem(text=\"Result 1:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 2:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 3:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 4:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text=\"Result 5:\\nDocument_id:docum\\nContent: work. Doing great work means doing something important\\nso well that you expand people's ideas of what's possible. But\\nthere's no threshold for importance. It's a matter of degree, and\\noften hard to judge at the time anyway.\\n\", type='text'), TextContentItem(text='END of knowledge_search tool results.\\n', type='text'), TextContentItem(text='The above results were retrieved to help answer the user\\'s query: \"What is the key to doing great work\". Use them as supporting information only in answering this query.\\n', type='text')]\u001b[0m\n",
"\u001b[33minference> \u001b[0m\u001b[33mDoing\u001b[0m\u001b[33m great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m means\u001b[0m\u001b[33m doing\u001b[0m\u001b[33m something\u001b[0m\u001b[33m important\u001b[0m\u001b[33m so\u001b[0m\u001b[33m well\u001b[0m\u001b[33m that\u001b[0m\u001b[33m you\u001b[0m\u001b[33m expand\u001b[0m\u001b[33m people\u001b[0m\u001b[33m's\u001b[0m\u001b[33m ideas\u001b[0m\u001b[33m of\u001b[0m\u001b[33m what\u001b[0m\u001b[33m's\u001b[0m\u001b[33m possible\u001b[0m\u001b[33m.\u001b[0m\u001b[33m However\u001b[0m\u001b[33m,\u001b[0m\u001b[33m there\u001b[0m\u001b[33m's\u001b[0m\u001b[33m no\u001b[0m\u001b[33m threshold\u001b[0m\u001b[33m for\u001b[0m\u001b[33m importance\u001b[0m\u001b[33m,\u001b[0m\u001b[33m and\u001b[0m\u001b[33m it\u001b[0m\u001b[33m's\u001b[0m\u001b[33m often\u001b[0m\u001b[33m hard\u001b[0m\u001b[33m to\u001b[0m\u001b[33m judge\u001b[0m\u001b[33m at\u001b[0m\u001b[33m the\u001b[0m\u001b[33m time\u001b[0m\u001b[33m anyway\u001b[0m\u001b[33m.\u001b[0m\u001b[33m Great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m is\u001b[0m\u001b[33m a\u001b[0m\u001b[33m matter\u001b[0m\u001b[33m of\u001b[0m\u001b[33m degree\u001b[0m\u001b[33m,\u001b[0m\u001b[33m and\u001b[0m\u001b[33m it\u001b[0m\u001b[33m can\u001b[0m\u001b[33m be\u001b[0m\u001b[33m difficult\u001b[0m\u001b[33m to\u001b[0m\u001b[33m determine\u001b[0m\u001b[33m whether\u001b[0m\u001b[33m someone\u001b[0m\u001b[33m has\u001b[0m\u001b[33m done\u001b[0m\u001b[33m great\u001b[0m\u001b[33m work\u001b[0m\u001b[33m until\u001b[0m\u001b[33m after\u001b[0m\u001b[33m the\u001b[0m\u001b[33m fact\u001b[0m\u001b[33m.\u001b[0m\u001b[97m\u001b[0m\n",
"\u001b[30m\u001b[0m"
]
}
],
"source": [
"from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient\n",
"\n",
"vector_db_id = \"my_demo_vector_db\"\n",
"client = LlamaStackClient(base_url=\"http://0.0.0.0:8321\")\n",
"\n",
"models = client.models.list()\n",
"\n",
"# Select the first ollama and first ollama's embedding model\n",
"model_id = next(m for m in models if m.model_type == \"llm\" and m.provider_id == \"ollama\").identifier\n",
"embedding_model = next(m for m in models if m.model_type == \"embedding\" and m.provider_id == \"ollama\")\n",
"embedding_model_id = embedding_model.identifier\n",
"embedding_dimension = embedding_model.metadata[\"embedding_dimension\"]\n",
"\n",
"_ = client.vector_dbs.register(\n",
" vector_db_id=vector_db_id,\n",
" embedding_model=embedding_model_id,\n",
" embedding_dimension=embedding_dimension,\n",
" provider_id=\"faiss\",\n",
")\n",
"source = \"https://www.paulgraham.com/greatwork.html\"\n",
"print(\"rag_tool> Ingesting document:\", source)\n",
"document = RAGDocument(\n",
" document_id=\"document_1\",\n",
" content=source,\n",
" mime_type=\"text/html\",\n",
" metadata={},\n",
")\n",
"client.tool_runtime.rag_tool.insert(\n",
" documents=[document],\n",
" vector_db_id=vector_db_id,\n",
" chunk_size_in_tokens=50,\n",
")\n",
"agent = Agent(\n",
" client,\n",
" model=model_id,\n",
" instructions=\"You are a helpful assistant\",\n",
" tools=[\n",
" {\n",
" \"name\": \"builtin::rag/knowledge_search\",\n",
" \"args\": {\"vector_db_ids\": [vector_db_id]},\n",
" }\n",
" ],\n",
")\n",
"\n",
"prompt = \"How do you do great work?\"\n",
"print(\"prompt>\", prompt)\n",
"\n",
"response = agent.create_turn(\n",
" messages=[{\"role\": \"user\", \"content\": prompt}],\n",
" session_id=agent.create_session(\"rag_session\"),\n",
" stream=True,\n",
")\n",
"\n",
"for log in AgentEventLogger().log(response):\n",
" log.print()"
]
},
{
"cell_type": "markdown",
"id": "341aaadf",
"metadata": {},
"source": [
"Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳"
]
},
{
"cell_type": "markdown",
"id": "e88e1185",
"metadata": {},
"source": [
"## Next Steps"
]
},
{
"cell_type": "markdown",
"id": "bcb73600",
"metadata": {},
"source": [
"Now you're ready to dive deeper into Llama Stack!\n",
"- Explore the [Detailed Tutorial](./detailed_tutorial.md).\n",
"- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).\n",
"- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).\n",
"- Learn about Llama Stack [Concepts](../concepts/index.md).\n",
"- Discover how to [Build Llama Stacks](../distributions/index.md).\n",
"- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.\n",
"- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials."
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
},
{
"cell_type": "markdown",
"id": "4CV1Q19BDMVw",
"metadata": {
"id": "4CV1Q19BDMVw"
},
"source": [
"## Step 1: Install and setup"
]
},
{
"cell_type": "markdown",
"id": "K4AvfUAJZOeS",
"metadata": {
"id": "K4AvfUAJZOeS"
},
"source": [
"### 1.1. Install uv and test inference with Ollama\n",
"\n",
"We'll install [uv](https://docs.astral.sh/uv/) to setup the Python virtual environment, along with [colab-xterm](https://github.com/InfuseAI/colab-xterm) for running command-line tools, and [Ollama](https://ollama.com/download) as the inference provider."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a2d7b85",
"metadata": {},
"outputs": [],
"source": [
"%pip install uv llama_stack llama-stack-client\n",
"\n",
"## If running on Collab:\n",
"# !pip install colab-xterm\n",
"# %load_ext colabxterm\n",
"\n",
"!curl https://ollama.ai/install.sh | sh"
]
},
{
"cell_type": "markdown",
"id": "39fa584b",
"metadata": {},
"source": [
"### 1.2. Test inference with Ollama"
]
},
{
"cell_type": "markdown",
"id": "3bf81522",
"metadata": {},
"source": [
"Well now launch a terminal and run inference on a Llama model with Ollama to verify that the model is working correctly."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7e8e0f1",
"metadata": {},
"outputs": [],
"source": [
"## If running on Colab:\n",
"# %xterm\n",
"\n",
"## To be ran in the terminal:\n",
"# ollama serve &\n",
"# ollama run llama3.2:3b --keepalive 60m"
]
},
{
"cell_type": "markdown",
"id": "f3c5f243",
"metadata": {},
"source": [
"If successful, you should see the model respond to a prompt.\n",
"\n",
"...\n",
"```\n",
">>> hi\n",
"Hello! How can I assist you today?\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "oDUB7M_qe-Gs",
"metadata": {
"id": "oDUB7M_qe-Gs"
},
"source": [
"## Step 2: Run the Llama Stack server\n",
"\n",
"In this showcase, we will start a Llama Stack server that is running locally."
]
},
{
"cell_type": "markdown",
"id": "732eadc6",
"metadata": {},
"source": [
"### 2.1. Setup the Llama Stack Server"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "J2kGed0R5PSf",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "J2kGed0R5PSf",
"outputId": "2478ea60-8d35-48a1-b011-f233831740c5"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[2mUsing Python 3.12.12 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/test\u001b[0m\n",
"\u001b[2mAudited \u001b[1m52 packages\u001b[0m \u001b[2min 1.56s\u001b[0m\u001b[0m\n",
"\u001b[2mUsing Python 3.12.12 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/test\u001b[0m\n",
"\u001b[2mAudited \u001b[1m3 packages\u001b[0m \u001b[2min 122ms\u001b[0m\u001b[0m\n",
"\u001b[2mUsing Python 3.12.12 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/test\u001b[0m\n",
"\u001b[2mAudited \u001b[1m3 packages\u001b[0m \u001b[2min 197ms\u001b[0m\u001b[0m\n",
"\u001b[2mUsing Python 3.12.12 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/test\u001b[0m\n",
"\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 11ms\u001b[0m\u001b[0m\n"
]
}
],
"source": [
"import os\n",
"import subprocess\n",
"\n",
"if \"UV_SYSTEM_PYTHON\" in os.environ:\n",
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
"\n",
"# this command installs all the dependencies needed for the llama stack server with the ollama inference provider\n",
"!uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install\n",
"\n",
"def run_llama_stack_server_background():\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
" f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter\",\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",
" text=True\n",
" )\n",
"\n",
" print(f\"Starting Llama Stack server with PID: {process.pid}\")\n",
" return process\n",
"\n",
"def wait_for_server_to_start():\n",
" import requests\n",
" from requests.exceptions import ConnectionError\n",
" import time\n",
"\n",
" url = \"http://0.0.0.0:8321/v1/health\"\n",
" max_retries = 30\n",
" retry_interval = 1\n",
"\n",
" print(\"Waiting for server to start\", end=\"\")\n",
" for _ in range(max_retries):\n",
" try:\n",
" response = requests.get(url)\n",
" if response.status_code == 200:\n",
" print(\"\\nServer is ready!\")\n",
" return True\n",
" except ConnectionError:\n",
" print(\".\", end=\"\", flush=True)\n",
" time.sleep(retry_interval)\n",
"\n",
" print(\"\\nServer failed to start after\", max_retries * retry_interval, \"seconds\")\n",
" return False\n",
"\n",
"\n",
"# use this helper if needed to kill the server\n",
"def kill_llama_stack_server():\n",
" # Kill any existing llama stack server processes\n",
" os.system(\"ps aux | grep -v grep | grep llama_stack.core.server.server | awk '{print $2}' | xargs kill -9\")\n"
]
},
{
"cell_type": "markdown",
"id": "c40e9efd",
"metadata": {},
"source": [
"### 2.2. Start the Llama Stack Server"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f779283d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting Llama Stack server with PID: 20778\n",
"Waiting for server to start........\n",
"Server is ready!\n"
]
}
],
"source": [
"server_process = run_llama_stack_server_background()\n",
"assert wait_for_server_to_start()"
]
},
{
"cell_type": "markdown",
"id": "28477c03",
"metadata": {},
"source": [
"## Step 3: Run the demo"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7da71011",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:httpx:HTTP Request: GET http://0.0.0.0:8321/v1/models \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/files \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/vector_stores \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/conversations \"HTTP/1.1 200 OK\"\n",
"INFO:httpx:HTTP Request: POST http://0.0.0.0:8321/v1/responses \"HTTP/1.1 200 OK\"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"prompt> How do you do great work?\n",
"🤔 Doing great work involves a combination of skills, habits, and mindsets. Here are some key principles:\n",
"\n",
"1. **Set Clear Goals**: Start with a clear vision of what you want to achieve. Define specific, measurable, achievable, relevant, and time-bound (SMART) goals.\n",
"\n",
"2. **Plan and Prioritize**: Break your goals into smaller, manageable tasks. Prioritize these tasks based on their importance and urgency.\n",
"\n",
"3. **Focus on Quality**: Aim for high-quality outcomes rather than just finishing tasks. Pay attention to detail, and ensure your work meets or exceeds standards.\n",
"\n",
"4. **Stay Organized**: Keep your workspace, both physical and digital, organized to help you stay focused and efficient.\n",
"\n",
"5. **Manage Your Time**: Use time management techniques such as the Pomodoro Technique, time blocking, or the Eisenhower Box to maximize productivity.\n",
"\n",
"6. **Seek Feedback and Learn**: Regularly seek feedback from peers, mentors, or supervisors. Use constructive criticism to improve continuously.\n",
"\n",
"7. **Innovate and Improve**: Look for ways to improve processes or introduce new ideas. Be open to change and willing to adapt.\n",
"\n",
"8. **Stay Motivated and Persistent**: Keep your end goals in mind to stay motivated. Overcome setbacks with resilience and persistence.\n",
"\n",
"9. **Balance and Rest**: Ensure you maintain a healthy work-life balance. Take breaks and manage stress to sustain long-term productivity.\n",
"\n",
"10. **Reflect and Adjust**: Regularly assess your progress and adjust your strategies as needed. Reflect on what works well and what doesn't.\n",
"\n",
"By incorporating these elements, you can consistently produce high-quality work and achieve excellence in your endeavors.\n"
]
}
],
"source": [
"from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient\n",
"import requests\n",
"\n",
"vector_store_id = \"my_demo_vector_db\"\n",
"client = LlamaStackClient(base_url=\"http://0.0.0.0:8321\")\n",
"\n",
"models = client.models.list()\n",
"\n",
"# Select the first ollama and first ollama's embedding model\n",
"model_id = next(m for m in models if m.model_type == \"llm\" and m.provider_id == \"ollama\").identifier\n",
"\n",
"\n",
"source = \"https://www.paulgraham.com/greatwork.html\"\n",
"response = requests.get(source)\n",
"file = client.files.create(\n",
" file=response.content,\n",
" purpose='assistants'\n",
")\n",
"vector_store = client.vector_stores.create(\n",
" name=vector_store_id,\n",
" file_ids=[file.id],\n",
")\n",
"\n",
"agent = Agent(\n",
" client,\n",
" model=model_id,\n",
" instructions=\"You are a helpful assistant\",\n",
" tools=[\n",
" {\n",
" \"type\": \"file_search\",\n",
" \"vector_store_ids\": [vector_store_id],\n",
" }\n",
" ],\n",
")\n",
"\n",
"prompt = \"How do you do great work?\"\n",
"print(\"prompt>\", prompt)\n",
"\n",
"response = agent.create_turn(\n",
" messages=[{\"role\": \"user\", \"content\": prompt}],\n",
" session_id=agent.create_session(\"rag_session\"),\n",
" stream=True,\n",
")\n",
"\n",
"for log in AgentEventLogger().log(response):\n",
" print(log, end=\"\")"
]
},
{
"cell_type": "markdown",
"id": "341aaadf",
"metadata": {},
"source": [
"Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳"
]
},
{
"cell_type": "markdown",
"id": "e88e1185",
"metadata": {},
"source": [
"## Next Steps"
]
},
{
"cell_type": "markdown",
"id": "bcb73600",
"metadata": {},
"source": [
"Now you're ready to dive deeper into Llama Stack!\n",
"- Explore the [Detailed Tutorial](./detailed_tutorial.md).\n",
"- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).\n",
"- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).\n",
"- Learn about Llama Stack [Concepts](../concepts/index.md).\n",
"- Discover how to [Build Llama Stacks](../distributions/index.md).\n",
"- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.\n",
"- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials."
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
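Compared with the old notebook, the new demo no longer registers a vector DB or calls the `builtin::rag/knowledge_search` tool; it uploads the document through the Files API, wraps it in a vector store, and points the agent's `file_search` tool at that store. Condensed from the cells above (the only tweak is passing the created store's id to `vector_store_ids`, where the notebook passes the store's name):

```python
import requests
from llama_stack_client import Agent, LlamaStackClient

client = LlamaStackClient(base_url="http://0.0.0.0:8321")
model_id = next(m for m in client.models.list() if m.model_type == "llm").identifier

# Upload the source document and index it in a vector store.
html = requests.get("https://www.paulgraham.com/greatwork.html")
file = client.files.create(file=html.content, purpose="assistants")
vector_store = client.vector_stores.create(name="my_demo_vector_db", file_ids=[file.id])

# The agent now uses the built-in file_search tool instead of builtin::rag.
agent = Agent(
    client,
    model=model_id,
    instructions="You are a helpful assistant",
    tools=[{"type": "file_search", "vector_store_ids": [vector_store.id]}],
)
```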

View file

@@ -47,11 +47,11 @@ function QuickStart() {
<pre><code>{`# Install uv and start Ollama
ollama run llama3.2:3b --keepalive 60m
+# Install server dependencies
+uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
# Run Llama Stack server
-OLLAMA_URL=http://localhost:11434 \\
-  uv run --with llama-stack \\
-  llama stack build --distro starter \\
-  --image-type venv --run
+OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter
# Try the Python SDK
from llama_stack_client import LlamaStackClient
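The hunk cuts off at the SDK import; for reference, the corresponding first steps from the getting-started notebook elsewhere in this change look roughly like this (base URL and model selection are illustrative):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Pick the first LLM the server exposes.
models = client.models.list()
model_id = next(m for m in models if m.model_type == "llm").identifier
print("Using model:", model_id)
```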

File diff suppressed because it is too large

View file

@@ -1569,16 +1569,16 @@ paths:
required: true
deprecated: true
x-llama-stack-extra-body-params:
-- name: shields
+- name: guardrails
schema:
type: array
items:
oneOf:
- type: string
-- $ref: '#/components/schemas/ResponseShieldSpec'
+- $ref: '#/components/schemas/ResponseGuardrailSpec'
description: >-
-List of shields to apply during response generation. Shields provide safety
-and content moderation.
+List of guardrails to apply during response generation. Guardrails provide
+safety and content moderation.
required: false
/v1/openai/v1/responses/{response_id}:
get:
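The renamed extra-body parameter accepts either plain strings or `ResponseGuardrailSpec` objects. A hypothetical call passing it through an OpenAI-style `extra_body` follows; the guardrail identifier is a placeholder, and whether your SDK forwards extra body fields this way should be verified against the client you use:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

response = client.responses.create(
    model="llama3.2:3b",  # placeholder model id
    input="Summarize our content policy.",
    extra_body={
        "guardrails": [
            "llama-guard",            # string shorthand (placeholder identifier)
            {"type": "llama-guard"},  # ResponseGuardrailSpec form
        ]
    },
)
```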
@ -2600,238 +2600,6 @@ paths:
$ref: '#/components/schemas/SupervisedFineTuneRequest' $ref: '#/components/schemas/SupervisedFineTuneRequest'
required: true required: true
deprecated: true deprecated: true
/v1/telemetry/metrics/{metric_name}:
post:
responses:
'200':
description: A QueryMetricsResponse.
content:
application/json:
schema:
$ref: '#/components/schemas/QueryMetricsResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Query metrics.
description: Query metrics.
parameters:
- name: metric_name
in: path
description: The name of the metric to query.
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/QueryMetricsRequest'
required: true
deprecated: true
/v1/telemetry/spans:
post:
responses:
'200':
description: A QuerySpansResponse.
content:
application/json:
schema:
$ref: '#/components/schemas/QuerySpansResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Query spans.
description: Query spans.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/QuerySpansRequest'
required: true
deprecated: true
/v1/telemetry/spans/export:
post:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Save spans to a dataset.
description: Save spans to a dataset.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/SaveSpansToDatasetRequest'
required: true
deprecated: true
/v1/telemetry/spans/{span_id}/tree:
post:
responses:
'200':
description: A QuerySpanTreeResponse.
content:
application/json:
schema:
$ref: '#/components/schemas/QuerySpanTreeResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Get a span tree by its ID.
description: Get a span tree by its ID.
parameters:
- name: span_id
in: path
description: The ID of the span to get the tree from.
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/GetSpanTreeRequest'
required: true
deprecated: true
/v1/telemetry/traces:
post:
responses:
'200':
description: A QueryTracesResponse.
content:
application/json:
schema:
$ref: '#/components/schemas/QueryTracesResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Query traces.
description: Query traces.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/QueryTracesRequest'
required: true
deprecated: true
/v1/telemetry/traces/{trace_id}:
get:
responses:
'200':
description: A Trace.
content:
application/json:
schema:
$ref: '#/components/schemas/Trace'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Get a trace by its ID.
description: Get a trace by its ID.
parameters:
- name: trace_id
in: path
description: The ID of the trace to get.
required: true
schema:
type: string
deprecated: true
/v1/telemetry/traces/{trace_id}/spans/{span_id}:
get:
responses:
'200':
description: A Span.
content:
application/json:
schema:
$ref: '#/components/schemas/Span'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Get a span by its ID.
description: Get a span by its ID.
parameters:
- name: trace_id
in: path
description: >-
The ID of the trace to get the span from.
required: true
schema:
type: string
- name: span_id
in: path
description: The ID of the span to get.
required: true
schema:
type: string
deprecated: true
jsonSchemaDialect: >-
https://json-schema.org/draft/2020-12/schema
components:
@@ -4346,7 +4114,7 @@ components:
enum:
- model
- shield
-- vector_db
+- vector_store
- dataset
- scoring_function
- benchmark
@@ -4535,7 +4303,7 @@ components:
enum:
- model
- shield
-- vector_db
+- vector_store
- dataset
- scoring_function
- benchmark
@@ -6564,6 +6332,25 @@ components:
url_citation: '#/components/schemas/OpenAIResponseAnnotationCitation'
container_file_citation: '#/components/schemas/OpenAIResponseAnnotationContainerFileCitation'
file_path: '#/components/schemas/OpenAIResponseAnnotationFilePath'
OpenAIResponseContentPartRefusal:
type: object
properties:
type:
type: string
const: refusal
default: refusal
description: >-
Content part type identifier, always "refusal"
refusal:
type: string
description: Refusal text supplied by the model
additionalProperties: false
required:
- type
- refusal
title: OpenAIResponseContentPartRefusal
description: >-
Refusal content within a streamed response part.
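For reference, an instance of the `OpenAIResponseContentPartRefusal` schema added above is just a two-field object (the refusal text here is made up for illustration):

```python
# Matches the schema above: both fields are required, type is always "refusal".
refusal_part = {
    "type": "refusal",
    "refusal": "I can't help with that request.",
}
```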
OpenAIResponseError:
type: object
properties:
@@ -6590,6 +6377,8 @@ components:
- $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse'
+- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
+- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
- $ref: '#/components/schemas/OpenAIResponseMessage'
"OpenAIResponseInputFunctionToolCallOutput":
type: object
@@ -6945,6 +6734,10 @@ components:
$ref: '#/components/schemas/OpenAIResponseUsage'
description: >-
(Optional) Token usage information for the response
+instructions:
+type: string
+description: >-
+(Optional) System message inserted into the model's context
input:
type: array
items:
@@ -6985,6 +6778,15 @@ components:
mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
mcp_approval_request: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
OpenAIResponseOutputMessageContent:
+oneOf:
+- $ref: '#/components/schemas/OpenAIResponseOutputMessageContentOutputText'
+- $ref: '#/components/schemas/OpenAIResponseContentPartRefusal'
+discriminator:
+propertyName: type
+mapping:
+output_text: '#/components/schemas/OpenAIResponseOutputMessageContentOutputText'
+refusal: '#/components/schemas/OpenAIResponseContentPartRefusal'
+"OpenAIResponseOutputMessageContentOutputText":
type: object
properties:
text:
@@ -7379,18 +7181,18 @@ components:
- total_tokens
title: OpenAIResponseUsage
description: Usage information for OpenAI response.
-ResponseShieldSpec:
+ResponseGuardrailSpec:
type: object
properties:
type:
type: string
-description: The type/identifier of the shield.
+description: The type/identifier of the guardrail.
additionalProperties: false
required:
- type
-title: ResponseShieldSpec
+title: ResponseGuardrailSpec
description: >-
-Specification for a shield to apply during response generation.
+Specification for a guardrail to apply during response generation.
OpenAIResponseInputTool:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch'
@@ -7605,6 +7407,10 @@ components:
$ref: '#/components/schemas/OpenAIResponseUsage'
description: >-
(Optional) Token usage information for the response
+instructions:
+type: string
+description: >-
+(Optional) System message inserted into the model's context
additionalProperties: false
required:
- created_at
@@ -7696,25 +7502,6 @@ components:
title: OpenAIResponseContentPartReasoningText
description: >-
Reasoning text emitted as part of a streamed response.
OpenAIResponseContentPartRefusal:
type: object
properties:
type:
type: string
const: refusal
default: refusal
description: >-
Content part type identifier, always "refusal"
refusal:
type: string
description: Refusal text supplied by the model
additionalProperties: false
required:
- type
- refusal
title: OpenAIResponseContentPartRefusal
description: >-
Refusal content within a streamed response part.
OpenAIResponseObjectStream:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
@@ -10341,434 +10128,6 @@ components:
- hyperparam_search_config
- logger_config
title: SupervisedFineTuneRequest
QueryMetricsRequest:
type: object
properties:
start_time:
type: integer
description: The start time of the metric to query.
end_time:
type: integer
description: The end time of the metric to query.
granularity:
type: string
description: The granularity of the metric to query.
query_type:
type: string
enum:
- range
- instant
description: The type of query to perform.
label_matchers:
type: array
items:
type: object
properties:
name:
type: string
description: The name of the label to match
value:
type: string
description: The value to match against
operator:
type: string
enum:
- '='
- '!='
- =~
- '!~'
description: >-
The comparison operator to use for matching
default: '='
additionalProperties: false
required:
- name
- value
- operator
title: MetricLabelMatcher
description: >-
A matcher for filtering metrics by label values.
description: >-
The label matchers to apply to the metric.
additionalProperties: false
required:
- start_time
- query_type
title: QueryMetricsRequest
MetricDataPoint:
type: object
properties:
timestamp:
type: integer
description: >-
Unix timestamp when the metric value was recorded
value:
type: number
description: >-
The numeric value of the metric at this timestamp
unit:
type: string
additionalProperties: false
required:
- timestamp
- value
- unit
title: MetricDataPoint
description: >-
A single data point in a metric time series.
MetricLabel:
type: object
properties:
name:
type: string
description: The name of the label
value:
type: string
description: The value of the label
additionalProperties: false
required:
- name
- value
title: MetricLabel
description: A label associated with a metric.
MetricSeries:
type: object
properties:
metric:
type: string
description: The name of the metric
labels:
type: array
items:
$ref: '#/components/schemas/MetricLabel'
description: >-
List of labels associated with this metric series
values:
type: array
items:
$ref: '#/components/schemas/MetricDataPoint'
description: >-
List of data points in chronological order
additionalProperties: false
required:
- metric
- labels
- values
title: MetricSeries
description: A time series of metric data points.
QueryMetricsResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/MetricSeries'
description: >-
List of metric series matching the query criteria
additionalProperties: false
required:
- data
title: QueryMetricsResponse
description: >-
Response containing metric time series data.
QueryCondition:
type: object
properties:
key:
type: string
description: The attribute key to filter on
op:
$ref: '#/components/schemas/QueryConditionOp'
description: The comparison operator to apply
value:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: The value to compare against
additionalProperties: false
required:
- key
- op
- value
title: QueryCondition
description: A condition for filtering query results.
QueryConditionOp:
type: string
enum:
- eq
- ne
- gt
- lt
title: QueryConditionOp
description: >-
Comparison operators for query conditions.
QuerySpansRequest:
type: object
properties:
attribute_filters:
type: array
items:
$ref: '#/components/schemas/QueryCondition'
description: >-
The attribute filters to apply to the spans.
attributes_to_return:
type: array
items:
type: string
description: The attributes to return in the spans.
max_depth:
type: integer
description: The maximum depth of the tree.
additionalProperties: false
required:
- attribute_filters
- attributes_to_return
title: QuerySpansRequest
Span:
type: object
properties:
span_id:
type: string
description: Unique identifier for the span
trace_id:
type: string
description: >-
Unique identifier for the trace this span belongs to
parent_span_id:
type: string
description: >-
(Optional) Unique identifier for the parent span, if this is a child span
name:
type: string
description: >-
Human-readable name describing the operation this span represents
start_time:
type: string
format: date-time
description: Timestamp when the operation began
end_time:
type: string
format: date-time
description: >-
(Optional) Timestamp when the operation finished, if completed
attributes:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
(Optional) Key-value pairs containing additional metadata about the span
additionalProperties: false
required:
- span_id
- trace_id
- name
- start_time
title: Span
description: >-
A span representing a single operation within a trace.
QuerySpansResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/Span'
description: >-
List of spans matching the query criteria
additionalProperties: false
required:
- data
title: QuerySpansResponse
description: Response containing a list of spans.
SaveSpansToDatasetRequest:
type: object
properties:
attribute_filters:
type: array
items:
$ref: '#/components/schemas/QueryCondition'
description: >-
The attribute filters to apply to the spans.
attributes_to_save:
type: array
items:
type: string
description: The attributes to save to the dataset.
dataset_id:
type: string
description: >-
The ID of the dataset to save the spans to.
max_depth:
type: integer
description: The maximum depth of the tree.
additionalProperties: false
required:
- attribute_filters
- attributes_to_save
- dataset_id
title: SaveSpansToDatasetRequest
GetSpanTreeRequest:
type: object
properties:
attributes_to_return:
type: array
items:
type: string
description: The attributes to return in the tree.
max_depth:
type: integer
description: The maximum depth of the tree.
additionalProperties: false
title: GetSpanTreeRequest
SpanStatus:
type: string
enum:
- ok
- error
title: SpanStatus
description: >-
The status of a span indicating whether it completed successfully or with
an error.
SpanWithStatus:
type: object
properties:
span_id:
type: string
description: Unique identifier for the span
trace_id:
type: string
description: >-
Unique identifier for the trace this span belongs to
parent_span_id:
type: string
description: >-
(Optional) Unique identifier for the parent span, if this is a child span
name:
type: string
description: >-
Human-readable name describing the operation this span represents
start_time:
type: string
format: date-time
description: Timestamp when the operation began
end_time:
type: string
format: date-time
description: >-
(Optional) Timestamp when the operation finished, if completed
attributes:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
(Optional) Key-value pairs containing additional metadata about the span
status:
$ref: '#/components/schemas/SpanStatus'
description: >-
(Optional) The current status of the span
additionalProperties: false
required:
- span_id
- trace_id
- name
- start_time
title: SpanWithStatus
description: A span that includes status information.
QuerySpanTreeResponse:
type: object
properties:
data:
type: object
additionalProperties:
$ref: '#/components/schemas/SpanWithStatus'
description: >-
Dictionary mapping span IDs to spans with status information
additionalProperties: false
required:
- data
title: QuerySpanTreeResponse
description: >-
Response containing a tree structure of spans.
QueryTracesRequest:
type: object
properties:
attribute_filters:
type: array
items:
$ref: '#/components/schemas/QueryCondition'
description: >-
The attribute filters to apply to the traces.
limit:
type: integer
description: The limit of traces to return.
offset:
type: integer
description: The offset of the traces to return.
order_by:
type: array
items:
type: string
description: The order by of the traces to return.
additionalProperties: false
title: QueryTracesRequest
Trace:
type: object
properties:
trace_id:
type: string
description: Unique identifier for the trace
root_span_id:
type: string
description: >-
Unique identifier for the root span that started this trace
start_time:
type: string
format: date-time
description: Timestamp when the trace began
end_time:
type: string
format: date-time
description: >-
(Optional) Timestamp when the trace finished, if completed
additionalProperties: false
required:
- trace_id
- root_span_id
- start_time
title: Trace
description: >-
A trace representing the complete execution path of a request across multiple
operations.
QueryTracesResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/Trace'
description: >-
List of traces matching the query criteria
additionalProperties: false
required:
- data
title: QueryTracesResponse
description: Response containing a list of traces.
responses:
BadRequest400:
description: The request was invalid or malformed
@@ -10845,9 +10204,9 @@ tags:
- name: Datasets
description: ''
- name: Eval
-description: ''
+description: >-
-x-displayName: >-
Llama Stack Evaluation API for running evaluations on model and agent candidates.
+x-displayName: Evaluations
- name: Files
description: >-
This API is used to upload documents that can be used with other Llama Stack
@@ -10874,8 +10233,6 @@ tags:
- name: Safety
description: OpenAI-compatible Moderations API.
x-displayName: Safety
-- name: Telemetry
-description: ''
- name: VectorIO
description: ''
x-tagGroups:
@@ -10891,5 +10248,4 @@ x-tagGroups:
- Models
- PostTraining (Coming Soon)
- Safety
-- Telemetry
- VectorIO

View file

@@ -1711,343 +1711,6 @@
},
"deprecated": false
}
},
"/v1alpha/telemetry/metrics/{metric_name}": {
"post": {
"responses": {
"200": {
"description": "A QueryMetricsResponse.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QueryMetricsResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Telemetry"
],
"summary": "Query metrics.",
"description": "Query metrics.",
"parameters": [
{
"name": "metric_name",
"in": "path",
"description": "The name of the metric to query.",
"required": true,
"schema": {
"type": "string"
}
}
],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QueryMetricsRequest"
}
}
},
"required": true
},
"deprecated": false
}
},
"/v1alpha/telemetry/spans": {
"post": {
"responses": {
"200": {
"description": "A QuerySpansResponse.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QuerySpansResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Telemetry"
],
"summary": "Query spans.",
"description": "Query spans.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QuerySpansRequest"
}
}
},
"required": true
},
"deprecated": false
}
},
"/v1alpha/telemetry/spans/export": {
"post": {
"responses": {
"200": {
"description": "OK"
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Telemetry"
],
"summary": "Save spans to a dataset.",
"description": "Save spans to a dataset.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/SaveSpansToDatasetRequest"
}
}
},
"required": true
},
"deprecated": false
}
},
"/v1alpha/telemetry/spans/{span_id}/tree": {
"post": {
"responses": {
"200": {
"description": "A QuerySpanTreeResponse.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QuerySpanTreeResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Telemetry"
],
"summary": "Get a span tree by its ID.",
"description": "Get a span tree by its ID.",
"parameters": [
{
"name": "span_id",
"in": "path",
"description": "The ID of the span to get the tree from.",
"required": true,
"schema": {
"type": "string"
}
}
],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GetSpanTreeRequest"
}
}
},
"required": true
},
"deprecated": false
}
},
"/v1alpha/telemetry/traces": {
"post": {
"responses": {
"200": {
"description": "A QueryTracesResponse.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QueryTracesResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Telemetry"
],
"summary": "Query traces.",
"description": "Query traces.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/QueryTracesRequest"
}
}
},
"required": true
},
"deprecated": false
}
},
"/v1alpha/telemetry/traces/{trace_id}": {
"get": {
"responses": {
"200": {
"description": "A Trace.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Trace"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Telemetry"
],
"summary": "Get a trace by its ID.",
"description": "Get a trace by its ID.",
"parameters": [
{
"name": "trace_id",
"in": "path",
"description": "The ID of the trace to get.",
"required": true,
"schema": {
"type": "string"
}
}
],
"deprecated": false
}
},
"/v1alpha/telemetry/traces/{trace_id}/spans/{span_id}": {
"get": {
"responses": {
"200": {
"description": "A Span.",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Span"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Telemetry"
],
"summary": "Get a span by its ID.",
"description": "Get a span by its ID.",
"parameters": [
{
"name": "trace_id",
"in": "path",
"description": "The ID of the trace to get the span from.",
"required": true,
"schema": {
"type": "string"
}
},
{
"name": "span_id",
"in": "path",
"description": "The ID of the span to get.",
"required": true,
"schema": {
"type": "string"
}
}
],
"deprecated": false
}
} }
}, },
"jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema", "jsonSchemaDialect": "https://json-schema.org/draft/2020-12/schema",
@ -2187,7 +1850,7 @@
"enum": [ "enum": [
"model", "model",
"shield", "shield",
"vector_db", "vector_store",
"dataset", "dataset",
"scoring_function", "scoring_function",
"benchmark", "benchmark",
@ -4320,7 +3983,7 @@
"enum": [ "enum": [
"model", "model",
"shield", "shield",
"vector_db", "vector_store",
"dataset", "dataset",
"scoring_function", "scoring_function",
"benchmark", "benchmark",
@ -5765,561 +5428,6 @@
"logger_config" "logger_config"
], ],
"title": "SupervisedFineTuneRequest" "title": "SupervisedFineTuneRequest"
},
"QueryMetricsRequest": {
"type": "object",
"properties": {
"start_time": {
"type": "integer",
"description": "The start time of the metric to query."
},
"end_time": {
"type": "integer",
"description": "The end time of the metric to query."
},
"granularity": {
"type": "string",
"description": "The granularity of the metric to query."
},
"query_type": {
"type": "string",
"enum": [
"range",
"instant"
],
"description": "The type of query to perform."
},
"label_matchers": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "The name of the label to match"
},
"value": {
"type": "string",
"description": "The value to match against"
},
"operator": {
"type": "string",
"enum": [
"=",
"!=",
"=~",
"!~"
],
"description": "The comparison operator to use for matching",
"default": "="
}
},
"additionalProperties": false,
"required": [
"name",
"value",
"operator"
],
"title": "MetricLabelMatcher",
"description": "A matcher for filtering metrics by label values."
},
"description": "The label matchers to apply to the metric."
}
},
"additionalProperties": false,
"required": [
"start_time",
"query_type"
],
"title": "QueryMetricsRequest"
},
"MetricDataPoint": {
"type": "object",
"properties": {
"timestamp": {
"type": "integer",
"description": "Unix timestamp when the metric value was recorded"
},
"value": {
"type": "number",
"description": "The numeric value of the metric at this timestamp"
},
"unit": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"timestamp",
"value",
"unit"
],
"title": "MetricDataPoint",
"description": "A single data point in a metric time series."
},
"MetricLabel": {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "The name of the label"
},
"value": {
"type": "string",
"description": "The value of the label"
}
},
"additionalProperties": false,
"required": [
"name",
"value"
],
"title": "MetricLabel",
"description": "A label associated with a metric."
},
"MetricSeries": {
"type": "object",
"properties": {
"metric": {
"type": "string",
"description": "The name of the metric"
},
"labels": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricLabel"
},
"description": "List of labels associated with this metric series"
},
"values": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricDataPoint"
},
"description": "List of data points in chronological order"
}
},
"additionalProperties": false,
"required": [
"metric",
"labels",
"values"
],
"title": "MetricSeries",
"description": "A time series of metric data points."
},
"QueryMetricsResponse": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricSeries"
},
"description": "List of metric series matching the query criteria"
}
},
"additionalProperties": false,
"required": [
"data"
],
"title": "QueryMetricsResponse",
"description": "Response containing metric time series data."
},
"QueryCondition": {
"type": "object",
"properties": {
"key": {
"type": "string",
"description": "The attribute key to filter on"
},
"op": {
"$ref": "#/components/schemas/QueryConditionOp",
"description": "The comparison operator to apply"
},
"value": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
],
"description": "The value to compare against"
}
},
"additionalProperties": false,
"required": [
"key",
"op",
"value"
],
"title": "QueryCondition",
"description": "A condition for filtering query results."
},
"QueryConditionOp": {
"type": "string",
"enum": [
"eq",
"ne",
"gt",
"lt"
],
"title": "QueryConditionOp",
"description": "Comparison operators for query conditions."
},
"QuerySpansRequest": {
"type": "object",
"properties": {
"attribute_filters": {
"type": "array",
"items": {
"$ref": "#/components/schemas/QueryCondition"
},
"description": "The attribute filters to apply to the spans."
},
"attributes_to_return": {
"type": "array",
"items": {
"type": "string"
},
"description": "The attributes to return in the spans."
},
"max_depth": {
"type": "integer",
"description": "The maximum depth of the tree."
}
},
"additionalProperties": false,
"required": [
"attribute_filters",
"attributes_to_return"
],
"title": "QuerySpansRequest"
},
"Span": {
"type": "object",
"properties": {
"span_id": {
"type": "string",
"description": "Unique identifier for the span"
},
"trace_id": {
"type": "string",
"description": "Unique identifier for the trace this span belongs to"
},
"parent_span_id": {
"type": "string",
"description": "(Optional) Unique identifier for the parent span, if this is a child span"
},
"name": {
"type": "string",
"description": "Human-readable name describing the operation this span represents"
},
"start_time": {
"type": "string",
"format": "date-time",
"description": "Timestamp when the operation began"
},
"end_time": {
"type": "string",
"format": "date-time",
"description": "(Optional) Timestamp when the operation finished, if completed"
},
"attributes": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "(Optional) Key-value pairs containing additional metadata about the span"
}
},
"additionalProperties": false,
"required": [
"span_id",
"trace_id",
"name",
"start_time"
],
"title": "Span",
"description": "A span representing a single operation within a trace."
},
"QuerySpansResponse": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Span"
},
"description": "List of spans matching the query criteria"
}
},
"additionalProperties": false,
"required": [
"data"
],
"title": "QuerySpansResponse",
"description": "Response containing a list of spans."
},
"SaveSpansToDatasetRequest": {
"type": "object",
"properties": {
"attribute_filters": {
"type": "array",
"items": {
"$ref": "#/components/schemas/QueryCondition"
},
"description": "The attribute filters to apply to the spans."
},
"attributes_to_save": {
"type": "array",
"items": {
"type": "string"
},
"description": "The attributes to save to the dataset."
},
"dataset_id": {
"type": "string",
"description": "The ID of the dataset to save the spans to."
},
"max_depth": {
"type": "integer",
"description": "The maximum depth of the tree."
}
},
"additionalProperties": false,
"required": [
"attribute_filters",
"attributes_to_save",
"dataset_id"
],
"title": "SaveSpansToDatasetRequest"
},
"GetSpanTreeRequest": {
"type": "object",
"properties": {
"attributes_to_return": {
"type": "array",
"items": {
"type": "string"
},
"description": "The attributes to return in the tree."
},
"max_depth": {
"type": "integer",
"description": "The maximum depth of the tree."
}
},
"additionalProperties": false,
"title": "GetSpanTreeRequest"
},
"SpanStatus": {
"type": "string",
"enum": [
"ok",
"error"
],
"title": "SpanStatus",
"description": "The status of a span indicating whether it completed successfully or with an error."
},
"SpanWithStatus": {
"type": "object",
"properties": {
"span_id": {
"type": "string",
"description": "Unique identifier for the span"
},
"trace_id": {
"type": "string",
"description": "Unique identifier for the trace this span belongs to"
},
"parent_span_id": {
"type": "string",
"description": "(Optional) Unique identifier for the parent span, if this is a child span"
},
"name": {
"type": "string",
"description": "Human-readable name describing the operation this span represents"
},
"start_time": {
"type": "string",
"format": "date-time",
"description": "Timestamp when the operation began"
},
"end_time": {
"type": "string",
"format": "date-time",
"description": "(Optional) Timestamp when the operation finished, if completed"
},
"attributes": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
},
"description": "(Optional) Key-value pairs containing additional metadata about the span"
},
"status": {
"$ref": "#/components/schemas/SpanStatus",
"description": "(Optional) The current status of the span"
}
},
"additionalProperties": false,
"required": [
"span_id",
"trace_id",
"name",
"start_time"
],
"title": "SpanWithStatus",
"description": "A span that includes status information."
},
"QuerySpanTreeResponse": {
"type": "object",
"properties": {
"data": {
"type": "object",
"additionalProperties": {
"$ref": "#/components/schemas/SpanWithStatus"
},
"description": "Dictionary mapping span IDs to spans with status information"
}
},
"additionalProperties": false,
"required": [
"data"
],
"title": "QuerySpanTreeResponse",
"description": "Response containing a tree structure of spans."
},
"QueryTracesRequest": {
"type": "object",
"properties": {
"attribute_filters": {
"type": "array",
"items": {
"$ref": "#/components/schemas/QueryCondition"
},
"description": "The attribute filters to apply to the traces."
},
"limit": {
"type": "integer",
"description": "The limit of traces to return."
},
"offset": {
"type": "integer",
"description": "The offset of the traces to return."
},
"order_by": {
"type": "array",
"items": {
"type": "string"
},
"description": "The order by of the traces to return."
}
},
"additionalProperties": false,
"title": "QueryTracesRequest"
},
"Trace": {
"type": "object",
"properties": {
"trace_id": {
"type": "string",
"description": "Unique identifier for the trace"
},
"root_span_id": {
"type": "string",
"description": "Unique identifier for the root span that started this trace"
},
"start_time": {
"type": "string",
"format": "date-time",
"description": "Timestamp when the trace began"
},
"end_time": {
"type": "string",
"format": "date-time",
"description": "(Optional) Timestamp when the trace finished, if completed"
}
},
"additionalProperties": false,
"required": [
"trace_id",
"root_span_id",
"start_time"
],
"title": "Trace",
"description": "A trace representing the complete execution path of a request across multiple operations."
},
"QueryTracesResponse": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Trace"
},
"description": "List of traces matching the query criteria"
}
},
"additionalProperties": false,
"required": [
"data"
],
"title": "QueryTracesResponse",
"description": "Response containing a list of traces."
} }
}, },
"responses": { "responses": {
@ -6410,16 +5518,12 @@
}, },
{ {
"name": "Eval", "name": "Eval",
"description": "", "description": "Llama Stack Evaluation API for running evaluations on model and agent candidates.",
"x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates." "x-displayName": "Evaluations"
}, },
{ {
"name": "PostTraining (Coming Soon)", "name": "PostTraining (Coming Soon)",
"description": "" "description": ""
},
{
"name": "Telemetry",
"description": ""
} }
], ],
"x-tagGroups": [ "x-tagGroups": [
@ -6431,8 +5535,7 @@
"DatasetIO", "DatasetIO",
"Datasets", "Datasets",
"Eval", "Eval",
"PostTraining (Coming Soon)", "PostTraining (Coming Soon)"
"Telemetry"
] ]
} }
] ]
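Note: this file removes the /v1alpha/telemetry/* paths together with their request/response schemas. As a reference for anyone still calling them, a hedged sketch of the body that POST /v1alpha/telemetry/metrics/{metric_name} accepted per the removed QueryMetricsRequest schema (the granularity and label name/value below are made-up examples; only start_time and query_type were required):

    {
      "start_time": 1760000000,
      "end_time": 1760003600,
      "granularity": "5m",
      "query_type": "range",
      "label_matchers": [
        { "name": "model_id", "value": "llama-3.1-8b", "operator": "=" }
      ]
    }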

@ -1224,238 +1224,6 @@ paths:
$ref: '#/components/schemas/SupervisedFineTuneRequest' $ref: '#/components/schemas/SupervisedFineTuneRequest'
required: true required: true
deprecated: false deprecated: false
/v1alpha/telemetry/metrics/{metric_name}:
post:
responses:
'200':
description: A QueryMetricsResponse.
content:
application/json:
schema:
$ref: '#/components/schemas/QueryMetricsResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Query metrics.
description: Query metrics.
parameters:
- name: metric_name
in: path
description: The name of the metric to query.
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/QueryMetricsRequest'
required: true
deprecated: false
/v1alpha/telemetry/spans:
post:
responses:
'200':
description: A QuerySpansResponse.
content:
application/json:
schema:
$ref: '#/components/schemas/QuerySpansResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Query spans.
description: Query spans.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/QuerySpansRequest'
required: true
deprecated: false
/v1alpha/telemetry/spans/export:
post:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Save spans to a dataset.
description: Save spans to a dataset.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/SaveSpansToDatasetRequest'
required: true
deprecated: false
/v1alpha/telemetry/spans/{span_id}/tree:
post:
responses:
'200':
description: A QuerySpanTreeResponse.
content:
application/json:
schema:
$ref: '#/components/schemas/QuerySpanTreeResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Get a span tree by its ID.
description: Get a span tree by its ID.
parameters:
- name: span_id
in: path
description: The ID of the span to get the tree from.
required: true
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/GetSpanTreeRequest'
required: true
deprecated: false
/v1alpha/telemetry/traces:
post:
responses:
'200':
description: A QueryTracesResponse.
content:
application/json:
schema:
$ref: '#/components/schemas/QueryTracesResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Query traces.
description: Query traces.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/QueryTracesRequest'
required: true
deprecated: false
/v1alpha/telemetry/traces/{trace_id}:
get:
responses:
'200':
description: A Trace.
content:
application/json:
schema:
$ref: '#/components/schemas/Trace'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Get a trace by its ID.
description: Get a trace by its ID.
parameters:
- name: trace_id
in: path
description: The ID of the trace to get.
required: true
schema:
type: string
deprecated: false
/v1alpha/telemetry/traces/{trace_id}/spans/{span_id}:
get:
responses:
'200':
description: A Span.
content:
application/json:
schema:
$ref: '#/components/schemas/Span'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Get a span by its ID.
description: Get a span by its ID.
parameters:
- name: trace_id
in: path
description: >-
The ID of the trace to get the span from.
required: true
schema:
type: string
- name: span_id
in: path
description: The ID of the span to get.
required: true
schema:
type: string
deprecated: false
jsonSchemaDialect: >- jsonSchemaDialect: >-
https://json-schema.org/draft/2020-12/schema https://json-schema.org/draft/2020-12/schema
components: components:
@ -1552,7 +1320,7 @@ components:
enum: enum:
- model - model
- shield - shield
- vector_db - vector_store
- dataset - dataset
- scoring_function - scoring_function
- benchmark - benchmark
@ -3159,7 +2927,7 @@ components:
enum: enum:
- model - model
- shield - shield
- vector_db - vector_store
- dataset - dataset
- scoring_function - scoring_function
- benchmark - benchmark
@ -4249,434 +4017,6 @@ components:
- hyperparam_search_config - hyperparam_search_config
- logger_config - logger_config
title: SupervisedFineTuneRequest title: SupervisedFineTuneRequest
QueryMetricsRequest:
type: object
properties:
start_time:
type: integer
description: The start time of the metric to query.
end_time:
type: integer
description: The end time of the metric to query.
granularity:
type: string
description: The granularity of the metric to query.
query_type:
type: string
enum:
- range
- instant
description: The type of query to perform.
label_matchers:
type: array
items:
type: object
properties:
name:
type: string
description: The name of the label to match
value:
type: string
description: The value to match against
operator:
type: string
enum:
- '='
- '!='
- =~
- '!~'
description: >-
The comparison operator to use for matching
default: '='
additionalProperties: false
required:
- name
- value
- operator
title: MetricLabelMatcher
description: >-
A matcher for filtering metrics by label values.
description: >-
The label matchers to apply to the metric.
additionalProperties: false
required:
- start_time
- query_type
title: QueryMetricsRequest
MetricDataPoint:
type: object
properties:
timestamp:
type: integer
description: >-
Unix timestamp when the metric value was recorded
value:
type: number
description: >-
The numeric value of the metric at this timestamp
unit:
type: string
additionalProperties: false
required:
- timestamp
- value
- unit
title: MetricDataPoint
description: >-
A single data point in a metric time series.
MetricLabel:
type: object
properties:
name:
type: string
description: The name of the label
value:
type: string
description: The value of the label
additionalProperties: false
required:
- name
- value
title: MetricLabel
description: A label associated with a metric.
MetricSeries:
type: object
properties:
metric:
type: string
description: The name of the metric
labels:
type: array
items:
$ref: '#/components/schemas/MetricLabel'
description: >-
List of labels associated with this metric series
values:
type: array
items:
$ref: '#/components/schemas/MetricDataPoint'
description: >-
List of data points in chronological order
additionalProperties: false
required:
- metric
- labels
- values
title: MetricSeries
description: A time series of metric data points.
QueryMetricsResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/MetricSeries'
description: >-
List of metric series matching the query criteria
additionalProperties: false
required:
- data
title: QueryMetricsResponse
description: >-
Response containing metric time series data.
QueryCondition:
type: object
properties:
key:
type: string
description: The attribute key to filter on
op:
$ref: '#/components/schemas/QueryConditionOp'
description: The comparison operator to apply
value:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: The value to compare against
additionalProperties: false
required:
- key
- op
- value
title: QueryCondition
description: A condition for filtering query results.
QueryConditionOp:
type: string
enum:
- eq
- ne
- gt
- lt
title: QueryConditionOp
description: >-
Comparison operators for query conditions.
QuerySpansRequest:
type: object
properties:
attribute_filters:
type: array
items:
$ref: '#/components/schemas/QueryCondition'
description: >-
The attribute filters to apply to the spans.
attributes_to_return:
type: array
items:
type: string
description: The attributes to return in the spans.
max_depth:
type: integer
description: The maximum depth of the tree.
additionalProperties: false
required:
- attribute_filters
- attributes_to_return
title: QuerySpansRequest
Span:
type: object
properties:
span_id:
type: string
description: Unique identifier for the span
trace_id:
type: string
description: >-
Unique identifier for the trace this span belongs to
parent_span_id:
type: string
description: >-
(Optional) Unique identifier for the parent span, if this is a child span
name:
type: string
description: >-
Human-readable name describing the operation this span represents
start_time:
type: string
format: date-time
description: Timestamp when the operation began
end_time:
type: string
format: date-time
description: >-
(Optional) Timestamp when the operation finished, if completed
attributes:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
(Optional) Key-value pairs containing additional metadata about the span
additionalProperties: false
required:
- span_id
- trace_id
- name
- start_time
title: Span
description: >-
A span representing a single operation within a trace.
QuerySpansResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/Span'
description: >-
List of spans matching the query criteria
additionalProperties: false
required:
- data
title: QuerySpansResponse
description: Response containing a list of spans.
SaveSpansToDatasetRequest:
type: object
properties:
attribute_filters:
type: array
items:
$ref: '#/components/schemas/QueryCondition'
description: >-
The attribute filters to apply to the spans.
attributes_to_save:
type: array
items:
type: string
description: The attributes to save to the dataset.
dataset_id:
type: string
description: >-
The ID of the dataset to save the spans to.
max_depth:
type: integer
description: The maximum depth of the tree.
additionalProperties: false
required:
- attribute_filters
- attributes_to_save
- dataset_id
title: SaveSpansToDatasetRequest
GetSpanTreeRequest:
type: object
properties:
attributes_to_return:
type: array
items:
type: string
description: The attributes to return in the tree.
max_depth:
type: integer
description: The maximum depth of the tree.
additionalProperties: false
title: GetSpanTreeRequest
SpanStatus:
type: string
enum:
- ok
- error
title: SpanStatus
description: >-
The status of a span indicating whether it completed successfully or with
an error.
SpanWithStatus:
type: object
properties:
span_id:
type: string
description: Unique identifier for the span
trace_id:
type: string
description: >-
Unique identifier for the trace this span belongs to
parent_span_id:
type: string
description: >-
(Optional) Unique identifier for the parent span, if this is a child span
name:
type: string
description: >-
Human-readable name describing the operation this span represents
start_time:
type: string
format: date-time
description: Timestamp when the operation began
end_time:
type: string
format: date-time
description: >-
(Optional) Timestamp when the operation finished, if completed
attributes:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
description: >-
(Optional) Key-value pairs containing additional metadata about the span
status:
$ref: '#/components/schemas/SpanStatus'
description: >-
(Optional) The current status of the span
additionalProperties: false
required:
- span_id
- trace_id
- name
- start_time
title: SpanWithStatus
description: A span that includes status information.
QuerySpanTreeResponse:
type: object
properties:
data:
type: object
additionalProperties:
$ref: '#/components/schemas/SpanWithStatus'
description: >-
Dictionary mapping span IDs to spans with status information
additionalProperties: false
required:
- data
title: QuerySpanTreeResponse
description: >-
Response containing a tree structure of spans.
QueryTracesRequest:
type: object
properties:
attribute_filters:
type: array
items:
$ref: '#/components/schemas/QueryCondition'
description: >-
The attribute filters to apply to the traces.
limit:
type: integer
description: The limit of traces to return.
offset:
type: integer
description: The offset of the traces to return.
order_by:
type: array
items:
type: string
description: The order by of the traces to return.
additionalProperties: false
title: QueryTracesRequest
Trace:
type: object
properties:
trace_id:
type: string
description: Unique identifier for the trace
root_span_id:
type: string
description: >-
Unique identifier for the root span that started this trace
start_time:
type: string
format: date-time
description: Timestamp when the trace began
end_time:
type: string
format: date-time
description: >-
(Optional) Timestamp when the trace finished, if completed
additionalProperties: false
required:
- trace_id
- root_span_id
- start_time
title: Trace
description: >-
A trace representing the complete execution path of a request across multiple
operations.
QueryTracesResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/Trace'
description: >-
List of traces matching the query criteria
additionalProperties: false
required:
- data
title: QueryTracesResponse
description: Response containing a list of traces.
responses: responses:
BadRequest400: BadRequest400:
description: The request was invalid or malformed description: The request was invalid or malformed
@ -4779,13 +4119,11 @@ tags:
- name: Datasets - name: Datasets
description: '' description: ''
- name: Eval - name: Eval
description: '' description: >-
x-displayName: >-
Llama Stack Evaluation API for running evaluations on model and agent candidates. Llama Stack Evaluation API for running evaluations on model and agent candidates.
x-displayName: Evaluations
- name: PostTraining (Coming Soon) - name: PostTraining (Coming Soon)
description: '' description: ''
- name: Telemetry
description: ''
x-tagGroups: x-tagGroups:
- name: Operations - name: Operations
tags: tags:
@ -4795,4 +4133,3 @@ x-tagGroups:
- Datasets - Datasets
- Eval - Eval
- PostTraining (Coming Soon) - PostTraining (Coming Soon)
- Telemetry
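Note: besides dropping the Telemetry definitions, these spec files rename the resource-type enum member vector_db to vector_store. Any client that matches on the old literal needs the new spelling; the updated enum fragment (members beyond those visible in these hunks are elided) looks like:

    "enum": [
      "model",
      "shield",
      "vector_store",
      "dataset",
      "scoring_function",
      "benchmark",
      ...
    ]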

@ -282,7 +282,7 @@
"Conversations" "Conversations"
], ],
"summary": "Create a conversation.", "summary": "Create a conversation.",
"description": "Create a conversation.", "description": "Create a conversation.\nCreate a conversation.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -326,8 +326,8 @@
"tags": [ "tags": [
"Conversations" "Conversations"
], ],
"summary": "Get a conversation with the given ID.", "summary": "Retrieve a conversation.",
"description": "Get a conversation with the given ID.", "description": "Retrieve a conversation.\nGet a conversation with the given ID.",
"parameters": [ "parameters": [
{ {
"name": "conversation_id", "name": "conversation_id",
@ -369,8 +369,8 @@
"tags": [ "tags": [
"Conversations" "Conversations"
], ],
"summary": "Update a conversation's metadata with the given ID.", "summary": "Update a conversation.",
"description": "Update a conversation's metadata with the given ID.", "description": "Update a conversation.\nUpdate a conversation's metadata with the given ID.",
"parameters": [ "parameters": [
{ {
"name": "conversation_id", "name": "conversation_id",
@ -422,8 +422,8 @@
"tags": [ "tags": [
"Conversations" "Conversations"
], ],
"summary": "Delete a conversation with the given ID.", "summary": "Delete a conversation.",
"description": "Delete a conversation with the given ID.", "description": "Delete a conversation.\nDelete a conversation with the given ID.",
"parameters": [ "parameters": [
{ {
"name": "conversation_id", "name": "conversation_id",
@ -467,8 +467,8 @@
"tags": [ "tags": [
"Conversations" "Conversations"
], ],
"summary": "List items in the conversation.", "summary": "List items.",
"description": "List items in the conversation.", "description": "List items.\nList items in the conversation.",
"parameters": [ "parameters": [
{ {
"name": "conversation_id", "name": "conversation_id",
@ -597,8 +597,8 @@
"tags": [ "tags": [
"Conversations" "Conversations"
], ],
"summary": "Create items in the conversation.", "summary": "Create items.",
"description": "Create items in the conversation.", "description": "Create items.\nCreate items in the conversation.",
"parameters": [ "parameters": [
{ {
"name": "conversation_id", "name": "conversation_id",
@ -652,8 +652,8 @@
"tags": [ "tags": [
"Conversations" "Conversations"
], ],
"summary": "Retrieve a conversation item.", "summary": "Retrieve an item.",
"description": "Retrieve a conversation item.", "description": "Retrieve an item.\nRetrieve a conversation item.",
"parameters": [ "parameters": [
{ {
"name": "conversation_id", "name": "conversation_id",
@ -704,8 +704,8 @@
"tags": [ "tags": [
"Conversations" "Conversations"
], ],
"summary": "Delete a conversation item.", "summary": "Delete an item.",
"description": "Delete a conversation item.", "description": "Delete an item.\nDelete a conversation item.",
"parameters": [ "parameters": [
{ {
"name": "conversation_id", "name": "conversation_id",
@ -1833,7 +1833,7 @@
"deprecated": false, "deprecated": false,
"x-llama-stack-extra-body-params": [ "x-llama-stack-extra-body-params": [
{ {
"name": "shields", "name": "guardrails",
"schema": { "schema": {
"type": "array", "type": "array",
"items": { "items": {
@ -1842,12 +1842,12 @@
"type": "string" "type": "string"
}, },
{ {
"$ref": "#/components/schemas/ResponseShieldSpec" "$ref": "#/components/schemas/ResponseGuardrailSpec"
} }
] ]
} }
}, },
"description": "List of shields to apply during response generation. Shields provide safety and content moderation.", "description": "List of guardrails to apply during response generation. Guardrails provide safety and content moderation.",
"required": false "required": false
} }
] ]
@ -2525,44 +2525,6 @@
"deprecated": false "deprecated": false
} }
}, },
"/v1/telemetry/events": {
"post": {
"responses": {
"200": {
"description": "OK"
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Telemetry"
],
"summary": "Log an event.",
"description": "Log an event.",
"parameters": [],
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/LogEventRequest"
}
}
},
"required": true
},
"deprecated": false
}
},
"/v1/tool-runtime/invoke": { "/v1/tool-runtime/invoke": {
"post": { "post": {
"responses": { "responses": {
@ -5517,13 +5479,22 @@
"$ref": "#/components/schemas/OpenAIResponseMessage" "$ref": "#/components/schemas/OpenAIResponseMessage"
}, },
{ {
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall" "$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall"
}, },
{ {
"$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall" "$ref": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall"
}, },
{ {
"$ref": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall" "$ref": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall"
},
{
"$ref": "#/components/schemas/OpenAIResponseInputFunctionToolCallOutput"
},
{
"$ref": "#/components/schemas/OpenAIResponseMCPApprovalRequest"
},
{
"$ref": "#/components/schemas/OpenAIResponseMCPApprovalResponse"
}, },
{ {
"$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPCall" "$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPCall"
@ -5536,9 +5507,12 @@
"propertyName": "type", "propertyName": "type",
"mapping": { "mapping": {
"message": "#/components/schemas/OpenAIResponseMessage", "message": "#/components/schemas/OpenAIResponseMessage",
"function_call": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall",
"file_search_call": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall",
"web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall", "web_search_call": "#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall",
"file_search_call": "#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall",
"function_call": "#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall",
"function_call_output": "#/components/schemas/OpenAIResponseInputFunctionToolCallOutput",
"mcp_approval_request": "#/components/schemas/OpenAIResponseMCPApprovalRequest",
"mcp_approval_response": "#/components/schemas/OpenAIResponseMCPApprovalResponse",
"mcp_call": "#/components/schemas/OpenAIResponseOutputMessageMCPCall", "mcp_call": "#/components/schemas/OpenAIResponseOutputMessageMCPCall",
"mcp_list_tools": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools" "mcp_list_tools": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools"
} }
@ -5696,6 +5670,58 @@
} }
} }
}, },
"OpenAIResponseContentPartRefusal": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "refusal",
"default": "refusal",
"description": "Content part type identifier, always \"refusal\""
},
"refusal": {
"type": "string",
"description": "Refusal text supplied by the model"
}
},
"additionalProperties": false,
"required": [
"type",
"refusal"
],
"title": "OpenAIResponseContentPartRefusal",
"description": "Refusal content within a streamed response part."
},
"OpenAIResponseInputFunctionToolCallOutput": {
"type": "object",
"properties": {
"call_id": {
"type": "string"
},
"output": {
"type": "string"
},
"type": {
"type": "string",
"const": "function_call_output",
"default": "function_call_output"
},
"id": {
"type": "string"
},
"status": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"call_id",
"output",
"type"
],
"title": "OpenAIResponseInputFunctionToolCallOutput",
"description": "This represents the output of a function call that gets passed back to the model."
},
"OpenAIResponseInputMessageContent": { "OpenAIResponseInputMessageContent": {
"oneOf": [ "oneOf": [
{ {
@ -5775,6 +5801,68 @@
"title": "OpenAIResponseInputMessageContentText", "title": "OpenAIResponseInputMessageContentText",
"description": "Text content for input messages in OpenAI response format." "description": "Text content for input messages in OpenAI response format."
}, },
"OpenAIResponseMCPApprovalRequest": {
"type": "object",
"properties": {
"arguments": {
"type": "string"
},
"id": {
"type": "string"
},
"name": {
"type": "string"
},
"server_label": {
"type": "string"
},
"type": {
"type": "string",
"const": "mcp_approval_request",
"default": "mcp_approval_request"
}
},
"additionalProperties": false,
"required": [
"arguments",
"id",
"name",
"server_label",
"type"
],
"title": "OpenAIResponseMCPApprovalRequest",
"description": "A request for human approval of a tool invocation."
},
"OpenAIResponseMCPApprovalResponse": {
"type": "object",
"properties": {
"approval_request_id": {
"type": "string"
},
"approve": {
"type": "boolean"
},
"type": {
"type": "string",
"const": "mcp_approval_response",
"default": "mcp_approval_response"
},
"id": {
"type": "string"
},
"reason": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"approval_request_id",
"approve",
"type"
],
"title": "OpenAIResponseMCPApprovalResponse",
"description": "A response to an MCP approval request."
},
"OpenAIResponseMessage": { "OpenAIResponseMessage": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -5839,6 +5927,23 @@
"description": "Corresponds to the various Message types in the Responses API. They are all under one type because the Responses API gives them all the same \"type\" value, and there is no way to tell them apart in certain scenarios." "description": "Corresponds to the various Message types in the Responses API. They are all under one type because the Responses API gives them all the same \"type\" value, and there is no way to tell them apart in certain scenarios."
}, },
"OpenAIResponseOutputMessageContent": { "OpenAIResponseOutputMessageContent": {
"oneOf": [
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageContentOutputText"
},
{
"$ref": "#/components/schemas/OpenAIResponseContentPartRefusal"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"output_text": "#/components/schemas/OpenAIResponseOutputMessageContentOutputText",
"refusal": "#/components/schemas/OpenAIResponseContentPartRefusal"
}
}
},
"OpenAIResponseOutputMessageContentOutputText": {
"type": "object", "type": "object",
"properties": { "properties": {
"text": { "text": {
@ -6695,7 +6800,7 @@
"enum": [ "enum": [
"model", "model",
"shield", "shield",
"vector_db", "vector_store",
"dataset", "dataset",
"scoring_function", "scoring_function",
"benchmark", "benchmark",
@ -7250,41 +7355,17 @@
{ {
"$ref": "#/components/schemas/OpenAIResponseMCPApprovalResponse" "$ref": "#/components/schemas/OpenAIResponseMCPApprovalResponse"
}, },
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPCall"
},
{
"$ref": "#/components/schemas/OpenAIResponseOutputMessageMCPListTools"
},
{ {
"$ref": "#/components/schemas/OpenAIResponseMessage" "$ref": "#/components/schemas/OpenAIResponseMessage"
} }
] ]
}, },
"OpenAIResponseInputFunctionToolCallOutput": {
"type": "object",
"properties": {
"call_id": {
"type": "string"
},
"output": {
"type": "string"
},
"type": {
"type": "string",
"const": "function_call_output",
"default": "function_call_output"
},
"id": {
"type": "string"
},
"status": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"call_id",
"output",
"type"
],
"title": "OpenAIResponseInputFunctionToolCallOutput",
"description": "This represents the output of a function call that gets passed back to the model."
},
"OpenAIResponseInputToolFileSearch": { "OpenAIResponseInputToolFileSearch": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -7447,68 +7528,6 @@
"title": "OpenAIResponseInputToolWebSearch", "title": "OpenAIResponseInputToolWebSearch",
"description": "Web search tool configuration for OpenAI response inputs." "description": "Web search tool configuration for OpenAI response inputs."
}, },
"OpenAIResponseMCPApprovalRequest": {
"type": "object",
"properties": {
"arguments": {
"type": "string"
},
"id": {
"type": "string"
},
"name": {
"type": "string"
},
"server_label": {
"type": "string"
},
"type": {
"type": "string",
"const": "mcp_approval_request",
"default": "mcp_approval_request"
}
},
"additionalProperties": false,
"required": [
"arguments",
"id",
"name",
"server_label",
"type"
],
"title": "OpenAIResponseMCPApprovalRequest",
"description": "A request for human approval of a tool invocation."
},
"OpenAIResponseMCPApprovalResponse": {
"type": "object",
"properties": {
"approval_request_id": {
"type": "string"
},
"approve": {
"type": "boolean"
},
"type": {
"type": "string",
"const": "mcp_approval_response",
"default": "mcp_approval_response"
},
"id": {
"type": "string"
},
"reason": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"approval_request_id",
"approve",
"type"
],
"title": "OpenAIResponseMCPApprovalResponse",
"description": "A response to an MCP approval request."
},
"OpenAIResponseObjectWithInput": { "OpenAIResponseObjectWithInput": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -7581,6 +7600,10 @@
"$ref": "#/components/schemas/OpenAIResponseUsage", "$ref": "#/components/schemas/OpenAIResponseUsage",
"description": "(Optional) Token usage information for the response" "description": "(Optional) Token usage information for the response"
}, },
"instructions": {
"type": "string",
"description": "(Optional) System message inserted into the model's context"
},
"input": { "input": {
"type": "array", "type": "array",
"items": { "items": {
@ -7834,20 +7857,20 @@
"title": "OpenAIResponseUsage", "title": "OpenAIResponseUsage",
"description": "Usage information for OpenAI response." "description": "Usage information for OpenAI response."
}, },
"ResponseShieldSpec": { "ResponseGuardrailSpec": {
"type": "object", "type": "object",
"properties": { "properties": {
"type": { "type": {
"type": "string", "type": "string",
"description": "The type/identifier of the shield." "description": "The type/identifier of the guardrail."
} }
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"type" "type"
], ],
"title": "ResponseShieldSpec", "title": "ResponseGuardrailSpec",
"description": "Specification for a shield to apply during response generation." "description": "Specification for a guardrail to apply during response generation."
}, },
"OpenAIResponseInputTool": { "OpenAIResponseInputTool": {
"oneOf": [ "oneOf": [
@ -8129,6 +8152,10 @@
"usage": { "usage": {
"$ref": "#/components/schemas/OpenAIResponseUsage", "$ref": "#/components/schemas/OpenAIResponseUsage",
"description": "(Optional) Token usage information for the response" "description": "(Optional) Token usage information for the response"
},
"instructions": {
"type": "string",
"description": "(Optional) System message inserted into the model's context"
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -8248,28 +8275,6 @@
"title": "OpenAIResponseContentPartReasoningText", "title": "OpenAIResponseContentPartReasoningText",
"description": "Reasoning text emitted as part of a streamed response." "description": "Reasoning text emitted as part of a streamed response."
}, },
"OpenAIResponseContentPartRefusal": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "refusal",
"default": "refusal",
"description": "Content part type identifier, always \"refusal\""
},
"refusal": {
"type": "string",
"description": "Refusal text supplied by the model"
}
},
"additionalProperties": false,
"required": [
"type",
"refusal"
],
"title": "OpenAIResponseContentPartRefusal",
"description": "Refusal content within a streamed response part."
},
"OpenAIResponseObjectStream": { "OpenAIResponseObjectStream": {
"oneOf": [ "oneOf": [
{ {
@ -10200,7 +10205,7 @@
"enum": [ "enum": [
"model", "model",
"shield", "shield",
"vector_db", "vector_store",
"dataset", "dataset",
"scoring_function", "scoring_function",
"benchmark", "benchmark",
@ -10682,7 +10687,7 @@
"enum": [ "enum": [
"model", "model",
"shield", "shield",
"vector_db", "vector_store",
"dataset", "dataset",
"scoring_function", "scoring_function",
"benchmark", "benchmark",
@ -11172,354 +11177,6 @@
"title": "SyntheticDataGenerationResponse", "title": "SyntheticDataGenerationResponse",
"description": "Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold." "description": "Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold."
}, },
"Event": {
"oneOf": [
{
"$ref": "#/components/schemas/UnstructuredLogEvent"
},
{
"$ref": "#/components/schemas/MetricEvent"
},
{
"$ref": "#/components/schemas/StructuredLogEvent"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"unstructured_log": "#/components/schemas/UnstructuredLogEvent",
"metric": "#/components/schemas/MetricEvent",
"structured_log": "#/components/schemas/StructuredLogEvent"
}
}
},
"EventType": {
"type": "string",
"enum": [
"unstructured_log",
"structured_log",
"metric"
],
"title": "EventType",
"description": "The type of telemetry event being logged."
},
"LogSeverity": {
"type": "string",
"enum": [
"verbose",
"debug",
"info",
"warn",
"error",
"critical"
],
"title": "LogSeverity",
"description": "The severity level of a log message."
},
"MetricEvent": {
"type": "object",
"properties": {
"trace_id": {
"type": "string",
"description": "Unique identifier for the trace this event belongs to"
},
"span_id": {
"type": "string",
"description": "Unique identifier for the span this event belongs to"
},
"timestamp": {
"type": "string",
"format": "date-time",
"description": "Timestamp when the event occurred"
},
"attributes": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "string"
},
{
"type": "integer"
},
{
"type": "number"
},
{
"type": "boolean"
},
{
"type": "null"
}
]
},
"description": "(Optional) Key-value pairs containing additional metadata about the event"
},
"type": {
"$ref": "#/components/schemas/EventType",
"const": "metric",
"default": "metric",
"description": "Event type identifier set to METRIC"
},
"metric": {
"type": "string",
"description": "The name of the metric being measured"
},
"value": {
"oneOf": [
{
"type": "integer"
},
{
"type": "number"
}
],
"description": "The numeric value of the metric measurement"
},
"unit": {
"type": "string",
"description": "The unit of measurement for the metric value"
}
},
"additionalProperties": false,
"required": [
"trace_id",
"span_id",
"timestamp",
"type",
"metric",
"value",
"unit"
],
"title": "MetricEvent",
"description": "A metric event containing a measured value."
},
"SpanEndPayload": {
"type": "object",
"properties": {
"type": {
"$ref": "#/components/schemas/StructuredLogType",
"const": "span_end",
"default": "span_end",
"description": "Payload type identifier set to SPAN_END"
},
"status": {
"$ref": "#/components/schemas/SpanStatus",
"description": "The final status of the span indicating success or failure"
}
},
"additionalProperties": false,
"required": [
"type",
"status"
],
"title": "SpanEndPayload",
"description": "Payload for a span end event."
},
"SpanStartPayload": {
"type": "object",
"properties": {
"type": {
"$ref": "#/components/schemas/StructuredLogType",
"const": "span_start",
"default": "span_start",
"description": "Payload type identifier set to SPAN_START"
},
"name": {
"type": "string",
"description": "Human-readable name describing the operation this span represents"
},
"parent_span_id": {
"type": "string",
"description": "(Optional) Unique identifier for the parent span, if this is a child span"
}
},
"additionalProperties": false,
"required": [
"type",
"name"
],
"title": "SpanStartPayload",
"description": "Payload for a span start event."
},
"SpanStatus": {
"type": "string",
"enum": [
"ok",
"error"
],
"title": "SpanStatus",
"description": "The status of a span indicating whether it completed successfully or with an error."
},
"StructuredLogEvent": {
"type": "object",
"properties": {
"trace_id": {
"type": "string",
"description": "Unique identifier for the trace this event belongs to"
},
"span_id": {
"type": "string",
"description": "Unique identifier for the span this event belongs to"
},
"timestamp": {
"type": "string",
"format": "date-time",
"description": "Timestamp when the event occurred"
},
"attributes": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "string"
},
{
"type": "integer"
},
{
"type": "number"
},
{
"type": "boolean"
},
{
"type": "null"
}
]
},
"description": "(Optional) Key-value pairs containing additional metadata about the event"
},
"type": {
"$ref": "#/components/schemas/EventType",
"const": "structured_log",
"default": "structured_log",
"description": "Event type identifier set to STRUCTURED_LOG"
},
"payload": {
"oneOf": [
{
"$ref": "#/components/schemas/SpanStartPayload"
},
{
"$ref": "#/components/schemas/SpanEndPayload"
}
],
"discriminator": {
"propertyName": "type",
"mapping": {
"span_start": "#/components/schemas/SpanStartPayload",
"span_end": "#/components/schemas/SpanEndPayload"
}
},
"description": "The structured payload data for the log event"
}
},
"additionalProperties": false,
"required": [
"trace_id",
"span_id",
"timestamp",
"type",
"payload"
],
"title": "StructuredLogEvent",
"description": "A structured log event containing typed payload data."
},
"StructuredLogType": {
"type": "string",
"enum": [
"span_start",
"span_end"
],
"title": "StructuredLogType",
"description": "The type of structured log event payload."
},
"UnstructuredLogEvent": {
"type": "object",
"properties": {
"trace_id": {
"type": "string",
"description": "Unique identifier for the trace this event belongs to"
},
"span_id": {
"type": "string",
"description": "Unique identifier for the span this event belongs to"
},
"timestamp": {
"type": "string",
"format": "date-time",
"description": "Timestamp when the event occurred"
},
"attributes": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "string"
},
{
"type": "integer"
},
{
"type": "number"
},
{
"type": "boolean"
},
{
"type": "null"
}
]
},
"description": "(Optional) Key-value pairs containing additional metadata about the event"
},
"type": {
"$ref": "#/components/schemas/EventType",
"const": "unstructured_log",
"default": "unstructured_log",
"description": "Event type identifier set to UNSTRUCTURED_LOG"
},
"message": {
"type": "string",
"description": "The log message text"
},
"severity": {
"$ref": "#/components/schemas/LogSeverity",
"description": "The severity level of the log message"
}
},
"additionalProperties": false,
"required": [
"trace_id",
"span_id",
"timestamp",
"type",
"message",
"severity"
],
"title": "UnstructuredLogEvent",
"description": "An unstructured log event containing a simple text message."
},
"LogEventRequest": {
"type": "object",
"properties": {
"event": {
"$ref": "#/components/schemas/Event",
"description": "The event to log."
},
"ttl_seconds": {
"type": "integer",
"description": "The time to live of the event."
}
},
"additionalProperties": false,
"required": [
"event",
"ttl_seconds"
],
"title": "LogEventRequest"
},
"InvokeToolRequest": { "InvokeToolRequest": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -12083,7 +11740,7 @@
"enum": [ "enum": [
"model", "model",
"shield", "shield",
"vector_db", "vector_store",
"dataset", "dataset",
"scoring_function", "scoring_function",
"benchmark", "benchmark",
@ -13602,8 +13259,8 @@
}, },
{ {
"name": "Conversations", "name": "Conversations",
"description": "", "description": "Protocol for conversation management operations.",
"x-displayName": "Protocol for conversation management operations." "x-displayName": "Conversations"
}, },
{ {
"name": "Files", "name": "Files",
@ -13655,10 +13312,6 @@
"name": "SyntheticDataGeneration (Coming Soon)", "name": "SyntheticDataGeneration (Coming Soon)",
"description": "" "description": ""
}, },
{
"name": "Telemetry",
"description": ""
},
{ {
"name": "ToolGroups", "name": "ToolGroups",
"description": "" "description": ""
@ -13689,7 +13342,6 @@
"ScoringFunctions", "ScoringFunctions",
"Shields", "Shields",
"SyntheticDataGeneration (Coming Soon)", "SyntheticDataGeneration (Coming Soon)",
"Telemetry",
"ToolGroups", "ToolGroups",
"ToolRuntime", "ToolRuntime",
"VectorIO" "VectorIO"

@ -192,7 +192,10 @@ paths:
tags: tags:
- Conversations - Conversations
summary: Create a conversation. summary: Create a conversation.
description: Create a conversation. description: >-
Create a conversation.
Create a conversation.
parameters: [] parameters: []
requestBody: requestBody:
content: content:
@ -222,8 +225,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Conversations - Conversations
summary: Get a conversation with the given ID. summary: Retrieve a conversation.
description: Get a conversation with the given ID. description: >-
Retrieve a conversation.
Get a conversation with the given ID.
parameters: parameters:
- name: conversation_id - name: conversation_id
in: path in: path
@ -252,9 +258,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Conversations - Conversations
summary: >- summary: Update a conversation.
Update a conversation's metadata with the given ID.
description: >- description: >-
Update a conversation.
Update a conversation's metadata with the given ID. Update a conversation's metadata with the given ID.
parameters: parameters:
- name: conversation_id - name: conversation_id
@ -290,8 +297,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Conversations - Conversations
summary: Delete a conversation with the given ID. summary: Delete a conversation.
description: Delete a conversation with the given ID. description: >-
Delete a conversation.
Delete a conversation with the given ID.
parameters: parameters:
- name: conversation_id - name: conversation_id
in: path in: path
@ -321,8 +331,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Conversations - Conversations
summary: List items in the conversation. summary: List items.
description: List items in the conversation. description: >-
List items.
List items in the conversation.
parameters: parameters:
- name: conversation_id - name: conversation_id
in: path in: path
@ -495,8 +508,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Conversations - Conversations
summary: Create items in the conversation. summary: Create items.
description: Create items in the conversation. description: >-
Create items.
Create items in the conversation.
parameters: parameters:
- name: conversation_id - name: conversation_id
in: path in: path
@ -532,8 +548,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Conversations - Conversations
summary: Retrieve a conversation item. summary: Retrieve an item.
description: Retrieve a conversation item. description: >-
Retrieve an item.
Retrieve a conversation item.
parameters: parameters:
- name: conversation_id - name: conversation_id
in: path in: path
@ -568,8 +587,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Conversations - Conversations
summary: Delete a conversation item. summary: Delete an item.
description: Delete a conversation item. description: >-
Delete an item.
Delete a conversation item.
parameters: parameters:
- name: conversation_id - name: conversation_id
in: path in: path
@ -1448,16 +1470,16 @@ paths:
required: true required: true
deprecated: false deprecated: false
x-llama-stack-extra-body-params: x-llama-stack-extra-body-params:
- name: shields - name: guardrails
schema: schema:
type: array type: array
items: items:
oneOf: oneOf:
- type: string - type: string
- $ref: '#/components/schemas/ResponseShieldSpec' - $ref: '#/components/schemas/ResponseGuardrailSpec'
description: >- description: >-
List of shields to apply during response generation. Shields provide safety List of guardrails to apply during response generation. Guardrails provide
and content moderation. safety and content moderation.
required: false required: false
/v1/responses/{response_id}: /v1/responses/{response_id}:
get: get:
@ -1944,33 +1966,6 @@ paths:
$ref: '#/components/schemas/SyntheticDataGenerateRequest' $ref: '#/components/schemas/SyntheticDataGenerateRequest'
required: true required: true
deprecated: false deprecated: false
/v1/telemetry/events:
post:
responses:
'200':
description: OK
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Telemetry
summary: Log an event.
description: Log an event.
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/LogEventRequest'
required: true
deprecated: false
/v1/tool-runtime/invoke: /v1/tool-runtime/invoke:
post: post:
responses: responses:
@ -4180,18 +4175,24 @@ components:
ConversationItem: ConversationItem:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseMessage' - $ref: '#/components/schemas/OpenAIResponseMessage'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' - $ref: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
- $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall' - $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools' - $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
discriminator: discriminator:
propertyName: type propertyName: type
mapping: mapping:
message: '#/components/schemas/OpenAIResponseMessage' message: '#/components/schemas/OpenAIResponseMessage'
function_call: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
file_search_call: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall' web_search_call: '#/components/schemas/OpenAIResponseOutputMessageWebSearchToolCall'
file_search_call: '#/components/schemas/OpenAIResponseOutputMessageFileSearchToolCall'
function_call: '#/components/schemas/OpenAIResponseOutputMessageFunctionToolCall'
function_call_output: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
mcp_approval_request: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
mcp_approval_response: '#/components/schemas/OpenAIResponseMCPApprovalResponse'
mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall' mcp_call: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools' mcp_list_tools: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
OpenAIResponseAnnotationCitation: OpenAIResponseAnnotationCitation:
@ -4312,6 +4313,50 @@ components:
url_citation: '#/components/schemas/OpenAIResponseAnnotationCitation' url_citation: '#/components/schemas/OpenAIResponseAnnotationCitation'
container_file_citation: '#/components/schemas/OpenAIResponseAnnotationContainerFileCitation' container_file_citation: '#/components/schemas/OpenAIResponseAnnotationContainerFileCitation'
file_path: '#/components/schemas/OpenAIResponseAnnotationFilePath' file_path: '#/components/schemas/OpenAIResponseAnnotationFilePath'
OpenAIResponseContentPartRefusal:
type: object
properties:
type:
type: string
const: refusal
default: refusal
description: >-
Content part type identifier, always "refusal"
refusal:
type: string
description: Refusal text supplied by the model
additionalProperties: false
required:
- type
- refusal
title: OpenAIResponseContentPartRefusal
description: >-
Refusal content within a streamed response part.
"OpenAIResponseInputFunctionToolCallOutput":
type: object
properties:
call_id:
type: string
output:
type: string
type:
type: string
const: function_call_output
default: function_call_output
id:
type: string
status:
type: string
additionalProperties: false
required:
- call_id
- output
- type
title: >-
OpenAIResponseInputFunctionToolCallOutput
description: >-
This represents the output of a function call that gets passed back to the
model.
OpenAIResponseInputMessageContent: OpenAIResponseInputMessageContent:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseInputMessageContentText' - $ref: '#/components/schemas/OpenAIResponseInputMessageContentText'
@ -4370,6 +4415,53 @@ components:
title: OpenAIResponseInputMessageContentText title: OpenAIResponseInputMessageContentText
description: >- description: >-
Text content for input messages in OpenAI response format. Text content for input messages in OpenAI response format.
OpenAIResponseMCPApprovalRequest:
type: object
properties:
arguments:
type: string
id:
type: string
name:
type: string
server_label:
type: string
type:
type: string
const: mcp_approval_request
default: mcp_approval_request
additionalProperties: false
required:
- arguments
- id
- name
- server_label
- type
title: OpenAIResponseMCPApprovalRequest
description: >-
A request for human approval of a tool invocation.
OpenAIResponseMCPApprovalResponse:
type: object
properties:
approval_request_id:
type: string
approve:
type: boolean
type:
type: string
const: mcp_approval_response
default: mcp_approval_response
id:
type: string
reason:
type: string
additionalProperties: false
required:
- approval_request_id
- approve
- type
title: OpenAIResponseMCPApprovalResponse
description: A response to an MCP approval request.
OpenAIResponseMessage: OpenAIResponseMessage:
type: object type: object
properties: properties:
@ -4411,6 +4503,15 @@ components:
under one type because the Responses API gives them all the same "type" value, under one type because the Responses API gives them all the same "type" value,
and there is no way to tell them apart in certain scenarios. and there is no way to tell them apart in certain scenarios.
OpenAIResponseOutputMessageContent: OpenAIResponseOutputMessageContent:
oneOf:
- $ref: '#/components/schemas/OpenAIResponseOutputMessageContentOutputText'
- $ref: '#/components/schemas/OpenAIResponseContentPartRefusal'
discriminator:
propertyName: type
mapping:
output_text: '#/components/schemas/OpenAIResponseOutputMessageContentOutputText'
refusal: '#/components/schemas/OpenAIResponseContentPartRefusal'
"OpenAIResponseOutputMessageContentOutputText":
type: object type: object
properties: properties:
text: text:
@ -5126,7 +5227,7 @@ components:
enum: enum:
- model - model
- shield - shield
- vector_db - vector_store
- dataset - dataset
- scoring_function - scoring_function
- benchmark - benchmark
@ -5527,32 +5628,9 @@ components:
- $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput' - $ref: '#/components/schemas/OpenAIResponseInputFunctionToolCallOutput'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalRequest' - $ref: '#/components/schemas/OpenAIResponseMCPApprovalRequest'
- $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse' - $ref: '#/components/schemas/OpenAIResponseMCPApprovalResponse'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPCall'
- $ref: '#/components/schemas/OpenAIResponseOutputMessageMCPListTools'
- $ref: '#/components/schemas/OpenAIResponseMessage' - $ref: '#/components/schemas/OpenAIResponseMessage'
"OpenAIResponseInputFunctionToolCallOutput":
type: object
properties:
call_id:
type: string
output:
type: string
type:
type: string
const: function_call_output
default: function_call_output
id:
type: string
status:
type: string
additionalProperties: false
required:
- call_id
- output
- type
title: >-
OpenAIResponseInputFunctionToolCallOutput
description: >-
This represents the output of a function call that gets passed back to the
model.
OpenAIResponseInputToolFileSearch: OpenAIResponseInputToolFileSearch:
type: object type: object
properties: properties:
@ -5669,53 +5747,6 @@ components:
title: OpenAIResponseInputToolWebSearch title: OpenAIResponseInputToolWebSearch
description: >- description: >-
Web search tool configuration for OpenAI response inputs. Web search tool configuration for OpenAI response inputs.
OpenAIResponseMCPApprovalRequest:
type: object
properties:
arguments:
type: string
id:
type: string
name:
type: string
server_label:
type: string
type:
type: string
const: mcp_approval_request
default: mcp_approval_request
additionalProperties: false
required:
- arguments
- id
- name
- server_label
- type
title: OpenAIResponseMCPApprovalRequest
description: >-
A request for human approval of a tool invocation.
OpenAIResponseMCPApprovalResponse:
type: object
properties:
approval_request_id:
type: string
approve:
type: boolean
type:
type: string
const: mcp_approval_response
default: mcp_approval_response
id:
type: string
reason:
type: string
additionalProperties: false
required:
- approval_request_id
- approve
- type
title: OpenAIResponseMCPApprovalResponse
description: A response to an MCP approval request.
OpenAIResponseObjectWithInput: OpenAIResponseObjectWithInput:
type: object type: object
properties: properties:
@ -5784,6 +5815,10 @@ components:
$ref: '#/components/schemas/OpenAIResponseUsage' $ref: '#/components/schemas/OpenAIResponseUsage'
description: >- description: >-
(Optional) Token usage information for the response (Optional) Token usage information for the response
instructions:
type: string
description: >-
(Optional) System message inserted into the model's context
input: input:
type: array type: array
items: items:
@ -5961,18 +5996,18 @@ components:
- total_tokens - total_tokens
title: OpenAIResponseUsage title: OpenAIResponseUsage
description: Usage information for OpenAI response. description: Usage information for OpenAI response.
ResponseShieldSpec: ResponseGuardrailSpec:
type: object type: object
properties: properties:
type: type:
type: string type: string
description: The type/identifier of the shield. description: The type/identifier of the guardrail.
additionalProperties: false additionalProperties: false
required: required:
- type - type
title: ResponseShieldSpec title: ResponseGuardrailSpec
description: >- description: >-
Specification for a shield to apply during response generation. Specification for a guardrail to apply during response generation.
OpenAIResponseInputTool: OpenAIResponseInputTool:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch' - $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch'
@ -6187,6 +6222,10 @@ components:
$ref: '#/components/schemas/OpenAIResponseUsage' $ref: '#/components/schemas/OpenAIResponseUsage'
description: >- description: >-
(Optional) Token usage information for the response (Optional) Token usage information for the response
instructions:
type: string
description: >-
(Optional) System message inserted into the model's context
additionalProperties: false additionalProperties: false
required: required:
- created_at - created_at
@ -6278,25 +6317,6 @@ components:
title: OpenAIResponseContentPartReasoningText title: OpenAIResponseContentPartReasoningText
description: >- description: >-
Reasoning text emitted as part of a streamed response. Reasoning text emitted as part of a streamed response.
OpenAIResponseContentPartRefusal:
type: object
properties:
type:
type: string
const: refusal
default: refusal
description: >-
Content part type identifier, always "refusal"
refusal:
type: string
description: Refusal text supplied by the model
additionalProperties: false
required:
- type
- refusal
title: OpenAIResponseContentPartRefusal
description: >-
Refusal content within a streamed response part.
OpenAIResponseObjectStream: OpenAIResponseObjectStream:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated' - $ref: '#/components/schemas/OpenAIResponseObjectStreamResponseCreated'
@ -7899,7 +7919,7 @@ components:
enum: enum:
- model - model
- shield - shield
- vector_db - vector_store
- dataset - dataset
- scoring_function - scoring_function
- benchmark - benchmark
@ -8207,7 +8227,7 @@ components:
enum: enum:
- model - model
- shield - shield
- vector_db - vector_store
- dataset - dataset
- scoring_function - scoring_function
- benchmark - benchmark
@ -8565,267 +8585,6 @@ components:
description: >- description: >-
Response from the synthetic data generation. Batch of (prompt, response, score) Response from the synthetic data generation. Batch of (prompt, response, score)
tuples that pass the threshold. tuples that pass the threshold.
Event:
oneOf:
- $ref: '#/components/schemas/UnstructuredLogEvent'
- $ref: '#/components/schemas/MetricEvent'
- $ref: '#/components/schemas/StructuredLogEvent'
discriminator:
propertyName: type
mapping:
unstructured_log: '#/components/schemas/UnstructuredLogEvent'
metric: '#/components/schemas/MetricEvent'
structured_log: '#/components/schemas/StructuredLogEvent'
EventType:
type: string
enum:
- unstructured_log
- structured_log
- metric
title: EventType
description: >-
The type of telemetry event being logged.
LogSeverity:
type: string
enum:
- verbose
- debug
- info
- warn
- error
- critical
title: LogSeverity
description: The severity level of a log message.
MetricEvent:
type: object
properties:
trace_id:
type: string
description: >-
Unique identifier for the trace this event belongs to
span_id:
type: string
description: >-
Unique identifier for the span this event belongs to
timestamp:
type: string
format: date-time
description: Timestamp when the event occurred
attributes:
type: object
additionalProperties:
oneOf:
- type: string
- type: integer
- type: number
- type: boolean
- type: 'null'
description: >-
(Optional) Key-value pairs containing additional metadata about the event
type:
$ref: '#/components/schemas/EventType'
const: metric
default: metric
description: Event type identifier set to METRIC
metric:
type: string
description: The name of the metric being measured
value:
oneOf:
- type: integer
- type: number
description: >-
The numeric value of the metric measurement
unit:
type: string
description: >-
The unit of measurement for the metric value
additionalProperties: false
required:
- trace_id
- span_id
- timestamp
- type
- metric
- value
- unit
title: MetricEvent
description: >-
A metric event containing a measured value.
SpanEndPayload:
type: object
properties:
type:
$ref: '#/components/schemas/StructuredLogType'
const: span_end
default: span_end
description: Payload type identifier set to SPAN_END
status:
$ref: '#/components/schemas/SpanStatus'
description: >-
The final status of the span indicating success or failure
additionalProperties: false
required:
- type
- status
title: SpanEndPayload
description: Payload for a span end event.
SpanStartPayload:
type: object
properties:
type:
$ref: '#/components/schemas/StructuredLogType'
const: span_start
default: span_start
description: >-
Payload type identifier set to SPAN_START
name:
type: string
description: >-
Human-readable name describing the operation this span represents
parent_span_id:
type: string
description: >-
(Optional) Unique identifier for the parent span, if this is a child span
additionalProperties: false
required:
- type
- name
title: SpanStartPayload
description: Payload for a span start event.
SpanStatus:
type: string
enum:
- ok
- error
title: SpanStatus
description: >-
The status of a span indicating whether it completed successfully or with
an error.
StructuredLogEvent:
type: object
properties:
trace_id:
type: string
description: >-
Unique identifier for the trace this event belongs to
span_id:
type: string
description: >-
Unique identifier for the span this event belongs to
timestamp:
type: string
format: date-time
description: Timestamp when the event occurred
attributes:
type: object
additionalProperties:
oneOf:
- type: string
- type: integer
- type: number
- type: boolean
- type: 'null'
description: >-
(Optional) Key-value pairs containing additional metadata about the event
type:
$ref: '#/components/schemas/EventType'
const: structured_log
default: structured_log
description: >-
Event type identifier set to STRUCTURED_LOG
payload:
oneOf:
- $ref: '#/components/schemas/SpanStartPayload'
- $ref: '#/components/schemas/SpanEndPayload'
discriminator:
propertyName: type
mapping:
span_start: '#/components/schemas/SpanStartPayload'
span_end: '#/components/schemas/SpanEndPayload'
description: >-
The structured payload data for the log event
additionalProperties: false
required:
- trace_id
- span_id
- timestamp
- type
- payload
title: StructuredLogEvent
description: >-
A structured log event containing typed payload data.
StructuredLogType:
type: string
enum:
- span_start
- span_end
title: StructuredLogType
description: >-
The type of structured log event payload.
UnstructuredLogEvent:
type: object
properties:
trace_id:
type: string
description: >-
Unique identifier for the trace this event belongs to
span_id:
type: string
description: >-
Unique identifier for the span this event belongs to
timestamp:
type: string
format: date-time
description: Timestamp when the event occurred
attributes:
type: object
additionalProperties:
oneOf:
- type: string
- type: integer
- type: number
- type: boolean
- type: 'null'
description: >-
(Optional) Key-value pairs containing additional metadata about the event
type:
$ref: '#/components/schemas/EventType'
const: unstructured_log
default: unstructured_log
description: >-
Event type identifier set to UNSTRUCTURED_LOG
message:
type: string
description: The log message text
severity:
$ref: '#/components/schemas/LogSeverity'
description: The severity level of the log message
additionalProperties: false
required:
- trace_id
- span_id
- timestamp
- type
- message
- severity
title: UnstructuredLogEvent
description: >-
An unstructured log event containing a simple text message.
LogEventRequest:
type: object
properties:
event:
$ref: '#/components/schemas/Event'
description: The event to log.
ttl_seconds:
type: integer
description: The time to live of the event.
additionalProperties: false
required:
- event
- ttl_seconds
title: LogEventRequest
InvokeToolRequest: InvokeToolRequest:
type: object type: object
properties: properties:
@ -9231,7 +8990,7 @@ components:
enum: enum:
- model - model
- shield - shield
- vector_db - vector_store
- dataset - dataset
- scoring_function - scoring_function
- benchmark - benchmark
@ -10417,9 +10176,9 @@ tags:
- `background` - `background`
x-displayName: Agents x-displayName: Agents
- name: Conversations - name: Conversations
description: '' description: >-
x-displayName: >-
Protocol for conversation management operations. Protocol for conversation management operations.
x-displayName: Conversations
- name: Files - name: Files
description: >- description: >-
This API is used to upload documents that can be used with other Llama Stack This API is used to upload documents that can be used with other Llama Stack
@ -10465,8 +10224,6 @@ tags:
description: '' description: ''
- name: SyntheticDataGeneration (Coming Soon) - name: SyntheticDataGeneration (Coming Soon)
description: '' description: ''
- name: Telemetry
description: ''
- name: ToolGroups - name: ToolGroups
description: '' description: ''
- name: ToolRuntime - name: ToolRuntime
@ -10489,7 +10246,6 @@ x-tagGroups:
- ScoringFunctions - ScoringFunctions
- Shields - Shields
- SyntheticDataGeneration (Coming Soon) - SyntheticDataGeneration (Coming Soon)
- Telemetry
- ToolGroups - ToolGroups
- ToolRuntime - ToolRuntime
- VectorIO - VectorIO

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -78,17 +78,14 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
## Build, Configure, and Run Llama Stack ## Build, Configure, and Run Llama Stack
1. **Build the Llama Stack**: 1. **Install dependencies**:
Build the Llama Stack using the `starter` template:
```bash ```bash
uv run --with llama-stack llama stack build --distro starter --image-type venv llama stack list-deps starter | xargs -L1 uv pip install
``` ```
**Expected Output:**
2. **Start the distribution**:
```bash ```bash
... llama stack run starter
Build Successful!
You can find the newly-built template here: ~/.llama/distributions/starter/starter-run.yaml
You can run the new Llama Stack Distro via: uv run --with llama-stack llama stack run starter
``` ```
3. **Set the ENV variables by exporting them to the terminal**: 3. **Set the ENV variables by exporting them to the terminal**:
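Once the quickstart steps above have the `starter` distribution running, a quick smoke test can confirm the server is reachable. The sketch below is not part of this commit: it assumes the `llama-stack-client` package is installed and that the server listens on the default local port 8321.

```python
# Minimal sketch (not from the diff): verify a locally running `starter`
# distribution responds. Port 8321 is the assumed default; adjust base_url
# if the server was started with a different --port.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Listing models is a cheap way to confirm the stack is up and providers loaded.
for model in client.models.list():
    print(model.identifier)
```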


@ -43,17 +43,17 @@ from .openai_responses import (
@json_schema_type @json_schema_type
class ResponseShieldSpec(BaseModel): class ResponseGuardrailSpec(BaseModel):
"""Specification for a shield to apply during response generation. """Specification for a guardrail to apply during response generation.
:param type: The type/identifier of the shield. :param type: The type/identifier of the guardrail.
""" """
type: str type: str
# TODO: more fields to be added for shield configuration # TODO: more fields to be added for guardrail configuration
ResponseShield = str | ResponseShieldSpec ResponseGuardrail = str | ResponseGuardrailSpec
class Attachment(BaseModel): class Attachment(BaseModel):
@ -820,10 +820,10 @@ class Agents(Protocol):
tools: list[OpenAIResponseInputTool] | None = None, tools: list[OpenAIResponseInputTool] | None = None,
include: list[str] | None = None, include: list[str] | None = None,
max_infer_iters: int | None = 10, # this is an extension to the OpenAI API max_infer_iters: int | None = 10, # this is an extension to the OpenAI API
shields: Annotated[ guardrails: Annotated[
list[ResponseShield] | None, list[ResponseGuardrail] | None,
ExtraBodyField( ExtraBodyField(
"List of shields to apply during response generation. Shields provide safety and content moderation." "List of guardrails to apply during response generation. Guardrails provide safety and content moderation."
), ),
] = None, ] = None,
) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]: ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
@ -834,7 +834,7 @@ class Agents(Protocol):
:param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses. :param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
:param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation. :param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
:param include: (Optional) Additional fields to include in the response. :param include: (Optional) Additional fields to include in the response.
:param shields: (Optional) List of shields to apply during response generation. Can be shield IDs (strings) or shield specifications. :param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications.
:returns: An OpenAIResponseObject. :returns: An OpenAIResponseObject.
""" """
... ...
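Because `guardrails` is exposed as an extra-body parameter on the responses endpoint (see the `x-llama-stack-extra-body-params` change earlier in this diff), an OpenAI-compatible client can pass it through `extra_body`. The snippet below is a hedged sketch, not code from this commit: the base URL, model id, and guardrail id are illustrative placeholders.

```python
# Sketch only: passing the new `guardrails` extra-body field to the responses
# API through the OpenAI Python client. Base URL, model, and guardrail id are
# placeholders, not values taken from this commit.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

response = client.responses.create(
    model="llama3.2:3b",  # placeholder model id
    input="Summarize the weather policy for travelers.",
    extra_body={
        # `guardrails` accepts guardrail ids (strings) or
        # ResponseGuardrailSpec-shaped objects such as {"type": "llama-guard"}.
        "guardrails": ["llama-guard"],
    },
)
print(response.output_text)
```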


@ -131,8 +131,20 @@ class OpenAIResponseOutputMessageContentOutputText(BaseModel):
annotations: list[OpenAIResponseAnnotations] = Field(default_factory=list) annotations: list[OpenAIResponseAnnotations] = Field(default_factory=list)
@json_schema_type
class OpenAIResponseContentPartRefusal(BaseModel):
"""Refusal content within a streamed response part.
:param type: Content part type identifier, always "refusal"
:param refusal: Refusal text supplied by the model
"""
type: Literal["refusal"] = "refusal"
refusal: str
OpenAIResponseOutputMessageContent = Annotated[ OpenAIResponseOutputMessageContent = Annotated[
OpenAIResponseOutputMessageContentOutputText, OpenAIResponseOutputMessageContentOutputText | OpenAIResponseContentPartRefusal,
Field(discriminator="type"), Field(discriminator="type"),
] ]
register_schema(OpenAIResponseOutputMessageContent, name="OpenAIResponseOutputMessageContent") register_schema(OpenAIResponseOutputMessageContent, name="OpenAIResponseOutputMessageContent")
@ -533,6 +545,7 @@ class OpenAIResponseObject(BaseModel):
:param tools: (Optional) An array of tools the model may call while generating a response. :param tools: (Optional) An array of tools the model may call while generating a response.
:param truncation: (Optional) Truncation strategy applied to the response :param truncation: (Optional) Truncation strategy applied to the response
:param usage: (Optional) Token usage information for the response :param usage: (Optional) Token usage information for the response
:param instructions: (Optional) System message inserted into the model's context
""" """
created_at: int created_at: int
@ -552,6 +565,7 @@ class OpenAIResponseObject(BaseModel):
tools: list[OpenAIResponseTool] | None = None tools: list[OpenAIResponseTool] | None = None
truncation: str | None = None truncation: str | None = None
usage: OpenAIResponseUsage | None = None usage: OpenAIResponseUsage | None = None
instructions: str | None = None
@json_schema_type @json_schema_type
@ -878,18 +892,6 @@ class OpenAIResponseContentPartOutputText(BaseModel):
logprobs: list[dict[str, Any]] | None = None logprobs: list[dict[str, Any]] | None = None
@json_schema_type
class OpenAIResponseContentPartRefusal(BaseModel):
"""Refusal content within a streamed response part.
:param type: Content part type identifier, always "refusal"
:param refusal: Refusal text supplied by the model
"""
type: Literal["refusal"] = "refusal"
refusal: str
@json_schema_type @json_schema_type
class OpenAIResponseContentPartReasoningText(BaseModel): class OpenAIResponseContentPartReasoningText(BaseModel):
"""Reasoning text emitted as part of a streamed response. """Reasoning text emitted as part of a streamed response.
@ -1258,9 +1260,9 @@ OpenAIResponseInput = Annotated[
| OpenAIResponseInputFunctionToolCallOutput | OpenAIResponseInputFunctionToolCallOutput
| OpenAIResponseMCPApprovalRequest | OpenAIResponseMCPApprovalRequest
| OpenAIResponseMCPApprovalResponse | OpenAIResponseMCPApprovalResponse
| | OpenAIResponseOutputMessageMCPCall
# Fallback to the generic message type as a last resort | OpenAIResponseOutputMessageMCPListTools
OpenAIResponseMessage, | OpenAIResponseMessage,
Field(union_mode="left_to_right"), Field(union_mode="left_to_right"),
] ]
register_schema(OpenAIResponseInput, name="OpenAIResponseInput") register_schema(OpenAIResponseInput, name="OpenAIResponseInput")
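To illustrate the effect of adding the refusal part to the output-content union above, here is a small validation sketch (not from the commit); it assumes these names remain importable from `llama_stack.apis.agents.openai_responses`, the module referenced by the imports elsewhere in this diff.

```python
# Sketch: with OpenAIResponseContentPartRefusal added to the
# OpenAIResponseOutputMessageContent union, a "refusal" payload now validates
# through the "type" discriminator instead of failing.
from pydantic import TypeAdapter

from llama_stack.apis.agents.openai_responses import (
    OpenAIResponseContentPartRefusal,
    OpenAIResponseOutputMessageContent,
)

part = TypeAdapter(OpenAIResponseOutputMessageContent).validate_python(
    {"type": "refusal", "refusal": "I can't help with that request."}
)
assert isinstance(part, OpenAIResponseContentPartRefusal)
print(part.refusal)
```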


@ -12,6 +12,9 @@ from openai.types.responses.response_includable import ResponseIncludable
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from llama_stack.apis.agents.openai_responses import ( from llama_stack.apis.agents.openai_responses import (
OpenAIResponseInputFunctionToolCallOutput,
OpenAIResponseMCPApprovalRequest,
OpenAIResponseMCPApprovalResponse,
OpenAIResponseMessage, OpenAIResponseMessage,
OpenAIResponseOutputMessageFileSearchToolCall, OpenAIResponseOutputMessageFileSearchToolCall,
OpenAIResponseOutputMessageFunctionToolCall, OpenAIResponseOutputMessageFunctionToolCall,
@ -61,9 +64,14 @@ class ConversationMessage(BaseModel):
ConversationItem = Annotated[ ConversationItem = Annotated[
OpenAIResponseMessage OpenAIResponseMessage
| OpenAIResponseOutputMessageFunctionToolCall
| OpenAIResponseOutputMessageFileSearchToolCall
| OpenAIResponseOutputMessageWebSearchToolCall | OpenAIResponseOutputMessageWebSearchToolCall
| OpenAIResponseOutputMessageFileSearchToolCall
| OpenAIResponseOutputMessageFunctionToolCall
| OpenAIResponseInputFunctionToolCallOutput
| OpenAIResponseMCPApprovalRequest
| OpenAIResponseMCPApprovalResponse
| OpenAIResponseOutputMessageMCPCall
| OpenAIResponseOutputMessageMCPListTools
| OpenAIResponseOutputMessageMCPCall | OpenAIResponseOutputMessageMCPCall
| OpenAIResponseOutputMessageMCPListTools, | OpenAIResponseOutputMessageMCPListTools,
Field(discriminator="type"), Field(discriminator="type"),
@ -165,7 +173,9 @@ class ConversationItemDeletedResource(BaseModel):
@runtime_checkable @runtime_checkable
@trace_protocol @trace_protocol
class Conversations(Protocol): class Conversations(Protocol):
"""Protocol for conversation management operations.""" """Conversations
Protocol for conversation management operations."""
@webmethod(route="/conversations", method="POST", level=LLAMA_STACK_API_V1) @webmethod(route="/conversations", method="POST", level=LLAMA_STACK_API_V1)
async def create_conversation( async def create_conversation(
@ -173,6 +183,8 @@ class Conversations(Protocol):
) -> Conversation: ) -> Conversation:
"""Create a conversation. """Create a conversation.
Create a conversation.
:param items: Initial items to include in the conversation context. :param items: Initial items to include in the conversation context.
:param metadata: Set of key-value pairs that can be attached to an object. :param metadata: Set of key-value pairs that can be attached to an object.
:returns: The created conversation object. :returns: The created conversation object.
@ -181,7 +193,9 @@ class Conversations(Protocol):
@webmethod(route="/conversations/{conversation_id}", method="GET", level=LLAMA_STACK_API_V1) @webmethod(route="/conversations/{conversation_id}", method="GET", level=LLAMA_STACK_API_V1)
async def get_conversation(self, conversation_id: str) -> Conversation: async def get_conversation(self, conversation_id: str) -> Conversation:
"""Get a conversation with the given ID. """Retrieve a conversation.
Get a conversation with the given ID.
:param conversation_id: The conversation identifier. :param conversation_id: The conversation identifier.
:returns: The conversation object. :returns: The conversation object.
@ -190,7 +204,9 @@ class Conversations(Protocol):
@webmethod(route="/conversations/{conversation_id}", method="POST", level=LLAMA_STACK_API_V1) @webmethod(route="/conversations/{conversation_id}", method="POST", level=LLAMA_STACK_API_V1)
async def update_conversation(self, conversation_id: str, metadata: Metadata) -> Conversation: async def update_conversation(self, conversation_id: str, metadata: Metadata) -> Conversation:
"""Update a conversation's metadata with the given ID. """Update a conversation.
Update a conversation's metadata with the given ID.
:param conversation_id: The conversation identifier. :param conversation_id: The conversation identifier.
:param metadata: Set of key-value pairs that can be attached to an object. :param metadata: Set of key-value pairs that can be attached to an object.
@ -200,7 +216,9 @@ class Conversations(Protocol):
@webmethod(route="/conversations/{conversation_id}", method="DELETE", level=LLAMA_STACK_API_V1) @webmethod(route="/conversations/{conversation_id}", method="DELETE", level=LLAMA_STACK_API_V1)
async def openai_delete_conversation(self, conversation_id: str) -> ConversationDeletedResource: async def openai_delete_conversation(self, conversation_id: str) -> ConversationDeletedResource:
"""Delete a conversation with the given ID. """Delete a conversation.
Delete a conversation with the given ID.
:param conversation_id: The conversation identifier. :param conversation_id: The conversation identifier.
:returns: The deleted conversation resource. :returns: The deleted conversation resource.
@ -209,7 +227,9 @@ class Conversations(Protocol):
@webmethod(route="/conversations/{conversation_id}/items", method="POST", level=LLAMA_STACK_API_V1) @webmethod(route="/conversations/{conversation_id}/items", method="POST", level=LLAMA_STACK_API_V1)
async def add_items(self, conversation_id: str, items: list[ConversationItem]) -> ConversationItemList: async def add_items(self, conversation_id: str, items: list[ConversationItem]) -> ConversationItemList:
"""Create items in the conversation. """Create items.
Create items in the conversation.
:param conversation_id: The conversation identifier. :param conversation_id: The conversation identifier.
:param items: Items to include in the conversation context. :param items: Items to include in the conversation context.
@ -219,7 +239,9 @@ class Conversations(Protocol):
@webmethod(route="/conversations/{conversation_id}/items/{item_id}", method="GET", level=LLAMA_STACK_API_V1) @webmethod(route="/conversations/{conversation_id}/items/{item_id}", method="GET", level=LLAMA_STACK_API_V1)
async def retrieve(self, conversation_id: str, item_id: str) -> ConversationItem: async def retrieve(self, conversation_id: str, item_id: str) -> ConversationItem:
"""Retrieve a conversation item. """Retrieve an item.
Retrieve a conversation item.
:param conversation_id: The conversation identifier. :param conversation_id: The conversation identifier.
:param item_id: The item identifier. :param item_id: The item identifier.
@ -236,7 +258,9 @@ class Conversations(Protocol):
limit: int | NotGiven = NOT_GIVEN, limit: int | NotGiven = NOT_GIVEN,
order: Literal["asc", "desc"] | NotGiven = NOT_GIVEN, order: Literal["asc", "desc"] | NotGiven = NOT_GIVEN,
) -> ConversationItemList: ) -> ConversationItemList:
"""List items in the conversation. """List items.
List items in the conversation.
:param conversation_id: The conversation identifier. :param conversation_id: The conversation identifier.
:param after: An item ID to list items after, used in pagination. :param after: An item ID to list items after, used in pagination.
@ -251,7 +275,9 @@ class Conversations(Protocol):
async def openai_delete_conversation_item( async def openai_delete_conversation_item(
self, conversation_id: str, item_id: str self, conversation_id: str, item_id: str
) -> ConversationItemDeletedResource: ) -> ConversationItemDeletedResource:
"""Delete a conversation item. """Delete an item.
Delete a conversation item.
:param conversation_id: The conversation identifier. :param conversation_id: The conversation identifier.
:param item_id: The item identifier. :param item_id: The item identifier.


@ -121,6 +121,7 @@ class Api(Enum, metaclass=DynamicApiMeta):
models = "models" models = "models"
shields = "shields" shields = "shields"
vector_stores = "vector_stores" # only used for routing table
datasets = "datasets" datasets = "datasets"
scoring_functions = "scoring_functions" scoring_functions = "scoring_functions"
benchmarks = "benchmarks" benchmarks = "benchmarks"


@ -82,7 +82,9 @@ class EvaluateResponse(BaseModel):
class Eval(Protocol): class Eval(Protocol):
"""Llama Stack Evaluation API for running evaluations on model and agent candidates.""" """Evaluations
Llama Stack Evaluation API for running evaluations on model and agent candidates."""
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1, deprecated=True) @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1ALPHA) @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST", level=LLAMA_STACK_API_V1ALPHA)


@ -13,7 +13,7 @@ from pydantic import BaseModel, Field
class ResourceType(StrEnum): class ResourceType(StrEnum):
model = "model" model = "model"
shield = "shield" shield = "shield"
vector_db = "vector_db" vector_store = "vector_store"
dataset = "dataset" dataset = "dataset"
scoring_function = "scoring_function" scoring_function = "scoring_function"
benchmark = "benchmark" benchmark = "benchmark"
@ -34,4 +34,4 @@ class Resource(BaseModel):
provider_id: str = Field(description="ID of the provider that owns this resource") provider_id: str = Field(description="ID of the provider that owns this resource")
type: ResourceType = Field(description="Type of resource (e.g. 'model', 'shield', 'vector_db', etc.)") type: ResourceType = Field(description="Type of resource (e.g. 'model', 'shield', 'vector_store', etc.)")

Some files were not shown because too many files have changed in this diff