Merge branch 'main' into toolcall-arg-recursive-type

This commit is contained in:
Ben Keith 2025-11-13 11:00:48 -05:00 committed by GitHub
commit 113cb4cd65
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
1665 changed files with 65332 additions and 619376 deletions

View file

@ -5,7 +5,7 @@ omit =
*/llama_stack/templates/* */llama_stack/templates/*
.venv/* .venv/*
*/llama_stack/cli/scripts/* */llama_stack/cli/scripts/*
*/llama_stack/ui/* */llama_stack_ui/*
*/llama_stack/distribution/ui/* */llama_stack/distribution/ui/*
*/llama_stack/strong_typing/* */llama_stack/strong_typing/*
*/llama_stack/env.py */llama_stack/env.py

View file

@ -72,7 +72,8 @@ runs:
echo "New recordings detected, committing and pushing" echo "New recordings detected, committing and pushing"
git add tests/integration/ git add tests/integration/
git commit -m "Recordings update from CI (suite: ${{ inputs.suite }})" git commit -m "Recordings update from CI (setup: ${{ inputs.setup }}, suite: ${{ inputs.suite }})"
git fetch origin ${{ github.ref_name }} git fetch origin ${{ github.ref_name }}
git rebase origin/${{ github.ref_name }} git rebase origin/${{ github.ref_name }}
echo "Rebased successfully" echo "Rebased successfully"
@ -88,6 +89,8 @@ runs:
run: | run: |
# Ollama logs (if ollama container exists) # Ollama logs (if ollama container exists)
sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log 2>&1 || true sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log 2>&1 || true
# vllm logs (if vllm container exists)
sudo docker logs vllm > vllm-${{ inputs.inference-mode }}.log 2>&1 || true
# Note: distro container logs are now dumped in integration-tests.sh before container is removed # Note: distro container logs are now dumped in integration-tests.sh before container is removed
- name: Upload logs - name: Upload logs

View file

@ -39,6 +39,32 @@ runs:
if: ${{ inputs.setup == 'vllm' && inputs.inference-mode == 'record' }} if: ${{ inputs.setup == 'vllm' && inputs.inference-mode == 'record' }}
uses: ./.github/actions/setup-vllm uses: ./.github/actions/setup-vllm
- name: Start Postgres service
if: ${{ contains(inputs.setup, 'postgres') }}
shell: bash
run: |
sudo docker rm -f postgres-ci || true
sudo docker run -d --name postgres-ci \
-e POSTGRES_USER=llamastack \
-e POSTGRES_PASSWORD=llamastack \
-e POSTGRES_DB=llamastack \
-p 5432:5432 \
postgres:16
echo "Waiting for Postgres to become ready..."
for i in {1..30}; do
if sudo docker exec postgres-ci pg_isready -U llamastack -d llamastack >/dev/null 2>&1; then
echo "Postgres is ready"
break
fi
if [ "$i" -eq 30 ]; then
echo "Postgres failed to start in time"
sudo docker logs postgres-ci || true
exit 1
fi
sleep 2
done
- name: Build Llama Stack - name: Build Llama Stack
shell: bash shell: bash
run: | run: |

View file

@ -11,13 +11,14 @@ runs:
--name vllm \ --name vllm \
-p 8000:8000 \ -p 8000:8000 \
--privileged=true \ --privileged=true \
quay.io/higginsd/vllm-cpu:65393ee064 \ quay.io/higginsd/vllm-cpu:65393ee064-qwen3 \
--host 0.0.0.0 \ --host 0.0.0.0 \
--port 8000 \ --port 8000 \
--enable-auto-tool-choice \ --enable-auto-tool-choice \
--tool-call-parser llama3_json \ --tool-call-parser hermes \
--model /root/.cache/Llama-3.2-1B-Instruct \ --model /root/.cache/Qwen3-0.6B \
--served-model-name meta-llama/Llama-3.2-1B-Instruct --served-model-name Qwen/Qwen3-0.6B \
--max-model-len 8192
# Wait for vllm to be ready # Wait for vllm to be ready
echo "Waiting for vllm to be ready..." echo "Waiting for vllm to be ready..."

View file

@ -22,7 +22,7 @@ updates:
prefix: chore(python-deps) prefix: chore(python-deps)
- package-ecosystem: npm - package-ecosystem: npm
directory: "/llama_stack/ui" directory: "/llama_stack_ui"
schedule: schedule:
interval: "weekly" interval: "weekly"
day: "saturday" day: "saturday"

View file

@ -18,6 +18,7 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
| Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project | | Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
| Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration | | Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |
| Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec | | Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec |
| Stainless SDK Builds | [stainless-builds.yml](stainless-builds.yml) | Build Stainless SDK from OpenAPI spec changes |
| Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action | | Close stale issues and PRs | [stale_bot.yml](stale_bot.yml) | Run the Stale Bot action |
| Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module | | Test External Providers Installed via Module | [test-external-provider-module.yml](test-external-provider-module.yml) | Test External Provider installation via Python module |
| Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms | | Test External API and Providers | [test-external.yml](test-external.yml) | Test the External API and Provider mechanisms |

View file

@ -14,7 +14,7 @@ on:
paths: paths:
- 'distributions/**' - 'distributions/**'
- 'src/llama_stack/**' - 'src/llama_stack/**'
- '!src/llama_stack/ui/**' - '!src/llama_stack_ui/**'
- 'tests/integration/**' - 'tests/integration/**'
- 'uv.lock' - 'uv.lock'
- 'pyproject.toml' - 'pyproject.toml'

View file

@ -14,7 +14,7 @@ on:
types: [opened, synchronize, reopened] types: [opened, synchronize, reopened]
paths: paths:
- 'src/llama_stack/**' - 'src/llama_stack/**'
- '!src/llama_stack/ui/**' - '!src/llama_stack_ui/**'
- 'tests/**' - 'tests/**'
- 'uv.lock' - 'uv.lock'
- 'pyproject.toml' - 'pyproject.toml'
@ -23,10 +23,10 @@ on:
- '.github/actions/setup-test-environment/action.yml' - '.github/actions/setup-test-environment/action.yml'
- '.github/actions/run-and-record-tests/action.yml' - '.github/actions/run-and-record-tests/action.yml'
- 'scripts/integration-tests.sh' - 'scripts/integration-tests.sh'
- 'scripts/generate_ci_matrix.py'
schedule: schedule:
# If changing the cron schedule, update the provider in the test-matrix job # If changing the cron schedule, update the provider in the test-matrix job
- cron: '0 0 * * *' # (test latest client) Daily at 12 AM UTC - cron: '0 0 * * *' # (test latest client) Daily at 12 AM UTC
- cron: '1 0 * * 0' # (test vllm) Weekly on Sunday at 1 AM UTC
workflow_dispatch: workflow_dispatch:
inputs: inputs:
test-all-client-versions: test-all-client-versions:
@ -44,36 +44,47 @@ concurrency:
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:
generate-matrix:
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Generate test matrix
id: set-matrix
run: |
# Generate matrix from CI_MATRIX in tests/integration/suites.py
# Supports schedule-based and manual input overrides
MATRIX=$(PYTHONPATH=. python3 scripts/generate_ci_matrix.py \
--schedule "${{ github.event.schedule }}" \
--test-setup "${{ github.event.inputs.test-setup }}")
echo "matrix=$MATRIX" >> $GITHUB_OUTPUT
echo "Generated matrix: $MATRIX"
run-replay-mode-tests: run-replay-mode-tests:
needs: generate-matrix
runs-on: ubuntu-latest runs-on: ubuntu-latest
name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.config.setup, matrix.python-version, matrix.client-version, matrix.config.suite) }} name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client, matrix.config.setup, matrix.python-version, matrix.client-version, matrix.config.suite) }}
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
client-type: [library, docker, server] client: [library, docker, server]
# Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12 # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }} python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }} client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
# Define (setup, suite) pairs - they are always matched and cannot be independent # Test configurations: Generated from CI_MATRIX in tests/integration/suites.py
# Weekly schedule (Sun 1 AM): vllm+base # See scripts/generate_ci_matrix.py for generation logic
# Input test-setup=ollama-vision: ollama-vision+vision config: ${{ fromJSON(needs.generate-matrix.outputs.matrix).include }}
# Default (including test-setup=ollama): ollama+base, ollama-vision+vision, gpt+responses
config: >-
${{
github.event.schedule == '1 0 * * 0'
&& fromJSON('[{"setup": "vllm", "suite": "base"}]')
|| github.event.inputs.test-setup == 'ollama-vision'
&& fromJSON('[{"setup": "ollama-vision", "suite": "vision"}]')
|| fromJSON('[{"setup": "ollama", "suite": "base"}, {"setup": "ollama-vision", "suite": "vision"}, {"setup": "gpt", "suite": "responses"}]')
}}
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Setup test environment - name: Setup test environment
if: ${{ matrix.config.allowed_clients == null || contains(matrix.config.allowed_clients, matrix.client) }}
uses: ./.github/actions/setup-test-environment uses: ./.github/actions/setup-test-environment
with: with:
python-version: ${{ matrix.python-version }} python-version: ${{ matrix.python-version }}
@ -83,11 +94,16 @@ jobs:
inference-mode: 'replay' inference-mode: 'replay'
- name: Run tests - name: Run tests
if: ${{ matrix.config.allowed_clients == null || contains(matrix.config.allowed_clients, matrix.client) }}
uses: ./.github/actions/run-and-record-tests uses: ./.github/actions/run-and-record-tests
env: env:
OPENAI_API_KEY: dummy OPENAI_API_KEY: dummy
with: with:
stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || matrix.client-type == 'server' && 'server:ci-tests' || 'docker:ci-tests' }} stack-config: >-
${{ matrix.config.stack_config
|| (matrix.client == 'library' && 'ci-tests')
|| (matrix.client == 'server' && 'server:ci-tests')
|| 'docker:ci-tests' }}
setup: ${{ matrix.config.setup }} setup: ${{ matrix.config.setup }}
inference-mode: 'replay' inference-mode: 'replay'
suite: ${{ matrix.config.suite }} suite: ${{ matrix.config.suite }}

View file

@ -13,7 +13,7 @@ on:
- 'release-[0-9]+.[0-9]+.x' - 'release-[0-9]+.[0-9]+.x'
paths: paths:
- 'src/llama_stack/**' - 'src/llama_stack/**'
- '!src/llama_stack/ui/**' - '!src/llama_stack_ui/**'
- 'tests/integration/vector_io/**' - 'tests/integration/vector_io/**'
- 'uv.lock' - 'uv.lock'
- 'pyproject.toml' - 'pyproject.toml'

View file

@ -43,14 +43,14 @@ jobs:
with: with:
node-version: '20' node-version: '20'
cache: 'npm' cache: 'npm'
cache-dependency-path: 'src/llama_stack/ui/' cache-dependency-path: 'src/llama_stack_ui/'
- name: Set up uv - name: Set up uv
uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2 uses: astral-sh/setup-uv@85856786d1ce8acfbcc2f13a5f3fbd6b938f9f41 # v7.1.2
- name: Install npm dependencies - name: Install npm dependencies
run: npm ci run: npm ci
working-directory: src/llama_stack/ui working-directory: src/llama_stack_ui
- name: Install pre-commit - name: Install pre-commit
run: python -m pip install pre-commit run: python -m pip install pre-commit
@ -165,3 +165,14 @@ jobs:
echo "::error::Full mypy failed. Reproduce locally with 'uv run pre-commit run mypy-full --hook-stage manual --all-files'." echo "::error::Full mypy failed. Reproduce locally with 'uv run pre-commit run mypy-full --hook-stage manual --all-files'."
fi fi
exit $status exit $status
- name: Check if any unused recordings
run: |
set -e
PYTHONPATH=$PWD uv run ./scripts/cleanup_recordings.py --delete
changes=$(git status --short tests/integration | grep 'recordings' || true)
if [ -n "$changes" ]; then
echo "::error::Unused integration recordings detected. Run 'PYTHONPATH=$(pwd) uv run ./scripts/cleanup_recordings.py --delete' locally and commit the deletions."
echo "$changes"
exit 1
fi

View file

@ -10,7 +10,7 @@ on:
branches: branches:
- main - main
paths-ignore: paths-ignore:
- 'src/llama_stack/ui/**' - 'src/llama_stack_ui/**'
jobs: jobs:
build: build:

110
.github/workflows/stainless-builds.yml vendored Normal file
View file

@ -0,0 +1,110 @@
name: Stainless SDK Builds
run-name: Build Stainless SDK from OpenAPI spec changes
# This workflow uses pull_request_target, which allows it to run on pull requests
# from forks with access to secrets. This is safe because the workflow definition
# comes from the base branch (trusted), and the action only reads OpenAPI spec
# files without executing any code from the PR.
on:
pull_request_target:
types:
- opened
- synchronize
- reopened
- closed
paths:
- "client-sdks/stainless/**"
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
cancel-in-progress: true
env:
# Stainless organization name.
STAINLESS_ORG: llamastack
# Stainless project name.
STAINLESS_PROJECT: llama-stack-client
# Path to your OpenAPI spec.
OAS_PATH: ./client-sdks/stainless/openapi.yml
# Path to your Stainless config. Optional; only provide this if you prefer
# to maintain the ground truth Stainless config in your own repo.
CONFIG_PATH: ./client-sdks/stainless/config.yml
# When to fail the job based on build conclusion.
# Options: "never" | "note" | "warning" | "error" | "fatal".
FAIL_ON: error
# In your repo secrets, configure:
# - STAINLESS_API_KEY: a Stainless API key, which you can generate on the
# Stainless organization dashboard
jobs:
preview:
if: github.event.action != 'closed'
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write
steps:
# Checkout the PR's code to access the OpenAPI spec and config files.
# This is necessary to read the spec/config from the PR (including from forks).
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: ${{ github.event.pull_request.head.repo.full_name }}
ref: ${{ github.event.pull_request.head.sha }}
fetch-depth: 2
# This action builds preview SDKs from the OpenAPI spec changes and
# posts/updates a comment on the PR with build results and links to the preview.
- name: Run preview builds
uses: stainless-api/upload-openapi-spec-action/preview@32823b096b4319c53ee948d702d9052873af485f # 1.6.0
with:
stainless_api_key: ${{ secrets.STAINLESS_API_KEY }}
org: ${{ env.STAINLESS_ORG }}
project: ${{ env.STAINLESS_PROJECT }}
oas_path: ${{ env.OAS_PATH }}
config_path: ${{ env.CONFIG_PATH }}
fail_on: ${{ env.FAIL_ON }}
base_sha: ${{ github.event.pull_request.base.sha }}
base_ref: ${{ github.event.pull_request.base.ref }}
head_sha: ${{ github.event.pull_request.head.sha }}
merge:
if: github.event.action == 'closed' && github.event.pull_request.merged == true
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write
steps:
# Checkout the PR's code to access the OpenAPI spec and config files.
# This is necessary to read the spec/config from the PR (including from forks).
- name: Checkout repository
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: ${{ github.event.pull_request.head.repo.full_name }}
ref: ${{ github.event.pull_request.head.sha }}
fetch-depth: 2
# Note that this only merges in changes that happened on the last build on
# preview/${{ github.head_ref }}. It's possible that there are OAS/config
# changes that haven't been built, if the preview job didn't finish
# before this step starts. In theory we want to wait for all builds
# against preview/${{ github.head_ref }} to complete, but assuming that
# the preview job happens before the PR merge, it should be fine.
- name: Run merge build
uses: stainless-api/upload-openapi-spec-action/merge@32823b096b4319c53ee948d702d9052873af485f # 1.6.0
with:
stainless_api_key: ${{ secrets.STAINLESS_API_KEY }}
org: ${{ env.STAINLESS_ORG }}
project: ${{ env.STAINLESS_PROJECT }}
oas_path: ${{ env.OAS_PATH }}
config_path: ${{ env.CONFIG_PATH }}
fail_on: ${{ env.FAIL_ON }}
base_sha: ${{ github.event.pull_request.base.sha }}
base_ref: ${{ github.event.pull_request.base.ref }}
head_sha: ${{ github.event.pull_request.head.sha }}

View file

@ -9,7 +9,7 @@ on:
branches: [ main ] branches: [ main ]
paths: paths:
- 'src/llama_stack/**' - 'src/llama_stack/**'
- '!src/llama_stack/ui/**' - '!src/llama_stack_ui/**'
- 'tests/integration/**' - 'tests/integration/**'
- 'uv.lock' - 'uv.lock'
- 'pyproject.toml' - 'pyproject.toml'

View file

@ -8,7 +8,7 @@ on:
pull_request: pull_request:
branches: [ main ] branches: [ main ]
paths: paths:
- 'src/llama_stack/ui/**' - 'src/llama_stack_ui/**'
- '.github/workflows/ui-unit-tests.yml' # This workflow - '.github/workflows/ui-unit-tests.yml' # This workflow
workflow_dispatch: workflow_dispatch:
@ -33,22 +33,22 @@ jobs:
with: with:
node-version: ${{ matrix.node-version }} node-version: ${{ matrix.node-version }}
cache: 'npm' cache: 'npm'
cache-dependency-path: 'src/llama_stack/ui/package-lock.json' cache-dependency-path: 'src/llama_stack_ui/package-lock.json'
- name: Install dependencies - name: Install dependencies
working-directory: src/llama_stack/ui working-directory: src/llama_stack_ui
run: npm ci run: npm ci
- name: Run linting - name: Run linting
working-directory: src/llama_stack/ui working-directory: src/llama_stack_ui
run: npm run lint run: npm run lint
- name: Run format check - name: Run format check
working-directory: src/llama_stack/ui working-directory: src/llama_stack_ui
run: npm run format:check run: npm run format:check
- name: Run unit tests - name: Run unit tests
working-directory: src/llama_stack/ui working-directory: src/llama_stack_ui
env: env:
CI: true CI: true

View file

@ -13,7 +13,7 @@ on:
- 'release-[0-9]+.[0-9]+.x' - 'release-[0-9]+.[0-9]+.x'
paths: paths:
- 'src/llama_stack/**' - 'src/llama_stack/**'
- '!src/llama_stack/ui/**' - '!src/llama_stack_ui/**'
- 'tests/unit/**' - 'tests/unit/**'
- 'uv.lock' - 'uv.lock'
- 'pyproject.toml' - 'pyproject.toml'

View file

@ -161,7 +161,7 @@ repos:
name: Format & Lint UI name: Format & Lint UI
entry: bash ./scripts/run-ui-linter.sh entry: bash ./scripts/run-ui-linter.sh
language: system language: system
files: ^src/llama_stack/ui/.*\.(ts|tsx)$ files: ^src/llama_stack_ui/.*\.(ts|tsx)$
pass_filenames: false pass_filenames: false
require_serial: true require_serial: true

View file

@ -1,8 +1,8 @@
These are the source-of-truth configuration files used to generate the Stainless client SDKs via Stainless. These are the source-of-truth configuration files used to generate the Stainless client SDKs via Stainless.
- `openapi.yml`: this is the OpenAPI specification for the Llama Stack API. - `openapi.yml`: this is the OpenAPI specification for the Llama Stack API.
- `openapi.stainless.yml`: this is the Stainless _configuration_ which instructs Stainless how to generate the client SDKs. - `config.yml`: this is the Stainless _configuration_ which instructs Stainless how to generate the client SDKs.
A small side note: notice the `.yml` suffixes since Stainless uses that suffix typically for its configuration files. A small side note: notice the `.yml` suffixes since Stainless uses that suffix typically for its configuration files.
These files go hand-in-hand. As of now, only the `openapi.yml` file is automatically generated using the `run_openapi_generator.sh` script. These files go hand-in-hand. As of now, only the `openapi.yml` file is automatically generated using the `run_openapi_generator.sh` script.

View file

@ -0,0 +1,527 @@
# yaml-language-server: $schema=https://app.stainlessapi.com/config-internal.schema.json
organization:
# Name of your organization or company, used to determine the name of the client
# and headings.
name: llama-stack-client
docs: https://llama-stack.readthedocs.io/en/latest/
contact: llamastack@meta.com
security:
- {}
- BearerAuth: []
security_schemes:
BearerAuth:
type: http
scheme: bearer
# `targets` define the output targets and their customization options, such as
# whether to emit the Node SDK and what its package name should be.
targets:
node:
package_name: llama-stack-client
production_repo: llamastack/llama-stack-client-typescript
publish:
npm: false
python:
package_name: llama_stack_client
production_repo: llamastack/llama-stack-client-python
options:
use_uv: true
publish:
pypi: true
project_name: llama_stack_client
kotlin:
reverse_domain: com.llama_stack_client.api
production_repo: null
publish:
maven: false
go:
package_name: llama-stack-client
production_repo: llamastack/llama-stack-client-go
options:
enable_v2: true
back_compat_use_shared_package: false
# `client_settings` define settings for the API client, such as extra constructor
# arguments (used for authentication), retry behavior, idempotency, etc.
client_settings:
default_env_prefix: LLAMA_STACK_CLIENT
opts:
api_key:
type: string
read_env: LLAMA_STACK_CLIENT_API_KEY
auth: { security_scheme: BearerAuth }
nullable: true
# `environments` are a map of the name of the environment (e.g. "sandbox",
# "production") to the corresponding url to use.
environments:
production: http://any-hosted-llama-stack.com
# `pagination` defines [pagination schemes], which provide a template to match
# endpoints and generate next-page and auto-pagination helpers in the SDKs.
pagination:
- name: datasets_iterrows
type: offset
request:
dataset_id:
type: string
start_index:
type: integer
x-stainless-pagination-property:
purpose: offset_count_param
limit:
type: integer
response:
data:
type: array
items:
type: object
next_index:
type: integer
x-stainless-pagination-property:
purpose: offset_count_start_field
- name: openai_cursor_page
type: cursor
request:
limit:
type: integer
after:
type: string
x-stainless-pagination-property:
purpose: next_cursor_param
response:
data:
type: array
items: {}
has_more:
type: boolean
last_id:
type: string
x-stainless-pagination-property:
purpose: next_cursor_field
# `resources` define the structure and organization for your API, such as how
# methods and models are grouped together and accessed. See the [configuration
# guide] for more information.
#
# [configuration guide]:
# https://app.stainlessapi.com/docs/guides/configure#resources
resources:
$shared:
models:
interleaved_content_item: InterleavedContentItem
interleaved_content: InterleavedContent
param_type: ParamType
safety_violation: SafetyViolation
sampling_params: SamplingParams
scoring_result: ScoringResult
system_message: SystemMessage
query_result: RAGQueryResult
document: RAGDocument
query_config: RAGQueryConfig
toolgroups:
models:
tool_group: ToolGroup
list_tool_groups_response: ListToolGroupsResponse
methods:
register: post /v1/toolgroups
get: get /v1/toolgroups/{toolgroup_id}
list: get /v1/toolgroups
unregister: delete /v1/toolgroups/{toolgroup_id}
tools:
methods:
get: get /v1/tools/{tool_name}
list:
endpoint: get /v1/tools
paginated: false
tool_runtime:
models:
tool_def: ToolDef
tool_invocation_result: ToolInvocationResult
methods:
list_tools:
endpoint: get /v1/tool-runtime/list-tools
paginated: false
invoke_tool: post /v1/tool-runtime/invoke
subresources:
rag_tool:
methods:
insert: post /v1/tool-runtime/rag-tool/insert
query: post /v1/tool-runtime/rag-tool/query
responses:
models:
response_object_stream: OpenAIResponseObjectStream
response_object: OpenAIResponseObject
methods:
create:
type: http
endpoint: post /v1/responses
streaming:
stream_event_model: responses.response_object_stream
param_discriminator: stream
retrieve: get /v1/responses/{response_id}
list:
type: http
endpoint: get /v1/responses
delete:
type: http
endpoint: delete /v1/responses/{response_id}
subresources:
input_items:
methods:
list:
type: http
endpoint: get /v1/responses/{response_id}/input_items
prompts:
models:
prompt: Prompt
list_prompts_response: ListPromptsResponse
methods:
create: post /v1/prompts
list:
endpoint: get /v1/prompts
paginated: false
retrieve: get /v1/prompts/{prompt_id}
update: post /v1/prompts/{prompt_id}
delete: delete /v1/prompts/{prompt_id}
set_default_version: post /v1/prompts/{prompt_id}/set-default-version
subresources:
versions:
methods:
list:
endpoint: get /v1/prompts/{prompt_id}/versions
paginated: false
conversations:
models:
conversation_object: Conversation
methods:
create:
type: http
endpoint: post /v1/conversations
retrieve: get /v1/conversations/{conversation_id}
update:
type: http
endpoint: post /v1/conversations/{conversation_id}
delete:
type: http
endpoint: delete /v1/conversations/{conversation_id}
subresources:
items:
methods:
get:
type: http
endpoint: get /v1/conversations/{conversation_id}/items/{item_id}
list:
type: http
endpoint: get /v1/conversations/{conversation_id}/items
create:
type: http
endpoint: post /v1/conversations/{conversation_id}/items
inspect:
models:
healthInfo: HealthInfo
providerInfo: ProviderInfo
routeInfo: RouteInfo
versionInfo: VersionInfo
methods:
health: get /v1/health
version: get /v1/version
embeddings:
models:
create_embeddings_response: OpenAIEmbeddingsResponse
methods:
create: post /v1/embeddings
chat:
models:
chat_completion_chunk: OpenAIChatCompletionChunk
subresources:
completions:
methods:
create:
type: http
endpoint: post /v1/chat/completions
streaming:
stream_event_model: chat.chat_completion_chunk
param_discriminator: stream
list:
type: http
endpoint: get /v1/chat/completions
retrieve:
type: http
endpoint: get /v1/chat/completions/{completion_id}
completions:
methods:
create:
type: http
endpoint: post /v1/completions
streaming:
param_discriminator: stream
vector_io:
models:
queryChunksResponse: QueryChunksResponse
methods:
insert: post /v1/vector-io/insert
query: post /v1/vector-io/query
vector_stores:
models:
vector_store: VectorStoreObject
list_vector_stores_response: VectorStoreListResponse
vector_store_delete_response: VectorStoreDeleteResponse
vector_store_search_response: VectorStoreSearchResponsePage
methods:
create: post /v1/vector_stores
list:
endpoint: get /v1/vector_stores
retrieve: get /v1/vector_stores/{vector_store_id}
update: post /v1/vector_stores/{vector_store_id}
delete: delete /v1/vector_stores/{vector_store_id}
search: post /v1/vector_stores/{vector_store_id}/search
subresources:
files:
models:
vector_store_file: VectorStoreFileObject
methods:
list: get /v1/vector_stores/{vector_store_id}/files
retrieve: get /v1/vector_stores/{vector_store_id}/files/{file_id}
update: post /v1/vector_stores/{vector_store_id}/files/{file_id}
delete: delete /v1/vector_stores/{vector_store_id}/files/{file_id}
create: post /v1/vector_stores/{vector_store_id}/files
content: get /v1/vector_stores/{vector_store_id}/files/{file_id}/content
file_batches:
models:
vector_store_file_batches: VectorStoreFileBatchObject
list_vector_store_files_in_batch_response: VectorStoreFilesListInBatchResponse
methods:
create: post /v1/vector_stores/{vector_store_id}/file_batches
retrieve: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}
list_files: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files
cancel: post /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel
models:
models:
model: OpenAIModel
list_models_response: OpenAIListModelsResponse
methods:
list:
endpoint: get /v1/models
paginated: false
retrieve: get /v1/models/{model_id}
register: post /v1/models
unregister: delete /v1/models/{model_id}
subresources:
openai:
methods:
list:
endpoint: get /v1/models
paginated: false
providers:
models:
list_providers_response: ListProvidersResponse
methods:
list:
endpoint: get /v1/providers
paginated: false
retrieve: get /v1/providers/{provider_id}
routes:
models:
list_routes_response: ListRoutesResponse
methods:
list:
endpoint: get /v1/inspect/routes
paginated: false
moderations:
models:
create_response: ModerationObject
methods:
create: post /v1/moderations
safety:
models:
run_shield_response: RunShieldResponse
methods:
run_shield: post /v1/safety/run-shield
shields:
models:
shield: Shield
list_shields_response: ListShieldsResponse
methods:
retrieve: get /v1/shields/{identifier}
list:
endpoint: get /v1/shields
paginated: false
register: post /v1/shields
delete: delete /v1/shields/{identifier}
scoring:
methods:
score: post /v1/scoring/score
score_batch: post /v1/scoring/score-batch
scoring_functions:
methods:
retrieve: get /v1/scoring-functions/{scoring_fn_id}
list:
endpoint: get /v1/scoring-functions
paginated: false
register: post /v1/scoring-functions
models:
scoring_fn: ScoringFn
scoring_fn_params: ScoringFnParams
list_scoring_functions_response: ListScoringFunctionsResponse
files:
methods:
create: post /v1/files
list: get /v1/files
retrieve: get /v1/files/{file_id}
delete: delete /v1/files/{file_id}
content: get /v1/files/{file_id}/content
models:
file: OpenAIFileObject
list_files_response: ListOpenAIFileResponse
delete_file_response: OpenAIFileDeleteResponse
alpha:
subresources:
inference:
methods:
rerank: post /v1alpha/inference/rerank
post_training:
models:
algorithm_config: AlgorithmConfig
post_training_job: PostTrainingJob
list_post_training_jobs_response: ListPostTrainingJobsResponse
methods:
preference_optimize: post /v1alpha/post-training/preference-optimize
supervised_fine_tune: post /v1alpha/post-training/supervised-fine-tune
subresources:
job:
methods:
artifacts: get /v1alpha/post-training/job/artifacts
cancel: post /v1alpha/post-training/job/cancel
status: get /v1alpha/post-training/job/status
list:
endpoint: get /v1alpha/post-training/jobs
paginated: false
benchmarks:
methods:
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}
list:
endpoint: get /v1alpha/eval/benchmarks
paginated: false
register: post /v1alpha/eval/benchmarks
models:
benchmark: Benchmark
list_benchmarks_response: ListBenchmarksResponse
eval:
methods:
evaluate_rows: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
run_eval: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
evaluate_rows_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
run_eval_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
subresources:
jobs:
methods:
cancel: delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
status: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result
models:
evaluate_response: EvaluateResponse
benchmark_config: BenchmarkConfig
job: Job
beta:
subresources:
datasets:
models:
list_datasets_response: ListDatasetsResponse
methods:
register: post /v1beta/datasets
retrieve: get /v1beta/datasets/{dataset_id}
list:
endpoint: get /v1beta/datasets
paginated: false
unregister: delete /v1beta/datasets/{dataset_id}
iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
appendrows: post /v1beta/datasetio/append-rows/{dataset_id}
settings:
license: MIT
unwrap_response_fields: [data]
file_header: |
Copyright (c) Meta Platforms, Inc. and affiliates.
All rights reserved.
This source code is licensed under the terms described in the LICENSE file in
the root directory of this source tree.
openapi:
transformations:
- command: mergeObject
reason: Better return_type using enum
args:
target:
- "$.components.schemas"
object:
ReturnType:
additionalProperties: false
properties:
type:
enum:
- string
- number
- boolean
- array
- object
- json
- union
- chat_completion_input
- completion_input
- agent_turn_input
required:
- type
type: object
- command: replaceProperties
reason: Replace return type properties with better model (see above)
args:
filter:
only:
- "$.components.schemas.ScoringFn.properties.return_type"
- "$.components.schemas.RegisterScoringFunctionRequest.properties.return_type"
value:
$ref: "#/components/schemas/ReturnType"
- command: oneOfToAnyOf
reason: Prism (mock server) doesn't like one of our requests as it technically matches multiple variants
# `readme` is used to configure the code snippets that will be rendered in the
# README.md of various SDKs. In particular, you can change the `headline`
# snippet's endpoint and the arguments to call it with.
readme:
example_requests:
default:
type: request
endpoint: post /v1/chat/completions
params: &ref_0 {}
headline:
type: request
endpoint: post /v1/models
params: *ref_0
pagination:
type: request
endpoint: post /v1/chat/completions
params: {}

File diff suppressed because it is too large Load diff

View file

@ -47,7 +47,7 @@ RUN set -eux; \
exit 1; \ exit 1; \
fi fi
RUN pip install --no-cache-dir uv RUN pip install --no-cache uv
ENV UV_SYSTEM_PYTHON=1 ENV UV_SYSTEM_PYTHON=1
ENV INSTALL_MODE=${INSTALL_MODE} ENV INSTALL_MODE=${INSTALL_MODE}
@ -72,7 +72,7 @@ RUN set -eux; \
echo "LLAMA_STACK_CLIENT_DIR is set but $LLAMA_STACK_CLIENT_DIR does not exist" >&2; \ echo "LLAMA_STACK_CLIENT_DIR is set but $LLAMA_STACK_CLIENT_DIR does not exist" >&2; \
exit 1; \ exit 1; \
fi; \ fi; \
uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"; \ uv pip install --no-cache -e "$LLAMA_STACK_CLIENT_DIR"; \
fi; fi;
# Install llama-stack # Install llama-stack
@ -88,22 +88,22 @@ RUN set -eux; \
fi; \ fi; \
if [ -n "$SAVED_UV_EXTRA_INDEX_URL" ] && [ -n "$SAVED_UV_INDEX_STRATEGY" ]; then \ if [ -n "$SAVED_UV_EXTRA_INDEX_URL" ] && [ -n "$SAVED_UV_INDEX_STRATEGY" ]; then \
UV_EXTRA_INDEX_URL="$SAVED_UV_EXTRA_INDEX_URL" UV_INDEX_STRATEGY="$SAVED_UV_INDEX_STRATEGY" \ UV_EXTRA_INDEX_URL="$SAVED_UV_EXTRA_INDEX_URL" UV_INDEX_STRATEGY="$SAVED_UV_INDEX_STRATEGY" \
uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"; \ uv pip install --no-cache -e "$LLAMA_STACK_DIR"; \
else \ else \
uv pip install --no-cache-dir -e "$LLAMA_STACK_DIR"; \ uv pip install --no-cache -e "$LLAMA_STACK_DIR"; \
fi; \ fi; \
elif [ "$INSTALL_MODE" = "test-pypi" ]; then \ elif [ "$INSTALL_MODE" = "test-pypi" ]; then \
uv pip install --no-cache-dir fastapi libcst; \ uv pip install --no-cache fastapi libcst; \
if [ -n "$TEST_PYPI_VERSION" ]; then \ if [ -n "$TEST_PYPI_VERSION" ]; then \
uv pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match "llama-stack==$TEST_PYPI_VERSION"; \ uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match "llama-stack==$TEST_PYPI_VERSION"; \
else \ else \
uv pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match llama-stack; \ uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ --index-strategy unsafe-best-match llama-stack; \
fi; \ fi; \
else \ else \
if [ -n "$PYPI_VERSION" ]; then \ if [ -n "$PYPI_VERSION" ]; then \
uv pip install --no-cache-dir "llama-stack==$PYPI_VERSION"; \ uv pip install --no-cache "llama-stack==$PYPI_VERSION"; \
else \ else \
uv pip install --no-cache-dir llama-stack; \ uv pip install --no-cache llama-stack; \
fi; \ fi; \
fi; fi;
@ -117,7 +117,7 @@ RUN set -eux; \
fi; \ fi; \
deps="$(llama stack list-deps "$DISTRO_NAME")"; \ deps="$(llama stack list-deps "$DISTRO_NAME")"; \
if [ -n "$deps" ]; then \ if [ -n "$deps" ]; then \
printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache-dir; \ printf '%s\n' "$deps" | xargs -L1 uv pip install --no-cache; \
fi fi
# Cleanup # Cleanup

View file

@ -0,0 +1,62 @@
---
title: Deprecated APIs
description: Legacy APIs that are being phased out
sidebar_label: Deprecated
sidebar_position: 1
---
# Deprecated APIs
This section contains APIs that are being phased out in favor of newer, more standardized implementations. These APIs are maintained for backward compatibility but are not recommended for new projects.
:::warning Deprecation Notice
These APIs are deprecated and will be removed in future versions. Please migrate to the recommended alternatives listed below.
:::
## Migration Guide
When using deprecated APIs, please refer to the migration guides provided for each API to understand how to transition to the supported alternatives.
## Deprecated API List
### Legacy Inference APIs
Some older inference endpoints that have been superseded by the standardized Inference API.
**Migration Path:** Use the [Inference API](../api/) instead.
### Legacy Vector Operations
Older vector database operations that have been replaced by the Vector IO API.
**Migration Path:** Use the [Vector IO API](../api/) instead.
### Legacy File Operations
Older file management endpoints that have been replaced by the Files API.
**Migration Path:** Use the [Files API](../api/) instead.
## Support Timeline
Deprecated APIs will be supported according to the following timeline:
- **Current Version**: Full support with deprecation warnings
- **Next Major Version**: Limited support with migration notices
- **Following Major Version**: Removal of deprecated APIs
## Getting Help
If you need assistance migrating from deprecated APIs:
1. Check the specific migration guides for each API
2. Review the [API Reference](../api/) for current alternatives
3. Consult the [Community Forums](https://github.com/llamastack/llama-stack/discussions) for migration support
4. Open an issue on GitHub for specific migration questions
## Contributing
If you find issues with deprecated APIs or have suggestions for improving the migration process, please contribute by:
1. Opening an issue describing the problem
2. Submitting a pull request with improvements
3. Updating migration documentation
For more information on contributing, see our [Contributing Guide](../contributing/).

View file

@ -0,0 +1,128 @@
---
title: Experimental APIs
description: APIs in development with limited support
sidebar_label: Experimental
sidebar_position: 1
---
# Experimental APIs
This section contains APIs that are currently in development and may have limited support or stability. These APIs are available for testing and feedback but should not be used in production environments.
:::warning Experimental Notice
These APIs are experimental and may change without notice. Use with caution and provide feedback to help improve them.
:::
## Current Experimental APIs
### Batch Inference API
Run inference on a dataset of inputs in batch mode for improved efficiency.
**Status:** In Development
**Provider Support:** Limited
**Use Case:** Large-scale inference operations
**Features:**
- Batch processing of multiple inputs
- Optimized resource utilization
- Progress tracking and monitoring
### Batch Agents API
Run agentic workflows on a dataset of inputs in batch mode.
**Status:** In Development
**Provider Support:** Limited
**Use Case:** Large-scale agent operations
**Features:**
- Batch agent execution
- Parallel processing capabilities
- Result aggregation and analysis
### Synthetic Data Generation API
Generate synthetic data for model development and testing.
**Status:** Early Development
**Provider Support:** Very Limited
**Use Case:** Training data augmentation
**Features:**
- Automated data generation
- Quality control mechanisms
- Customizable generation parameters
### Batches API (OpenAI-compatible)
OpenAI-compatible batch management for inference operations.
**Status:** In Development
**Provider Support:** Limited
**Use Case:** OpenAI batch processing compatibility
**Features:**
- OpenAI batch API compatibility
- Job scheduling and management
- Status tracking and monitoring
## Getting Started with Experimental APIs
### Prerequisites
- Llama Stack server running with experimental features enabled
- Appropriate provider configurations
- Understanding of API limitations
### Configuration
Experimental APIs may require special configuration flags or provider settings. Check the specific API documentation for setup requirements.
### Usage Guidelines
1. **Testing Only**: Use experimental APIs for testing and development only
2. **Monitor Changes**: Watch for updates and breaking changes
3. **Provide Feedback**: Report issues and suggest improvements
4. **Backup Data**: Always back up important data when using experimental features
## Feedback and Contribution
We encourage feedback on experimental APIs to help improve them:
### Reporting Issues
- Use GitHub issues with the "experimental" label
- Include detailed error messages and reproduction steps
- Specify the API version and provider being used
### Feature Requests
- Submit feature requests through GitHub discussions
- Provide use cases and expected behavior
- Consider contributing implementations
### Testing
- Test experimental APIs in your environment
- Report performance issues and optimization opportunities
- Share success stories and use cases
## Migration to Stable APIs
As experimental APIs mature, they will be moved to the stable API section. When this happens:
1. **Announcement**: We'll announce the promotion in release notes
2. **Migration Guide**: Detailed migration instructions will be provided
3. **Deprecation Timeline**: Experimental versions will be deprecated with notice
4. **Support**: Full support will be available for stable versions
## Provider Support
Experimental APIs may have limited provider support. Check the specific API documentation for:
- Supported providers
- Configuration requirements
- Known limitations
- Performance characteristics
## Roadmap
Experimental APIs are part of our ongoing development roadmap:
- **Q1 2024**: Batch Inference API stabilization
- **Q2 2024**: Batch Agents API improvements
- **Q3 2024**: Synthetic Data Generation API expansion
- **Q4 2024**: Batches API full OpenAI compatibility
For the latest updates, follow our [GitHub releases](https://github.com/llamastack/llama-stack/releases) and [roadmap discussions](https://github.com/llamastack/llama-stack/discussions).

View file

@ -0,0 +1,287 @@
---
title: OpenAI API Compatibility
description: OpenAI-compatible APIs and features in Llama Stack
sidebar_label: OpenAI Compatibility
sidebar_position: 1
---
# OpenAI API Compatibility
Llama Stack provides comprehensive OpenAI API compatibility, allowing you to use existing OpenAI API clients and tools with Llama Stack providers. This compatibility layer ensures seamless migration and interoperability.
## Overview
OpenAI API compatibility in Llama Stack includes:
- **OpenAI-compatible endpoints** for all major APIs
- **Request/response format compatibility** with OpenAI standards
- **Authentication and authorization** using OpenAI-style API keys
- **Error handling** with OpenAI-compatible error codes and messages
- **Rate limiting** and usage tracking compatible with OpenAI patterns
## Supported OpenAI APIs
### Chat Completions API
OpenAI-compatible chat completions for conversational AI applications.
**Endpoint:** `/v1/chat/completions`
**Compatibility:** Full OpenAI API compatibility
**Providers:** All inference providers
**Features:**
- Message-based conversations
- System prompts and user messages
- Function calling support
- Streaming responses
- Temperature and other parameter controls
### Completions API
OpenAI-compatible text completions for general text generation.
**Endpoint:** `/v1/completions`
**Compatibility:** Full OpenAI API compatibility
**Providers:** All inference providers
**Features:**
- Text completion generation
- Prompt engineering support
- Customizable parameters
- Batch processing capabilities
### Embeddings API
OpenAI-compatible embeddings for vector operations.
**Endpoint:** `/v1/embeddings`
**Compatibility:** Full OpenAI API compatibility
**Providers:** All embedding providers
**Features:**
- Text embedding generation
- Multiple embedding models
- Batch embedding processing
- Vector similarity operations
### Files API
OpenAI-compatible file management for document processing.
**Endpoint:** `/v1/files`
**Compatibility:** Full OpenAI API compatibility
**Providers:** Local Filesystem, S3
**Features:**
- File upload and management
- Document processing
- File metadata tracking
- Secure file access
### Vector Store Files API
OpenAI-compatible vector store file operations for RAG applications.
**Endpoint:** `/v1/vector_stores/{vector_store_id}/files`
**Compatibility:** Full OpenAI API compatibility
**Providers:** FAISS, SQLite-vec, Milvus, ChromaDB, Qdrant, Weaviate, Postgres (PGVector)
**Features:**
- Automatic document processing
- Vector store integration
- File chunking and indexing
- Search and retrieval operations
### Batches API
OpenAI-compatible batch processing for large-scale operations.
**Endpoint:** `/v1/batches`
**Compatibility:** OpenAI API compatibility (experimental)
**Providers:** Limited support
**Features:**
- Batch job creation and management
- Progress tracking
- Result retrieval
- Error handling
## Migration from OpenAI
### Step 1: Update API Endpoint
Change your API endpoint from OpenAI to your Llama Stack server:
```python
# Before (OpenAI)
import openai
client = openai.OpenAI(api_key="your-openai-key")
# After (Llama Stack)
import openai
client = openai.OpenAI(
api_key="your-llama-stack-key",
base_url="http://localhost:8321/v1" # Your Llama Stack server (8321 is the default port)
)
```
### Step 2: Configure Providers
Set up your preferred providers in the Llama Stack configuration:
```yaml
# stack-config.yaml
inference:
providers:
- name: "meta-reference"
type: "inline"
model: "llama-3.1-8b"
```
### Step 3: Test Compatibility
Verify that your existing code works with Llama Stack:
```python
# Test chat completions
response = client.chat.completions.create(
model="llama-3.1-8b",
messages=[
{"role": "user", "content": "Hello, world!"}
]
)
print(response.choices[0].message.content)
```
## Provider-Specific Features
### Meta Reference Provider
- Full OpenAI API compatibility
- Local model execution
- Custom model support
### Remote Providers
- OpenAI API compatibility
- Cloud-based execution
- Scalable infrastructure
### Vector Store Providers
- OpenAI vector store API compatibility
- Automatic document processing
- Advanced search capabilities
## Authentication
Llama Stack supports OpenAI-style authentication:
### API Key Authentication
```python
client = openai.OpenAI(
api_key="your-api-key",
base_url="http://localhost:8321/v1"
)
```
### Environment Variables
```bash
export OPENAI_API_KEY="your-api-key"
export OPENAI_BASE_URL="http://localhost:8321/v1"
```
## Error Handling
Llama Stack provides OpenAI-compatible error responses:
```python
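try:
    response = client.chat.completions.create(...)
# Catch the most specific errors first: RateLimitError and APIConnectionError
# are subclasses of APIError, so a leading `except openai.APIError` would
# swallow them and the handlers below it would never run.
except openai.RateLimitError as e:
    print(f"Rate Limit Error: {e}")
except openai.APIConnectionError as e:
    print(f"Connection Error: {e}")
except openai.APIError as e:
    print(f"API Error: {e}")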
## Rate Limiting
OpenAI-compatible rate limiting is supported:
- **Requests per minute** limits
- **Tokens per minute** limits
- **Concurrent request** limits
- **Usage tracking** and monitoring
## Monitoring and Observability
Track your API usage with OpenAI-compatible monitoring:
- **Request/response logging**
- **Usage metrics** and analytics
- **Performance monitoring**
- **Error tracking** and alerting
## Best Practices
### 1. Provider Selection
Choose providers based on your requirements:
- **Local development**: Meta Reference, Ollama
- **Production**: Cloud providers (Fireworks, Together, NVIDIA)
- **Specialized use cases**: Custom providers
### 2. Model Configuration
Configure models for optimal performance:
- **Model selection** based on task requirements
- **Parameter tuning** for specific use cases
- **Resource allocation** for performance
### 3. Error Handling
Implement robust error handling:
- **Retry logic** for transient failures
- **Fallback providers** for high availability
- **Monitoring** and alerting for issues
### 4. Security
Follow security best practices:
- **API key management** and rotation
- **Access control** and authorization
- **Data privacy** and compliance
## Implementation Examples
For detailed code examples and implementation guides, see our [OpenAI Implementation Guide](../providers/openai.mdx).
## Known Limitations
### Responses API Limitations
The Responses API is still in active development. For detailed information about current limitations and implementation status, see our [OpenAI Responses API Limitations](../providers/openai_responses_limitations.mdx).
## Troubleshooting
### Common Issues
**Connection Errors**
- Verify server is running
- Check network connectivity
- Validate API endpoint URL
**Authentication Errors**
- Verify API key is correct
- Check key permissions
- Ensure proper authentication headers
**Model Errors**
- Verify model is available
- Check provider configuration
- Validate model parameters
### Getting Help
For OpenAI compatibility issues:
1. **Check Documentation**: Review provider-specific documentation
2. **Community Support**: Ask questions in GitHub discussions
3. **Issue Reporting**: Open GitHub issues for bugs
4. **Professional Support**: Contact support for enterprise issues
## Roadmap
Upcoming OpenAI compatibility features:
- **Enhanced batch processing** support
- **Advanced function calling** capabilities
- **Improved error handling** and diagnostics
- **Performance optimizations** for large-scale deployments
For the latest updates, follow our [GitHub releases](https://github.com/llamastack/llama-stack/releases) and [roadmap discussions](https://github.com/llamastack/llama-stack/discussions).

144
docs/docs/api/index.mdx Normal file
View file

@ -0,0 +1,144 @@
---
title: API Reference
description: Complete reference for Llama Stack APIs
sidebar_label: Overview
sidebar_position: 1
---
# API Reference
Llama Stack provides a comprehensive set of APIs for building generative AI applications. All APIs follow OpenAI-compatible standards and can be used interchangeably across different providers.
## Core APIs
### Inference API
Run inference with Large Language Models (LLMs) and embedding models.
**Supported Providers:**
- Meta Reference (Single Node)
- Ollama (Single Node)
- Fireworks (Hosted)
- Together (Hosted)
- NVIDIA NIM (Hosted and Single Node)
- vLLM (Hosted and Single Node)
- TGI (Hosted and Single Node)
- AWS Bedrock (Hosted)
- Cerebras (Hosted)
- Groq (Hosted)
- SambaNova (Hosted)
- PyTorch ExecuTorch (On-device iOS, Android)
- OpenAI (Hosted)
- Anthropic (Hosted)
- Gemini (Hosted)
- WatsonX (Hosted)
### Agents API
Run multi-step agentic workflows with LLMs, including tool usage, memory (RAG), and complex reasoning.
**Supported Providers:**
- Meta Reference (Single Node)
- Fireworks (Hosted)
- Together (Hosted)
- PyTorch ExecuTorch (On-device iOS)
### Vector IO API
Perform operations on vector stores, including adding documents, searching, and deleting documents.
**Supported Providers:**
- FAISS (Single Node)
- SQLite-vec (Single Node)
- Chroma (Hosted and Single Node)
- Milvus (Hosted and Single Node)
- Postgres (PGVector) (Hosted and Single Node)
- Weaviate (Hosted)
- Qdrant (Hosted and Single Node)
### Files API (OpenAI-compatible)
Manage file uploads, storage, and retrieval with OpenAI-compatible endpoints.
**Supported Providers:**
- Local Filesystem (Single Node)
- S3 (Hosted)
### Vector Store Files API (OpenAI-compatible)
Integrate file operations with vector stores for automatic document processing and search.
**Supported Providers:**
- FAISS (Single Node)
- SQLite-vec (Single Node)
- Milvus (Single Node)
- ChromaDB (Hosted and Single Node)
- Qdrant (Hosted and Single Node)
- Weaviate (Hosted)
- Postgres (PGVector) (Hosted and Single Node)
### Safety API
Apply safety policies to outputs at a systems level, not just model level.
**Supported Providers:**
- Llama Guard (Depends on Inference Provider)
- Prompt Guard (Single Node)
- Code Scanner (Single Node)
- AWS Bedrock (Hosted)
### Post Training API
Fine-tune models for specific use cases and domains.
**Supported Providers:**
- Meta Reference (Single Node)
- HuggingFace (Single Node)
- TorchTune (Single Node)
- NVIDIA NeMo (Hosted)
### Eval API
Generate outputs and perform scoring to evaluate system performance.
**Supported Providers:**
- Meta Reference (Single Node)
- NVIDIA NeMo (Hosted)
### Telemetry API
Collect telemetry data from the system for monitoring and observability.
**Supported Providers:**
- Meta Reference (Single Node)
### Tool Runtime API
Interact with various tools and protocols to extend LLM capabilities.
**Supported Providers:**
- Brave Search (Hosted)
- RAG Runtime (Single Node)
## API Compatibility
All Llama Stack APIs are designed to be OpenAI-compatible, allowing you to:
- Use existing OpenAI API clients and tools
- Migrate from OpenAI to other providers seamlessly
- Maintain consistent API contracts across different environments
## Getting Started
To get started with Llama Stack APIs:
1. **Choose a Distribution**: Select a pre-configured distribution that matches your environment
2. **Configure Providers**: Set up the providers you want to use for each API
3. **Start the Server**: Launch the Llama Stack server with your configuration
4. **Use the APIs**: Make requests to the API endpoints using your preferred client
For detailed setup instructions, see our [Getting Started Guide](../getting_started/quickstart).
## Provider Details
For complete provider compatibility and setup instructions, see our [Providers Documentation](../providers/).
## API Stability
Llama Stack APIs are organized by stability level:
- **[Stable APIs](./index.mdx)** - Production-ready APIs with full support
- **[Experimental APIs](../api-experimental/)** - APIs in development with limited support
- **[Deprecated APIs](../api-deprecated/)** - Legacy APIs being phased out
## OpenAI Integration
For specific OpenAI API compatibility features, see our [OpenAI Compatibility Guide](../api-openai/).

View file

@ -35,9 +35,6 @@ Here are the key topics that will help you build effective AI applications:
- **[Telemetry](./telemetry.mdx)** - Monitor and analyze your agents' performance and behavior - **[Telemetry](./telemetry.mdx)** - Monitor and analyze your agents' performance and behavior
- **[Safety](./safety.mdx)** - Implement guardrails and safety measures to ensure responsible AI behavior - **[Safety](./safety.mdx)** - Implement guardrails and safety measures to ensure responsible AI behavior
### 🎮 **Interactive Development**
- **[Playground](./playground.mdx)** - Interactive environment for testing and developing applications
## Application Patterns ## Application Patterns
### 🤖 **Conversational Agents** ### 🤖 **Conversational Agents**

View file

@ -1,298 +0,0 @@
---
title: Llama Stack Playground
description: Interactive interface to explore and experiment with Llama Stack capabilities
sidebar_label: Playground
sidebar_position: 10
---
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Llama Stack Playground
:::note[Experimental Feature]
The Llama Stack Playground is currently experimental and subject to change. We welcome feedback and contributions to help improve it.
:::
The Llama Stack Playground is a simple interface that aims to:
- **Showcase capabilities and concepts** of Llama Stack in an interactive environment
- **Demo end-to-end application code** to help users get started building their own applications
- **Provide a UI** to help users inspect and understand Llama Stack API providers and resources
## Key Features
### Interactive Playground Pages
The playground provides interactive pages for users to explore Llama Stack API capabilities:
#### Chatbot Interface
<video
controls
autoPlay
playsInline
muted
loop
style={{width: '100%'}}
>
<source src="https://github.com/user-attachments/assets/8d2ef802-5812-4a28-96e1-316038c84cbf" type="video/mp4" />
Your browser does not support the video tag.
</video>
<Tabs>
<TabItem value="chat" label="Chat">
**Simple Chat Interface**
- Chat directly with Llama models through an intuitive interface
- Uses the `/chat/completions` streaming API under the hood
- Real-time message streaming for responsive interactions
- Perfect for testing model capabilities and prompt engineering
</TabItem>
<TabItem value="rag" label="RAG Chat">
**Document-Aware Conversations**
- Upload documents to create memory banks
- Chat with a RAG-enabled agent that can query your documents
- Uses Llama Stack's `/agents` API to create and manage RAG sessions
- Ideal for exploring knowledge-enhanced AI applications
</TabItem>
</Tabs>
#### Evaluation Interface
<video
controls
autoPlay
playsInline
muted
loop
style={{width: '100%'}}
>
<source src="https://github.com/user-attachments/assets/6cc1659f-eba4-49ca-a0a5-7c243557b4f5" type="video/mp4" />
Your browser does not support the video tag.
</video>
<Tabs>
<TabItem value="scoring" label="Scoring Evaluations">
**Custom Dataset Evaluation**
- Upload your own evaluation datasets
- Run evaluations using available scoring functions
- Uses Llama Stack's `/scoring` API for flexible evaluation workflows
- Great for testing application performance on custom metrics
</TabItem>
<TabItem value="benchmarks" label="Benchmark Evaluations">
<video
controls
autoPlay
playsInline
muted
loop
style={{width: '100%', marginBottom: '1rem'}}
>
<source src="https://github.com/user-attachments/assets/345845c7-2a2b-4095-960a-9ae40f6a93cf" type="video/mp4" />
Your browser does not support the video tag.
</video>
**Pre-registered Evaluation Tasks**
- Evaluate models or agents on pre-defined tasks
- Uses Llama Stack's `/eval` API for comprehensive evaluation
- Combines datasets and scoring functions for standardized testing
**Setup Requirements:**
Register evaluation datasets and benchmarks first:
```bash
# Register evaluation dataset
llama-stack-client datasets register \
--dataset-id "mmlu" \
--provider-id "huggingface" \
--url "https://huggingface.co/datasets/llamastack/evals" \
--metadata '{"path": "llamastack/evals", "name": "evals__mmlu__details", "split": "train"}' \
--schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string"}, "chat_completion_input": {"type": "string"}}'
# Register benchmark task
llama-stack-client benchmarks register \
--eval-task-id meta-reference-mmlu \
--provider-id meta-reference \
--dataset-id mmlu \
--scoring-functions basic::regex_parser_multiple_choice_answer
```
</TabItem>
</Tabs>
#### Inspection Interface
<video
controls
autoPlay
playsInline
muted
loop
style={{width: '100%'}}
>
<source src="https://github.com/user-attachments/assets/01d52b2d-92af-4e3a-b623-a9b8ba22ba99" type="video/mp4" />
Your browser does not support the video tag.
</video>
<Tabs>
<TabItem value="providers" label="API Providers">
**Provider Management**
- Inspect available Llama Stack API providers
- View provider configurations and capabilities
- Uses the `/providers` API for real-time provider information
- Essential for understanding your deployment's capabilities
</TabItem>
<TabItem value="resources" label="API Resources">
**Resource Exploration**
- Inspect Llama Stack API resources including:
- **Models**: Available language models
- **Datasets**: Registered evaluation datasets
- **Memory Banks**: Vector databases and knowledge stores
- **Benchmarks**: Evaluation tasks and scoring functions
- **Shields**: Safety and content moderation tools
- Uses `/<resources>/list` APIs for comprehensive resource visibility
- For detailed information about resources, see [Core Concepts](/docs/concepts)
</TabItem>
</Tabs>
## Getting Started
### Quick Start Guide
<Tabs>
<TabItem value="setup" label="Setup">
**1. Start the Llama Stack API Server**
```bash
llama stack list-deps together | xargs -L1 uv pip install
llama stack run together
```
**2. Start the Streamlit UI**
```bash
# Launch the playground interface
uv run --with ".[ui]" streamlit run llama_stack.core/ui/app.py
```
</TabItem>
<TabItem value="usage" label="Usage Tips">
**Making the Most of the Playground:**
- **Start with Chat**: Test basic model interactions and prompt engineering
- **Explore RAG**: Upload sample documents to see knowledge-enhanced responses
- **Try Evaluations**: Use the scoring interface to understand evaluation metrics
- **Inspect Resources**: Check what providers and resources are available
- **Experiment with Settings**: Adjust parameters to see how they affect results
</TabItem>
</Tabs>
### Available Distributions
The playground works with any Llama Stack distribution. Popular options include:
<Tabs>
<TabItem value="together" label="Together AI">
```bash
llama stack list-deps together | xargs -L1 uv pip install
llama stack run together
```
**Features:**
- Cloud-hosted models
- Fast inference
- Multiple model options
</TabItem>
<TabItem value="ollama" label="Ollama (Local)">
```bash
llama stack list-deps ollama | xargs -L1 uv pip install
llama stack run ollama
```
**Features:**
- Local model execution
- Privacy-focused
- No internet required
</TabItem>
<TabItem value="meta-reference" label="Meta Reference">
```bash
llama stack list-deps meta-reference | xargs -L1 uv pip install
llama stack run meta-reference
```
**Features:**
- Reference implementation
- All API features available
- Best for development
</TabItem>
</Tabs>
## Use Cases & Examples
### Educational Use Cases
- **Learning Llama Stack**: Hands-on exploration of API capabilities
- **Prompt Engineering**: Interactive testing of different prompting strategies
- **RAG Experimentation**: Understanding how document retrieval affects responses
- **Evaluation Understanding**: See how different metrics evaluate model performance
### Development Use Cases
- **Prototype Testing**: Quick validation of application concepts
- **API Exploration**: Understanding available endpoints and parameters
- **Integration Planning**: Seeing how different components work together
- **Demo Creation**: Showcasing Llama Stack capabilities to stakeholders
### Research Use Cases
- **Model Comparison**: Side-by-side testing of different models
- **Evaluation Design**: Understanding how scoring functions work
- **Safety Testing**: Exploring shield effectiveness with different inputs
- **Performance Analysis**: Measuring model behavior across different scenarios
## Best Practices
### 🚀 **Getting Started**
- Begin with simple chat interactions to understand basic functionality
- Gradually explore more advanced features like RAG and evaluations
- Use the inspection tools to understand your deployment's capabilities
### 🔧 **Development Workflow**
- Use the playground to prototype before writing application code
- Test different parameter settings interactively
- Validate evaluation approaches before implementing them programmatically
### 📊 **Evaluation & Testing**
- Start with simple scoring functions before trying complex evaluations
- Use the playground to understand evaluation results before automation
- Test safety features with various input types
### 🎯 **Production Preparation**
- Use playground insights to inform your production API usage
- Test edge cases and error conditions interactively
- Validate resource configurations before deployment
## Related Resources
- **[Getting Started Guide](../getting_started/quickstart)** - Complete setup and introduction
- **[Core Concepts](/docs/concepts)** - Understanding Llama Stack fundamentals
- **[Agents](./agent)** - Building intelligent agents
- **[RAG (Retrieval Augmented Generation)](./rag)** - Knowledge-enhanced applications
- **[Evaluations](./evals)** - Comprehensive evaluation framework
- **[API Reference](/docs/api/llama-stack-specification)** - Complete API documentation

View file

@ -7,7 +7,7 @@ sidebar_position: 1
# APIs # APIs
A Llama Stack API is described as a collection of REST endpoints. We currently support the following APIs: A Llama Stack API is described as a collection of REST endpoints following OpenAI API standards. We currently support the following APIs:
- **Inference**: run inference with an LLM - **Inference**: run inference with an LLM
- **Safety**: apply safety policies to the output at a Systems (not only model) level - **Safety**: apply safety policies to the output at a Systems (not only model) level
@ -16,11 +16,26 @@ A Llama Stack API is described as a collection of REST endpoints. We currently s
- **Scoring**: evaluate outputs of the system - **Scoring**: evaluate outputs of the system
- **Eval**: generate outputs (via Inference or Agents) and perform scoring - **Eval**: generate outputs (via Inference or Agents) and perform scoring
- **VectorIO**: perform operations on vector stores, such as adding documents, searching, and deleting documents - **VectorIO**: perform operations on vector stores, such as adding documents, searching, and deleting documents
- **Files**: manage file uploads, storage, and retrieval
- **Telemetry**: collect telemetry data from the system
- **Post Training**: fine-tune a model - **Post Training**: fine-tune a model
- **Tool Runtime**: interact with various tools and protocols - **Tool Runtime**: interact with various tools and protocols
- **Responses**: generate responses from an LLM using this OpenAI compatible API. - **Responses**: generate responses from an LLM
We are working on adding a few more APIs to complete the application lifecycle. These will include: We are working on adding a few more APIs to complete the application lifecycle. These will include:
- **Batch Inference**: run inference on a dataset of inputs - **Batch Inference**: run inference on a dataset of inputs
- **Batch Agents**: run agents on a dataset of inputs - **Batch Agents**: run agents on a dataset of inputs
- **Batches**: OpenAI-compatible batch management for inference - **Batches**: OpenAI-compatible batch management for inference
## OpenAI API Compatibility
We are working on adding OpenAI API compatibility to Llama Stack. This will allow you to use Llama Stack with OpenAI API clients and tools.
### File Operations and Vector Store Integration
The Files API and Vector Store APIs work together through file operations, enabling automatic document processing and search. This integration implements the [OpenAI Vector Store Files API specification](https://platform.openai.com/docs/api-reference/vector-stores-files) and allows you to:
- Upload documents through the Files API
- Automatically process and chunk documents into searchable vectors
- Store processed content in vector databases based on the availability of [our providers](../../providers/index.mdx)
- Search through documents using natural language queries
For detailed information about this integration, see [File Operations and Vector Store Integration](../file_operations_vector_stores.md).

View file

@ -0,0 +1,420 @@
# File Operations and Vector Store Integration
## Overview
Llama Stack provides seamless integration between the Files API and Vector Store APIs, enabling you to upload documents and automatically process them into searchable vector embeddings. This integration implements file operations following the [OpenAI Vector Store Files API specification](https://platform.openai.com/docs/api-reference/vector-stores-files).
## Enhanced Capabilities Beyond OpenAI
While Llama Stack maintains full compatibility with OpenAI's Vector Store API, it provides several additional capabilities that enhance functionality and flexibility:
### **Embedding Model Specification**
Unlike OpenAI's vector stores which use a fixed embedding model, Llama Stack allows you to specify which embedding model to use when creating a vector store:
```python
# Create vector store with specific embedding model
vector_store = client.vector_stores.create(
name="my_documents",
embedding_model="all-MiniLM-L6-v2", # Specify your preferred model
embedding_dimension=384,
)
```
### **Advanced Search Modes**
Llama Stack supports multiple search modes beyond basic vector similarity:
- **Vector Search**: Pure semantic similarity search using embeddings
- **Keyword Search**: Traditional keyword-based search for exact matches
- **Hybrid Search**: Combines both vector and keyword search for optimal results
```python
# Different search modes
results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="machine learning algorithms",
search_mode="hybrid", # or "vector", "keyword"
max_num_results=5,
)
```
### **Flexible Ranking Options**
For hybrid search, Llama Stack offers configurable ranking strategies:
- **RRF (Reciprocal Rank Fusion)**: Combines rankings with configurable impact factor
- **Weighted Ranker**: Linear combination of vector and keyword scores with adjustable weights
```python
# Custom ranking configuration
results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="neural networks",
search_mode="hybrid",
ranking_options={
"ranker": {"type": "weighted", "alpha": 0.7} # 70% vector, 30% keyword
},
)
```
### **Provider Selection**
Choose from multiple vector store providers based on your specific needs:
- **Inline Providers**: FAISS (fast in-memory), SQLite-vec (disk-based), Milvus (high-performance)
- **Remote Providers**: ChromaDB, Qdrant, Weaviate, Postgres (PGVector), Milvus
```python
# Specify provider when creating vector store
vector_store = client.vector_stores.create(
name="my_documents", provider_id="sqlite-vec" # Choose your preferred provider
)
```
## How It Works
The file operations work through several key components:
1. **File Upload**: Documents are uploaded through the Files API
2. **Automatic Processing**: Files are automatically chunked and converted to embeddings
3. **Vector Storage**: Chunks are stored in vector databases with metadata
4. **Search & Retrieval**: Users can search through processed documents using natural language
## Supported Vector Store Providers
The following vector store providers support file operations:
### Inline Providers (Single Node)
- **FAISS**: Fast in-memory vector similarity search
- **SQLite-vec**: Disk-based storage with hybrid search capabilities
### Remote Providers (Hosted)
- **ChromaDB**: Vector database with metadata filtering
- **Weaviate**: Vector database with GraphQL interface
- **Postgres (PGVector)**: Vector extensions for PostgreSQL
### Both Inline & Remote Providers
- **Milvus**: High-performance vector database with advanced indexing
- **Qdrant**: Vector similarity search with payload filtering
## File Processing Pipeline
### 1. File Upload
```python
from llama_stack_client import AsyncLlamaStackClient
client = AsyncLlamaStackClient(base_url="http://localhost:8321")
# Upload a document
with open("document.pdf", "rb") as f:
file_info = await client.files.create(file=f, purpose="assistants")
```
### 2. Attach to Vector Store
```python
# Create a vector store
vector_store = await client.vector_stores.create(name="my_documents")
# Attach the file to the vector store
file_attach_response = await client.vector_stores.files.create(
vector_store_id=vector_store.id, file_id=file_info.id
)
```
### 3. Automatic Processing
The system automatically:
- Detects the file type and extracts text content
- Splits content into chunks (default: 800 tokens with 400 token overlap)
- Generates embeddings for each chunk
- Stores chunks with metadata in the vector store
- Updates file status to "completed"
### 4. Search and Retrieval
```python
# Search through processed documents
search_results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="What is the main topic discussed?",
max_num_results=5,
)
# Process results
for result in search_results.data:
print(f"Score: {result.score}")
for content in result.content:
print(f"Content: {content.text}")
```
## Supported File Types
The file processing pipeline supports various document formats:
- **Text Files**: `.txt`, `.md`, `.rst`
- **Documents**: `.pdf`, `.docx`, `.doc`
- **Code**: `.py`, `.js`, `.java`, `.cpp`, etc.
- **Data**: `.json`, `.csv`, `.xml`
- **Web Content**: HTML files
## Chunking Strategies
### Default Strategy
The default chunking strategy uses:
- **Max Chunk Size**: 800 tokens
- **Overlap**: 400 tokens
- **Method**: Semantic boundary detection
### Custom Chunking
You can customize chunking when attaching files:
```python
from llama_stack.apis.vector_io import VectorStoreChunkingStrategy
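# Define a custom chunking strategy (OpenAI-compatible "static" strategy)
chunking_strategy = {
    "type": "static",
    "static": {"max_chunk_size_tokens": 1000, "chunk_overlap_tokens": 200},
}
# Attach file with custom chunking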
file_attach_response = await client.vector_stores.files.create(
vector_store_id=vector_store.id,
file_id=file_info.id,
chunking_strategy=chunking_strategy,
)
```
**Note**: While Llama Stack is OpenAI-compatible, it also supports additional options beyond the standard OpenAI API. When creating vector stores, you can specify custom embedding models and embedding dimensions that will be used when processing chunks from attached files.
## File Management
### List Files in Vector Store
```python
# List all files in a vector store
files = await client.vector_stores.files.list(vector_store_id=vector_store.id)
for file in files:
print(f"File: {file.filename}, Status: {file.status}")
```
### File Status Tracking
Files go through several statuses:
- **in_progress**: File is being processed
- **completed**: File successfully processed and searchable
- **failed**: Processing failed (check `last_error` for details)
- **cancelled**: Processing was cancelled
### Retrieve File Content
```python
# Get chunked content from vector store
content_response = await client.vector_stores.files.retrieve_content(
vector_store_id=vector_store.id, file_id=file_info.id
)
for chunk in content_response.content:
print(f"Chunk {chunk.metadata.get('chunk_index', 0)}: {chunk.text}")
```
## Vector Store Management
### List Vector Stores
Retrieve a paginated list of all vector stores:
```python
# List all vector stores with default pagination
vector_stores = await client.vector_stores.list()
# Custom pagination and ordering
vector_stores = await client.vector_stores.list(
limit=10,
order="asc", # or "desc"
after="vs_12345678", # cursor-based pagination
)
for store in vector_stores.data:
print(f"Store: {store.name}, Files: {store.file_counts.total}")
print(f"Created: {store.created_at}, Status: {store.status}")
```
### Retrieve Vector Store Details
Get detailed information about a specific vector store:
```python
# Get vector store details
store_details = await client.vector_stores.retrieve(vector_store_id="vs_12345678")
print(f"Name: {store_details.name}")
print(f"Status: {store_details.status}")
print(f"File Counts: {store_details.file_counts}")
print(f"Usage: {store_details.usage_bytes} bytes")
print(f"Created: {store_details.created_at}")
print(f"Metadata: {store_details.metadata}")
```
### Update Vector Store
Modify vector store properties such as name, metadata, or expiration settings:
```python
# Update vector store name and metadata
updated_store = await client.vector_stores.update(
vector_store_id="vs_12345678",
name="Updated Document Collection",
metadata={
"description": "Updated collection for research",
"category": "research",
"version": "2.0",
},
)
# Set expiration policy
expired_store = await client.vector_stores.update(
vector_store_id="vs_12345678",
expires_after={"anchor": "last_active_at", "days": 30},
)
print(f"Updated store: {updated_store.name}")
print(f"Last active: {updated_store.last_active_at}")
```
### Delete Vector Store
Remove a vector store and all its associated data:
```python
# Delete a vector store
delete_response = await client.vector_stores.delete(vector_store_id="vs_12345678")
if delete_response.deleted:
print(f"Vector store {delete_response.id} successfully deleted")
else:
print("Failed to delete vector store")
```
**Important Notes:**
- Deleting a vector store removes all files, chunks, and embeddings
- This operation cannot be undone
- The underlying vector database is also cleaned up
- Consider backing up important data before deletion
## Search Capabilities
### Vector Search
Pure similarity search using embeddings:
```python
results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="machine learning algorithms",
max_num_results=10,
)
```
### Filtered Search
Combine vector search with metadata filtering:
```python
results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="machine learning algorithms",
filters={"file_type": "pdf", "upload_date": "2024-01-01"},
max_num_results=10,
)
```
### Hybrid Search
[SQLite-vec](../providers/vector_io/inline_sqlite-vec.mdx), [pgvector](../providers/vector_io/remote_pgvector.mdx), and [Milvus](../providers/vector_io/inline_milvus.mdx) support combining vector and keyword search.
## Performance Considerations
> **Note**: For detailed performance optimization strategies, see [Performance Considerations](../providers/files/openai_file_operations_support.md#performance-considerations) in the provider documentation.
**Key Points:**
- **Chunk Size**: 400-600 tokens for precision, 800-1200 for context
- **Storage**: Choose provider based on your performance needs
- **Search**: Optimize for your specific use case
## Error Handling
> **Note**: For comprehensive troubleshooting and error handling, see [Troubleshooting](../providers/files/openai_file_operations_support.md#troubleshooting) in the provider documentation.
**Common Issues:**
- File processing failures (format, size limits)
- Search performance optimization
- Storage and memory issues
## Best Practices
> **Note**: For detailed best practices and recommendations, see [Best Practices](../providers/files/openai_file_operations_support.md#best-practices) in the provider documentation.
**Key Recommendations:**
- File organization and naming conventions
- Chunking strategy optimization
- Metadata and monitoring practices
- Regular cleanup and maintenance
## Integration Examples
### RAG Application
```python
# Build a RAG system with file uploads
async def build_rag_system():
# Create vector store
vector_store = await client.vector_stores.create(name="knowledge_base")
# Upload and process documents
documents = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
for doc in documents:
with open(doc, "rb") as f:
file_info = await client.files.create(file=f, purpose="assistants")
await client.vector_stores.files.create(
vector_store_id=vector_store.id, file_id=file_info.id
)
return vector_store
# Query the RAG system
async def query_rag(vector_store_id, question):
results = await client.vector_stores.search(
vector_store_id=vector_store_id, query=question, max_num_results=5
)
return results
```
### Document Analysis
```python
# Analyze document content through vector search
async def analyze_document(vector_store_id, file_id):
# Get document content
content = await client.vector_stores.files.retrieve_content(
vector_store_id=vector_store_id, file_id=file_id
)
# Search for specific topics
topics = ["introduction", "methodology", "conclusion"]
analysis = {}
for topic in topics:
results = await client.vector_stores.search(
vector_store_id=vector_store_id, query=topic, max_num_results=3
)
analysis[topic] = results.data
return analysis
```
## Next Steps
- Explore the [Files API documentation](../providers/files/files.mdx) for detailed API reference
- Check [Vector Store Providers](../providers/vector_io/index.mdx) for specific implementation details
- Review [Getting Started](../getting_started/quickstart.mdx) for quick setup instructions

View file

@ -10,7 +10,7 @@ import TabItem from '@theme/TabItem';
# Kubernetes Deployment Guide # Kubernetes Deployment Guide
Deploy Llama Stack and vLLM servers in a Kubernetes cluster instead of running them locally. This guide covers both local development with Kind and production deployment on AWS EKS. Deploy Llama Stack and vLLM servers in a Kubernetes cluster instead of running them locally. This guide covers deployment using the Kubernetes operator to manage the Llama Stack server with Kind. The vLLM inference server is deployed manually.
## Prerequisites ## Prerequisites
@ -110,115 +110,176 @@ spec:
EOF EOF
``` ```
### Step 3: Configure Llama Stack ### Step 3: Install Kubernetes Operator
Update your run configuration: Install the Llama Stack Kubernetes operator to manage Llama Stack deployments:
```yaml
providers:
inference:
- provider_id: vllm
provider_type: remote::vllm
config:
url: http://vllm-server.default.svc.cluster.local:8000/v1
max_tokens: 4096
api_token: fake
```
Build container image:
```bash ```bash
tmp_dir=$(mktemp -d) && cat >$tmp_dir/Containerfile.llama-stack-run-k8s <<EOF # Install from the latest main branch
FROM distribution-myenv:dev kubectl apply -f https://raw.githubusercontent.com/llamastack/llama-stack-k8s-operator/main/release/operator.yaml
RUN apt-get update && apt-get install -y git
RUN git clone https://github.com/meta-llama/llama-stack.git /app/llama-stack-source # Or install a specific version (e.g., v0.4.0)
ADD ./vllm-llama-stack-run-k8s.yaml /app/config.yaml # kubectl apply -f https://raw.githubusercontent.com/llamastack/llama-stack-k8s-operator/v0.4.0/release/operator.yaml
EOF
podman build -f $tmp_dir/Containerfile.llama-stack-run-k8s -t llama-stack-run-k8s $tmp_dir
``` ```
### Step 4: Deploy Llama Stack Server Verify the operator is running:
```bash
kubectl get pods -n llama-stack-operator-system
```
For more information about the operator, see the [llama-stack-k8s-operator repository](https://github.com/llamastack/llama-stack-k8s-operator).
### Step 4: Deploy Llama Stack Server using Operator
Create a `LlamaStackDistribution` custom resource to deploy the Llama Stack server. The operator will automatically create the necessary Deployment, Service, and other resources.
You can optionally override the default `run.yaml` using `spec.server.userConfig` with a ConfigMap (see [userConfig spec](https://github.com/llamastack/llama-stack-k8s-operator/blob/main/docs/api-overview.md#userconfigspec)).
```yaml ```yaml
cat <<EOF | kubectl apply -f - cat <<EOF | kubectl apply -f -
apiVersion: v1 apiVersion: llamastack.io/v1alpha1
kind: PersistentVolumeClaim kind: LlamaStackDistribution
metadata: metadata:
name: llama-pvc name: llamastack-vllm
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llama-stack-server
spec: spec:
replicas: 1 replicas: 1
selector: server:
matchLabels: distribution:
app.kubernetes.io/name: llama-stack name: starter
template: containerSpec:
metadata: port: 8321
labels: env:
app.kubernetes.io/name: llama-stack - name: VLLM_URL
spec: value: "http://vllm-server.default.svc.cluster.local:8000/v1"
containers: - name: VLLM_MAX_TOKENS
- name: llama-stack value: "4096"
image: localhost/llama-stack-run-k8s:latest - name: VLLM_API_TOKEN
imagePullPolicy: IfNotPresent value: "fake"
command: ["llama", "stack", "run", "/app/config.yaml"] # Optional: override run.yaml from a ConfigMap using userConfig
ports: userConfig:
- containerPort: 5000 configMap:
volumeMounts: name: llama-stack-config
- name: llama-storage storage:
mountPath: /root/.llama size: "20Gi"
volumes: mountPath: "/home/lls/.lls"
- name: llama-storage
persistentVolumeClaim:
claimName: llama-pvc
---
apiVersion: v1
kind: Service
metadata:
name: llama-stack-service
spec:
selector:
app.kubernetes.io/name: llama-stack
ports:
- protocol: TCP
port: 5000
targetPort: 5000
type: ClusterIP
EOF EOF
``` ```
**Configuration Options:**
- `replicas`: Number of Llama Stack server instances to run
- `server.distribution.name`: The distribution to use (e.g., `starter` for the starter distribution). See the [list of supported distributions](https://github.com/llamastack/llama-stack-k8s-operator/blob/main/distributions.json) in the operator repository.
- `server.distribution.image`: (Optional) Custom container image for non-supported distributions. Use this field when deploying a distribution that is not in the supported list. If specified, this takes precedence over `name`.
- `server.containerSpec.port`: Port on which the Llama Stack server listens (default: 8321)
- `server.containerSpec.env`: Environment variables to configure providers (the example above sets `VLLM_URL`, `VLLM_MAX_TOKENS`, and `VLLM_API_TOKEN` for the vLLM provider)
- `server.userConfig`: (Optional) Override the default `run.yaml` using a ConfigMap. See [userConfig spec](https://github.com/llamastack/llama-stack-k8s-operator/blob/main/docs/api-overview.md#userconfigspec).
- `server.storage.size`: Size of the persistent volume for model and data storage
- `server.storage.mountPath`: Where to mount the storage in the container
**Note:** For a complete list of supported distributions, see [distributions.json](https://github.com/llamastack/llama-stack-k8s-operator/blob/main/distributions.json) in the operator repository. To use a custom or non-supported distribution, set the `server.distribution.image` field with your container image instead of `server.distribution.name`.
The operator automatically creates:
- A Deployment for the Llama Stack server
- A Service to access the server
- A PersistentVolumeClaim for storage
- All necessary RBAC resources
Check the status of your deployment:
```bash
kubectl get llamastackdistribution
kubectl describe llamastackdistribution llamastack-vllm
```
### Step 5: Test Deployment ### Step 5: Test Deployment
Wait for the Llama Stack server pod to be ready:
```bash ```bash
# Port forward and test # Check the status of the LlamaStackDistribution
kubectl port-forward service/llama-stack-service 5000:5000 kubectl get llamastackdistribution llamastack-vllm
llama-stack-client --endpoint http://localhost:5000 inference chat-completion --message "hello, what model are you?"
# Check the pods created by the operator
kubectl get pods -l app.kubernetes.io/name=llama-stack
# Wait for the pod to be ready
kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=llama-stack --timeout=300s
```
Get the service name created by the operator (it typically follows the pattern `<llamastackdistribution-name>-service`):
```bash
# List services to find the service name
kubectl get services | grep llamastack
# Port forward and test (replace llamastack-vllm-service with the actual service name if it differs)
kubectl port-forward service/llamastack-vllm-service 8321:8321
```
In another terminal, test the deployment:
```bash
llama-stack-client --endpoint http://localhost:8321 inference chat-completion --message "hello, what model are you?"
``` ```
## Troubleshooting ## Troubleshooting
**Check pod status:** ### vLLM Server Issues
**Check vLLM pod status:**
```bash ```bash
kubectl get pods -l app.kubernetes.io/name=vllm kubectl get pods -l app.kubernetes.io/name=vllm
kubectl logs -l app.kubernetes.io/name=vllm kubectl logs -l app.kubernetes.io/name=vllm
``` ```
**Test service connectivity:** **Test vLLM service connectivity:**
```bash ```bash
kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- curl http://vllm-server:8000/v1/models kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- curl http://vllm-server:8000/v1/models
``` ```
### Llama Stack Server Issues
**Check LlamaStackDistribution status:**
```bash
# Get detailed status
kubectl describe llamastackdistribution llamastack-vllm
# Check for events
kubectl get events --sort-by='.lastTimestamp' | grep llamastack-vllm
```
**Check operator-managed pods:**
```bash
# List all pods managed by the operator
kubectl get pods -l app.kubernetes.io/name=llama-stack
# Check pod logs (pods are selected by label, so no pod name is needed)
kubectl logs -l app.kubernetes.io/name=llama-stack
```
**Check operator status:**
```bash
# Verify the operator is running
kubectl get pods -n llama-stack-operator-system
# Check operator logs if issues persist
kubectl logs -n llama-stack-operator-system -l control-plane=controller-manager
```
**Verify service connectivity:**
```bash
# Get the service endpoint
kubectl get svc llamastack-vllm-service
# Test connectivity from within the cluster
kubectl run -it --rm debug --image=curlimages/curl --restart=Never -- curl http://llamastack-vllm-service:8321/health
```
## Related Resources ## Related Resources
- **[Deployment Overview](/docs/deploying/)** - Overview of deployment options - **[Deployment Overview](/docs/deploying/)** - Overview of deployment options
- **[Distributions](/docs/distributions)** - Understanding Llama Stack distributions - **[Distributions](/docs/distributions)** - Understanding Llama Stack distributions
- **[Configuration](/docs/distributions/configuration)** - Detailed configuration options - **[Configuration](/docs/distributions/configuration)** - Detailed configuration options
- **[LlamaStack Operator](https://github.com/llamastack/llama-stack-k8s-operator)** - Overview of llama-stack kubernetes operator
- **[LlamaStackDistribution](https://github.com/llamastack/llama-stack-k8s-operator/blob/main/docs/api-overview.md)** - API Spec of the llama-stack operator Custom Resource.

View file

@ -221,7 +221,15 @@ models:
``` ```
A Model is an instance of a "Resource" (see [Concepts](../concepts/)) and is associated with a specific inference provider (in this case, the provider with identifier `ollama`). This is an instance of a "pre-registered" model. While we always encourage the clients to register models before using them, some Stack servers may come up a list of "already known and available" models. A Model is an instance of a "Resource" (see [Concepts](../concepts/)) and is associated with a specific inference provider (in this case, the provider with identifier `ollama`). This is an instance of a "pre-registered" model. While we always encourage the clients to register models before using them, some Stack servers may come up a list of "already known and available" models.
What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. Contrast it with `model_id` which is the identifier for the same model for Llama Stack's purposes. For example, you may want to name "llama3.2:vision-11b" as "image_captioning_model" when you use it in your Stack interactions. When omitted, the server will set `provider_model_id` to be the same as `model_id`. What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. The `model_id` field is provided for configuration purposes but is not used as part of the model identifier.
**Important:** Models are identified as `provider_id/provider_model_id` in the system and when making API calls. When `provider_model_id` is omitted, the server will set it to be the same as `model_id`.
Examples:
- Config: `model_id: llama3.2`, `provider_id: ollama`, `provider_model_id: null`
→ Access as: `ollama/llama3.2`
- Config: `model_id: my-llama`, `provider_id: vllm-inference`, `provider_model_id: llama-3-2-3b`
→ Access as: `vllm-inference/llama-3-2-3b` (the `model_id` is not used in the identifier)
If you need to conditionally register a model in the configuration, such as only when specific environment variable(s) are set, this can be accomplished by utilizing a special `__disabled__` string as the default value of an environment variable substitution, as shown below: If you need to conditionally register a model in the configuration, such as only when specific environment variable(s) are set, this can be accomplished by utilizing a special `__disabled__` string as the default value of an environment variable substitution, as shown below:

View file

@ -11,7 +11,7 @@ If you are planning to use an external service for Inference (even Ollama or TGI
This avoids the overhead of setting up a server. This avoids the overhead of setting up a server.
```bash ```bash
# setup # setup
uv pip install llama-stack uv pip install llama-stack llama-stack-client
llama stack list-deps starter | xargs -L1 uv pip install llama stack list-deps starter | xargs -L1 uv pip install
``` ```

View file

@ -19,3 +19,4 @@ This section provides an overview of the distributions available in Llama Stack.
- **[Starting Llama Stack Server](./starting_llama_stack_server.mdx)** - How to run distributions - **[Starting Llama Stack Server](./starting_llama_stack_server.mdx)** - How to run distributions
- **[Importing as Library](./importing_as_library.mdx)** - Use distributions in your code - **[Importing as Library](./importing_as_library.mdx)** - Use distributions in your code
- **[Configuration Reference](./configuration.mdx)** - Configuration file format details - **[Configuration Reference](./configuration.mdx)** - Configuration file format details
- **[Llama Stack UI](./llama_stack_ui.mdx)** - Web-based user interface for interacting with Llama Stack servers

View file

@ -44,7 +44,7 @@ spec:
# Navigate to the UI directory # Navigate to the UI directory
echo "Navigating to UI directory..." echo "Navigating to UI directory..."
cd /app/llama_stack/ui cd /app/llama_stack_ui
# Check if package.json exists # Check if package.json exists
if [ ! -f "package.json" ]; then if [ ! -f "package.json" ]; then

View file

@ -0,0 +1,109 @@
---
title: Llama Stack UI
description: Web-based user interface for interacting with Llama Stack servers
sidebar_label: Llama Stack UI
sidebar_position: 8
---
# Llama Stack UI
The Llama Stack UI is a web-based interface for interacting with Llama Stack servers. Built with Next.js and React, it provides a visual way to work with agents, manage resources, and view logs.
## Features
- **Logs & Monitoring**: View chat completions, agent responses, and vector store activity
- **Vector Stores**: Create and manage vector databases for RAG (Retrieval-Augmented Generation) workflows
- **Prompt Management**: Create and manage reusable prompts
## Prerequisites
You need a running Llama Stack server. The UI is a client that connects to the Llama Stack backend.
If you don't have a Llama Stack server running yet, see the [Starting Llama Stack Server](./starting_llama_stack_server.mdx) guide.
## Running the UI
### Option 1: Using npx (Recommended for Quick Start)
The fastest way to get started is using `npx`:
```bash
npx llama-stack-ui
```
This will start the UI server on `http://localhost:8322` (default port).
### Option 2: Using Docker
Run the UI in a container:
```bash
docker run -p 8322:8322 llamastack/ui
```
Access the UI at `http://localhost:8322`.
## Environment Variables
The UI can be configured using the following environment variables:
| Variable | Description | Default |
|----------|-------------|---------|
| `LLAMA_STACK_BACKEND_URL` | URL of your Llama Stack server | `http://localhost:8321` |
| `LLAMA_STACK_UI_PORT` | Port for the UI server | `8322` |
If the Llama Stack server is running with authentication enabled, you can configure the UI to use it by setting the following environment variables:
| Variable | Description | Default |
|----------|-------------|---------|
| `NEXTAUTH_URL` | NextAuth URL for authentication | `http://localhost:8322` |
| `GITHUB_CLIENT_ID` | GitHub OAuth client ID (optional, for authentication) | - |
| `GITHUB_CLIENT_SECRET` | GitHub OAuth client secret (optional, for authentication) | - |
### Setting Environment Variables
#### For npx:
```bash
LLAMA_STACK_BACKEND_URL=http://localhost:8321 \
LLAMA_STACK_UI_PORT=8080 \
npx llama-stack-ui
```
#### For Docker:
```bash
docker run -p 8080:8080 \
-e LLAMA_STACK_BACKEND_URL=http://localhost:8321 \
-e LLAMA_STACK_UI_PORT=8080 \
llamastack/ui
```
## Using the UI
### Managing Resources
- **Vector Stores**: Create vector databases for RAG workflows, view stored documents and embeddings
- **Prompts**: Create and manage reusable prompt templates
- **Chat Completions**: View history of chat interactions
- **Responses**: Browse detailed agent responses and tool calls
## Development
If you want to run the UI from source for development:
```bash
# From the project root
cd src/llama_stack_ui
# Install dependencies
npm install
# Set environment variables
export LLAMA_STACK_BACKEND_URL=http://localhost:8321
# Start the development server
npm run dev
```
The development server will start on `http://localhost:8322` with hot reloading enabled.

View file

@ -0,0 +1,143 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# OCI Distribution
The `llamastack/distribution-oci` distribution consists of the following provider configurations.
| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` |
| files | `inline::localfs` |
| inference | `remote::oci` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
### Environment Variables
The following environment variables can be configured:
- `OCI_AUTH_TYPE`: OCI authentication type (instance_principal or config_file) (default: `instance_principal`)
- `OCI_REGION`: OCI region (e.g., us-ashburn-1, us-chicago-1, us-phoenix-1, eu-frankfurt-1) (default: ``)
- `OCI_COMPARTMENT_OCID`: OCI compartment ID for the Generative AI service (default: ``)
- `OCI_CONFIG_FILE_PATH`: OCI config file path (required if OCI_AUTH_TYPE is config_file) (default: `~/.oci/config`)
- `OCI_CLI_PROFILE`: OCI CLI profile name to use from config file (default: `DEFAULT`)
## Prerequisites
### Oracle Cloud Infrastructure Setup
Before using the OCI Generative AI distribution, ensure you have:
1. **Oracle Cloud Infrastructure Account**: Sign up at [Oracle Cloud Infrastructure](https://cloud.oracle.com/)
2. **Generative AI Service Access**: Enable the Generative AI service in your OCI tenancy
3. **Compartment**: Create or identify a compartment where you'll deploy Generative AI models
4. **Authentication**: Configure authentication using either:
- **Instance Principal** (recommended for cloud-hosted deployments)
- **API Key** (for on-premises or development environments)
### Authentication Methods
#### Instance Principal Authentication (Recommended)
Instance Principal authentication allows OCI resources to authenticate using the identity of the compute instance they're running on. This is the most secure method for production deployments.
Requirements:
- Instance must be running in an Oracle Cloud Infrastructure compartment
- Instance must have appropriate IAM policies to access Generative AI services
#### API Key Authentication
For development or on-premises deployments, follow [this doc](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/apisigningkey.htm) to learn how to create your API signing key for your config file.
### Required IAM Policies
Ensure your OCI user or instance has the following policy statements:
```
Allow group <group_name> to use generative-ai-inference-endpoints in compartment <compartment_name>
Allow group <group_name> to manage generative-ai-inference-endpoints in compartment <compartment_name>
```
## Supported Services
### Inference: OCI Generative AI
Oracle Cloud Infrastructure Generative AI provides access to high-performance AI models through OCI's Platform-as-a-Service offering. The service supports:
- **Chat Completions**: Conversational AI with context awareness
- **Text Generation**: Complete prompts and generate text content
#### Available Models
OCI Generative AI provides access to models from Meta, Cohere, OpenAI, Grok, and other providers.
### Safety: Llama Guard
For content safety and moderation, this distribution uses Meta's LlamaGuard model through the OCI Generative AI service to provide:
- Content filtering and moderation
- Policy compliance checking
- Harmful content detection
### Vector Storage: Multiple Options
The distribution supports several vector storage providers:
- **FAISS**: Local in-memory vector search
- **ChromaDB**: Distributed vector database
- **PGVector**: PostgreSQL with vector extensions
### Additional Services
- **Dataset I/O**: Local filesystem and Hugging Face integration
- **Tool Runtime**: Web search (Brave, Tavily) and RAG capabilities
- **Evaluation**: Meta reference evaluation framework
## Running Llama Stack with OCI
You can run the OCI distribution via Docker or local virtual environment.
### Via venv
If you've set up your local development environment, you can also run the distribution using your local virtual environment.
```bash
OCI_AUTH_TYPE=$OCI_AUTH_TYPE OCI_REGION=$OCI_REGION OCI_COMPARTMENT_OCID=$OCI_COMPARTMENT_OCID llama stack run --port 8321 oci
```
### Configuration Examples
#### Using Instance Principal (Recommended for Production)
```bash
export OCI_AUTH_TYPE=instance_principal
export OCI_REGION=us-chicago-1
export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..your-compartment-id
```
#### Using API Key Authentication (Development)
```bash
export OCI_AUTH_TYPE=config_file
export OCI_CONFIG_FILE_PATH=~/.oci/config
export OCI_CLI_PROFILE=DEFAULT
export OCI_REGION=us-chicago-1
export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..your-compartment-id
```
## Regional Endpoints
OCI Generative AI is available in multiple regions. The service automatically routes to the appropriate regional endpoint based on your configuration. For a full list of regional model availability, visit:
https://docs.oracle.com/en-us/iaas/Content/generative-ai/overview.htm#regions
## Troubleshooting
### Common Issues
1. **Authentication Errors**: Verify your OCI credentials and IAM policies
2. **Model Not Found**: Ensure the model OCID is correct and the model is available in your region
3. **Permission Denied**: Check compartment permissions and Generative AI service access
4. **Region Unavailable**: Verify the specified region supports Generative AI services
### Getting Help
For additional support:
- [OCI Generative AI Documentation](https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm)
- [Llama Stack Issues](https://github.com/meta-llama/llama-stack/issues)

View file

@ -163,7 +163,41 @@ docker run \
--port $LLAMA_STACK_PORT --port $LLAMA_STACK_PORT
``` ```
### Via venv The container will run the distribution with a SQLite store by default. This store is used for the following components:
- Metadata store: store metadata about the models, providers, etc.
- Inference store: collection of responses from the inference provider
- Agents store: store agent configurations (sessions, turns, etc.)
- Agents Responses store: store responses from the agents
However, you can use PostgreSQL instead by running the `starter::run-with-postgres-store.yaml` configuration:
```bash
docker run \
-it \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-e OPENAI_API_KEY=your_openai_key \
-e FIREWORKS_API_KEY=your_fireworks_key \
-e TOGETHER_API_KEY=your_together_key \
-e POSTGRES_HOST=your_postgres_host \
-e POSTGRES_PORT=your_postgres_port \
-e POSTGRES_DB=your_postgres_db \
-e POSTGRES_USER=your_postgres_user \
-e POSTGRES_PASSWORD=your_postgres_password \
llamastack/distribution-starter \
starter::run-with-postgres-store.yaml
```
Postgres environment variables:
- `POSTGRES_HOST`: Postgres host (default: `localhost`)
- `POSTGRES_PORT`: Postgres port (default: `5432`)
- `POSTGRES_DB`: Postgres database name (default: `llamastack`)
- `POSTGRES_USER`: Postgres username (default: `llamastack`)
- `POSTGRES_PASSWORD`: Postgres password (default: `llamastack`)
### Via Conda or venv
Ensure you have configured the starter distribution using the environment variables explained above. Ensure you have configured the starter distribution using the environment variables explained above.
@ -171,8 +205,11 @@ Ensure you have configured the starter distribution using the environment variab
# Install dependencies for the starter distribution # Install dependencies for the starter distribution
uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install uv run --with llama-stack llama stack list-deps starter | xargs -L1 uv pip install
# Run the server # Run the server (with SQLite - default)
uv run --with llama-stack llama stack run starter uv run --with llama-stack llama stack run starter
# Or run with PostgreSQL
uv run --with llama-stack llama stack run starter::run-with-postgres-store.yaml
``` ```
## Example Usage ## Example Usage

View file

@ -144,7 +144,7 @@ source .venv/bin/activate
```bash ```bash
uv venv client --python 3.12 uv venv client --python 3.12
source client/bin/activate source client/bin/activate
pip install llama-stack-client uv pip install llama-stack-client
``` ```
</TabItem> </TabItem>
</Tabs> </Tabs>

View file

@ -0,0 +1,290 @@
---
sidebar_label: Files
title: Files
---
## Overview
The Files API provides file management capabilities for Llama Stack. It allows you to upload, store, retrieve, and manage files that can be used across various endpoints in your application.
## Features
- **File Upload**: Upload files with metadata and purpose classification
- **File Management**: List, retrieve, and delete files
- **Content Retrieval**: Access raw file content for processing
- **API Compatibility**: Full compatibility with OpenAI Files API endpoints
- **Flexible Storage**: Support for local filesystem and cloud storage backends
## API Endpoints
### Upload File
**POST** `/v1/openai/v1/files`
Upload a file that can be used across various endpoints.
**Request Body:**
- `file`: The file object to be uploaded (multipart form data)
- `purpose`: The intended purpose of the uploaded file
**Supported Purposes:**
- `batch`: Files for batch operations
**Response:**
```json
{
"id": "file-abc123",
"object": "file",
"bytes": 140,
"created_at": 1613779121,
"filename": "mydata.jsonl",
"purpose": "batch"
}
```
**Example:**
```python
import requests
with open("data.jsonl", "rb") as f:
files = {"file": f}
data = {"purpose": "batch"}
response = requests.post(
"http://localhost:8000/v1/openai/v1/files", files=files, data=data
)
file_info = response.json()
```
### List Files
**GET** `/v1/openai/v1/files`
Returns a list of files that belong to the user's organization.
**Query Parameters:**
- `after` (optional): A cursor for pagination
- `limit` (optional): Limit on number of objects (1-10,000, default: 10,000)
- `order` (optional): Sort order by created_at timestamp (`asc` or `desc`, default: `desc`)
- `purpose` (optional): Filter files by purpose
**Response:**
```json
{
"object": "list",
"data": [
{
"id": "file-abc123",
"object": "file",
"bytes": 140,
"created_at": 1613779121,
"filename": "mydata.jsonl",
"purpose": "fine-tune"
}
],
"has_more": false
}
```
**Example:**
```python
import requests
# List all files
response = requests.get("http://localhost:8000/v1/openai/v1/files")
files = response.json()
# List files with pagination
response = requests.get(
"http://localhost:8000/v1/openai/v1/files",
params={"limit": 10, "after": "file-abc123"},
)
files = response.json()
# Filter by purpose
response = requests.get(
"http://localhost:8000/v1/openai/v1/files", params={"purpose": "fine-tune"}
)
files = response.json()
```
### Retrieve File
**GET** `/v1/openai/v1/files/{file_id}`
Returns information about a specific file.
**Path Parameters:**
- `file_id`: The ID of the file to retrieve
**Response:**
```json
{
"id": "file-abc123",
"object": "file",
"bytes": 140,
"created_at": 1613779121,
"filename": "mydata.jsonl",
"purpose": "fine-tune"
}
```
**Example:**
```python
import requests
file_id = "file-abc123"
response = requests.get(f"http://localhost:8000/v1/openai/v1/files/{file_id}")
file_info = response.json()
```
### Delete File
**DELETE** `/v1/openai/v1/files/{file_id}`
Delete a file.
**Path Parameters:**
- `file_id`: The ID of the file to delete
**Response:**
```json
{
"id": "file-abc123",
"object": "file",
"deleted": true
}
```
**Example:**
```python
import requests
file_id = "file-abc123"
response = requests.delete(f"http://localhost:8000/v1/openai/v1/files/{file_id}")
result = response.json()
```
### Retrieve File Content
**GET** `/v1/openai/v1/files/{file_id}/content`
Returns the raw file content as a binary response.
**Path Parameters:**
- `file_id`: The ID of the file to retrieve content from
**Response:**
Binary file content with appropriate headers:
- `Content-Type`: `application/octet-stream`
- `Content-Disposition`: `attachment; filename="filename"`
**Example:**
```python
import requests
file_id = "file-abc123"
response = requests.get(f"http://localhost:8000/v1/openai/v1/files/{file_id}/content")
# Save content to file
with open("downloaded_file.jsonl", "wb") as f:
f.write(response.content)
# Or process content directly
content = response.content
```
## Vector Store Integration
The Files API integrates with Vector Stores to enable document processing and search. For detailed information about this integration, see [File Operations and Vector Store Integration](../concepts/file_operations_vector_stores.md).
### Vector Store File Operations
**List Vector Store Files:**
- **GET** `/v1/openai/v1/vector_stores/{vector_store_id}/files`
**Retrieve Vector Store File Content:**
- **GET** `/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content`
**Attach File to Vector Store:**
- **POST** `/v1/openai/v1/vector_stores/{vector_store_id}/files`
## Error Handling
The Files API returns standard HTTP status codes and error responses:
- `400 Bad Request`: Invalid request parameters
- `404 Not Found`: File not found
- `429 Too Many Requests`: Rate limit exceeded
- `500 Internal Server Error`: Server error
**Error Response Format:**
```json
{
"error": {
"message": "Error description",
"type": "invalid_request_error",
"code": "file_not_found"
}
}
```
## Rate Limits
The Files API implements rate limiting to ensure fair usage:
- File uploads: 100 files per minute
- File retrievals: 1000 requests per minute
- File deletions: 100 requests per minute
## Best Practices
1. **File Organization**: Use descriptive filenames and appropriate purpose classifications
2. **Batch Operations**: For multiple files, consider using batch endpoints when available
3. **Error Handling**: Always check response status codes and handle errors gracefully
4. **Content Types**: Ensure files are uploaded with appropriate content types
5. **Cleanup**: Regularly delete unused files to manage storage costs
## Integration Examples
### With Python Client
```python
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url="http://localhost:8000")
# Upload a file
with open("data.jsonl", "rb") as f:
file_info = await client.files.upload(file=f, purpose="fine-tune")
# List files
files = await client.files.list(purpose="fine-tune")
# Retrieve file content
content = await client.files.retrieve_content(file_info.id)
```
### With cURL
```bash
# Upload file
curl -X POST http://localhost:8000/v1/openai/v1/files \
-F "file=@data.jsonl" \
-F "purpose=fine-tune"
# List files
curl http://localhost:8000/v1/openai/v1/files
# Download file content
curl http://localhost:8000/v1/openai/v1/files/file-abc123/content \
-o downloaded_file.jsonl
```
## Provider Support
The Files API supports multiple storage backends:
- **Local Filesystem**: Store files on local disk (inline provider)
- **S3**: Store files in AWS S3 or S3-compatible services (remote provider)
- **Custom Backends**: Extensible architecture for custom storage providers
See the [Files Providers](index.md) documentation for detailed configuration options.

View file

@ -0,0 +1,80 @@
# File Operations Quick Reference
## Overview
As of release 0.2.14, Llama Stack provides comprehensive file operations and Vector Store API integration, following the [OpenAI Vector Store Files API specification](https://platform.openai.com/docs/api-reference/vector-stores-files).
> **Note**: For detailed overview and implementation details, see [Overview](../openai_file_operations_support.md#overview) in the full documentation.
## Supported Providers
> **Note**: For complete provider details and features, see [Supported Providers](../openai_file_operations_support.md#supported-providers) in the full documentation.
**Inline Providers**: FAISS, SQLite-vec, Milvus
**Remote Providers**: ChromaDB, Qdrant, Weaviate, PGVector
## Quick Start
### 1. Upload File
```python
file_info = await client.files.upload(
file=open("document.pdf", "rb"), purpose="assistants"
)
```
### 2. Create Vector Store
```python
vector_store = client.vector_stores.create(name="my_docs")
```
### 3. Attach File
```python
await client.vector_stores.files.create(
vector_store_id=vector_store.id, file_id=file_info.id
)
```
### 4. Search
```python
results = await client.vector_stores.search(
vector_store_id=vector_store.id, query="What is the main topic?", max_num_results=5
)
```
## File Processing & Search
**Processing**: 800 tokens default chunk size, 400 token overlap
**Formats**: PDF, DOCX, TXT, Code files, etc.
**Search**: Vector similarity, Hybrid (SQLite-vec), Filtered with metadata
## Configuration
> **Note**: For detailed configuration examples and options, see [Configuration Examples](../openai_file_operations_support.md#configuration-examples) in the full documentation.
**Basic Setup**: Configure vector_io and files providers in your run.yaml
## Common Use Cases
- **RAG Systems**: Document Q&A with file uploads
- **Knowledge Bases**: Searchable document collections
- **Content Analysis**: Document similarity and clustering
- **Research Tools**: Literature review and analysis
## Performance Tips
> **Note**: For detailed performance optimization strategies, see [Performance Considerations](../openai_file_operations_support.md#performance-considerations) in the full documentation.
**Quick Tips**: Choose provider based on your needs (speed vs. storage vs. scalability)
## Troubleshooting
> **Note**: For comprehensive troubleshooting, see [Troubleshooting](../openai_file_operations_support.md#troubleshooting) in the full documentation.
**Quick Fixes**: Check file format compatibility, optimize chunk sizes, monitor storage
## Resources
- [Full Documentation](openai_file_operations_support.md)
- [Integration Guide](../concepts/file_operations_vector_stores.md)
- [Files API](files_api.md)
- [Provider Details](../vector_io/index.md)

View file

@ -0,0 +1,291 @@
# File Operations Support in Vector Store Providers
## Overview
This document provides a comprehensive overview of file operations and Vector Store API support across all available vector store providers in Llama Stack. As of release 0.2.24, the following providers support full file operations integration.
## Supported Providers
### ✅ Full File Operations Support
The following providers support complete file operations integration, including file upload, automatic processing, and search:
#### Inline Providers (Single Node)
| Provider | File Operations | Key Features |
|----------|----------------|--------------|
| **FAISS** | ✅ Full Support | Fast in-memory search, GPU acceleration |
| **SQLite-vec** | ✅ Full Support | Hybrid search, disk-based storage |
| **Milvus** | ✅ Full Support | High-performance, scalable indexing |
#### Remote Providers (Hosted)
| Provider | File Operations | Key Features |
|----------|----------------|--------------|
| **ChromaDB** | ✅ Full Support | Metadata filtering, persistent storage |
| **Qdrant** | ✅ Full Support | Payload filtering, advanced search |
| **Weaviate** | ✅ Full Support | GraphQL interface, schema management |
| **Postgres (PGVector)** | ✅ Full Support | SQL integration, ACID compliance |
### 🔄 Partial Support
Some providers may support basic vector operations but lack full file operations integration:
| Provider | Status | Notes |
|----------|--------|-------|
| **Meta Reference** | 🔄 Basic | Core vector operations only |
## File Operations Features
All supported providers offer the following file operations capabilities:
### Core Functionality
- **File Upload & Processing**: Automatic document ingestion and chunking
- **Vector Storage**: Embedding generation and storage
- **Search & Retrieval**: Semantic search with metadata filtering
- **File Management**: List, retrieve, and manage files in vector stores
### Advanced Features
- **Automatic Chunking**: Configurable chunk sizes and overlap
- **Metadata Preservation**: File attributes and chunk metadata
- **Status Tracking**: Monitor file processing progress
- **Error Handling**: Comprehensive error reporting and recovery
## Implementation Details
### File Processing Pipeline
1. **Upload**: File uploaded via Files API
2. **Extraction**: Text content extracted from various formats
3. **Chunking**: Content split into optimal chunks (default: 800 tokens)
4. **Embedding**: Chunks converted to vector embeddings
5. **Storage**: Vectors stored with metadata in vector database
6. **Indexing**: Search index updated for fast retrieval
### Supported File Formats
- **Documents**: PDF, DOCX, DOC
- **Text**: TXT, MD, RST
- **Code**: Python, JavaScript, Java, C++, etc.
- **Data**: JSON, CSV, XML
- **Web**: HTML files
### Chunking Strategies
- **Default**: 800 tokens with 400 token overlap
- **Custom**: Configurable chunk sizes and overlap
- **Static**: Fixed-size chunks with overlap
## Provider-Specific Features
### FAISS
- **Storage**: In-memory with optional persistence
- **Performance**: Optimized for speed and GPU acceleration
- **Use Case**: High-performance, memory-constrained environments
### SQLite-vec
- **Storage**: Disk-based with SQLite backend
- **Search**: Hybrid vector + keyword search
- **Use Case**: Large document collections, frequent updates
### Milvus
- **Storage**: Scalable distributed storage
- **Indexing**: Multiple index types (IVF, HNSW)
- **Use Case**: Production deployments, large-scale applications
### ChromaDB
- **Storage**: Persistent storage with metadata
- **Filtering**: Advanced metadata filtering
- **Use Case**: Applications requiring rich metadata
### Qdrant
- **Storage**: High-performance vector database
- **Filtering**: Payload-based filtering
- **Use Case**: Real-time applications, complex queries
### Weaviate
- **Storage**: GraphQL-native vector database
- **Schema**: Flexible schema management
- **Use Case**: Applications requiring complex data relationships
### Postgres (PGVector)
- **Storage**: SQL database with vector extensions
- **Integration**: ACID compliance, existing SQL workflows
- **Use Case**: Applications requiring transactional guarantees
## Configuration Examples
### Basic Configuration
```yaml
vector_io:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
db_path: ~/.llama/faiss_store.db
```
### With FileResponse Support
```yaml
vector_io:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
db_path: ~/.llama/faiss_store.db
files:
- provider_id: local-files
provider_type: inline::localfs
config:
storage_dir: ~/.llama/files
metadata_store:
type: sqlite
db_path: ~/.llama/files_metadata.db
```
## Usage Examples
### Python Client
```python
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url="http://localhost:8000")
# Create vector store
vector_store = client.vector_stores.create(name="documents")
# Upload and process file
with open("document.pdf", "rb") as f:
file_info = await client.files.upload(file=f, purpose="assistants")
# Attach to vector store
await client.vector_stores.files.create(
vector_store_id=vector_store.id, file_id=file_info.id
)
# Search
results = await client.vector_stores.search(
vector_store_id=vector_store.id, query="What is the main topic?", max_num_results=5
)
```
### cURL Commands
```bash
# Upload file
curl -X POST http://localhost:8000/v1/openai/v1/files \
-F "file=@document.pdf" \
-F "purpose=assistants"
# Create vector store
curl -X POST http://localhost:8000/v1/openai/v1/vector_stores \
-H "Content-Type: application/json" \
-d '{"name": "documents"}'
# Attach file to vector store
curl -X POST http://localhost:8000/v1/openai/v1/vector_stores/{store_id}/files \
-H "Content-Type: application/json" \
-d '{"file_id": "file-abc123"}'
# Search vector store
curl -X POST http://localhost:8000/v1/openai/v1/vector_stores/{store_id}/search \
-H "Content-Type: application/json" \
-d '{"query": "What is the main topic?", "max_num_results": 5}'
```
## Performance Considerations
### Chunk Size Optimization
- **Small chunks (400-600 tokens)**: Better precision, more results
- **Large chunks (800-1200 tokens)**: Better context, fewer results
- **Overlap (50%)**: Maintains context between chunks
### Storage Efficiency
- **FAISS**: Fastest, but memory-limited
- **SQLite-vec**: Good balance of performance and storage
- **Milvus**: Scalable, production-ready
- **Remote providers**: Managed, but network-dependent
### Search Performance
- **Vector search**: Fastest for semantic queries
- **Hybrid search**: Best accuracy (SQLite-vec only)
- **Filtered search**: Fast with metadata constraints
## Troubleshooting
### Common Issues
1. **File Processing Failures**
- Check file format compatibility
- Verify file size limits
- Review error messages in file status
2. **Search Performance**
- Optimize chunk sizes for your use case
- Use filters to narrow search scope
- Monitor vector store metrics
3. **Storage Issues**
- Check available disk space
- Verify database permissions
- Monitor memory usage (for in-memory providers)
### Monitoring
```python
# Check file processing status
file_status = await client.vector_stores.files.retrieve(
vector_store_id=vector_store.id, file_id=file_info.id
)
if file_status.status == "failed":
print(f"Error: {file_status.last_error.message}")
# Monitor vector store health
health = await client.vector_stores.health(vector_store_id=vector_store.id)
print(f"Status: {health.status}")
```
## Best Practices
1. **File Organization**: Use descriptive names and organize by purpose
2. **Chunking Strategy**: Test different sizes for your specific use case
3. **Metadata**: Add relevant attributes for better filtering
4. **Monitoring**: Track processing status and search performance
5. **Cleanup**: Regularly remove unused files to manage storage
## Future Enhancements
Planned improvements for file operations support:
- **Batch Processing**: Process multiple files simultaneously
- **Advanced Chunking**: More sophisticated chunking algorithms
- **Custom Embeddings**: Support for custom embedding models
- **Real-time Updates**: Live file processing and indexing
- **Multi-format Support**: Enhanced file format support
## Support and Resources
- **Documentation**: [File Operations and Vector Store Integration](../../concepts/file_operations_vector_stores.mdx)
- **API Reference**: [Files API](files_api.md)
- **Provider Docs**: [Vector Store Providers](../vector_io/index.md)
- **Examples**: [Getting Started](../getting_started/index.md)
- **Community**: [GitHub Discussions](https://github.com/meta-llama/llama-stack/discussions)

View file

@ -22,6 +22,7 @@ Importantly, Llama Stack always strives to provide at least one fully inline pro
## Provider Categories ## Provider Categories
- **[External Providers](external/index.mdx)** - Guide for building and using external providers - **[External Providers](external/index.mdx)** - Guide for building and using external providers
- **[OpenAI Compatibility](../api-openai/index.mdx)** - OpenAI API compatibility layer
- **[Inference](inference/index.mdx)** - LLM and embedding model providers - **[Inference](inference/index.mdx)** - LLM and embedding model providers
- **[Agents](agents/index.mdx)** - Agentic system providers - **[Agents](agents/index.mdx)** - Agentic system providers
- **[DatasetIO](datasetio/index.mdx)** - Dataset and data loader providers - **[DatasetIO](datasetio/index.mdx)** - Dataset and data loader providers
@ -30,6 +31,16 @@ Importantly, Llama Stack always strives to provide at least one fully inline pro
- **[Tool Runtime](tool_runtime/index.mdx)** - Tool and protocol providers - **[Tool Runtime](tool_runtime/index.mdx)** - Tool and protocol providers
- **[Files](files/index.mdx)** - File system and storage providers - **[Files](files/index.mdx)** - File system and storage providers
## Other information about Providers ## API Documentation
- **[OpenAI Compatibility](./openai.mdx)** - OpenAI API compatibility layer
For comprehensive API documentation and reference:
- **[API Reference](../api/index.mdx)** - Complete API documentation
- **[Experimental APIs](../api-experimental/index.mdx)** - APIs in development
- **[Deprecated APIs](../api-deprecated/index.mdx)** - Legacy APIs being phased out
- **[OpenAI Compatibility](../api-openai/index.mdx)** - OpenAI API compatibility guide
## Additional Provider Information
- **[OpenAI Implementation Guide](./openai.mdx)** - Code examples and implementation details for OpenAI APIs
- **[OpenAI-Compatible Responses Limitations](./openai_responses_limitations.mdx)** - Known limitations of the Responses API in Llama Stack - **[OpenAI-Compatible Responses Limitations](./openai_responses_limitations.mdx)** - Known limitations of the Responses API in Llama Stack

View file

@ -1,5 +1,5 @@
--- ---
description: "AWS Bedrock inference provider for accessing various AI models through AWS's managed service." description: "AWS Bedrock inference provider using OpenAI compatible endpoint."
sidebar_label: Remote - Bedrock sidebar_label: Remote - Bedrock
title: remote::bedrock title: remote::bedrock
--- ---
@ -8,7 +8,7 @@ title: remote::bedrock
## Description ## Description
AWS Bedrock inference provider for accessing various AI models through AWS's managed service. AWS Bedrock inference provider using OpenAI compatible endpoint.
## Configuration ## Configuration
@ -16,19 +16,12 @@ AWS Bedrock inference provider for accessing various AI models through AWS's man
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. | | `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider | | `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `aws_access_key_id` | `str \| None` | No | | The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID | | `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `aws_secret_access_key` | `str \| None` | No | | The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY | | `region_name` | `<class 'str'>` | No | us-east-2 | AWS Region for the Bedrock Runtime endpoint |
| `aws_session_token` | `str \| None` | No | | The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN |
| `region_name` | `str \| None` | No | | The default AWS Region to use, for example, us-west-1 or us-west-2.Default use environment variable: AWS_DEFAULT_REGION |
| `profile_name` | `str \| None` | No | | The profile name that contains credentials to use.Default use environment variable: AWS_PROFILE |
| `total_max_attempts` | `int \| None` | No | | An integer representing the maximum number of attempts that will be made for a single request, including the initial attempt. Default use environment variable: AWS_MAX_ATTEMPTS |
| `retry_mode` | `str \| None` | No | | A string representing the type of retries Boto3 will perform.Default use environment variable: AWS_RETRY_MODE |
| `connect_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to make a connection. The default is 60 seconds. |
| `read_timeout` | `float \| None` | No | 60.0 | The time in seconds till a timeout exception is thrown when attempting to read from a connection.The default is 60 seconds. |
| `session_ttl` | `int \| None` | No | 3600 | The time in seconds till a session expires. The default is 3600 seconds (1 hour). |
## Sample Configuration ## Sample Configuration
```yaml ```yaml
{} api_key: ${env.AWS_BEDROCK_API_KEY:=}
region_name: ${env.AWS_DEFAULT_REGION:=us-east-2}
``` ```

View file

@ -0,0 +1,41 @@
---
description: |
Oracle Cloud Infrastructure (OCI) Generative AI inference provider for accessing OCI's Generative AI Platform-as-a-Service models.
Provider documentation
https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm
sidebar_label: Remote - Oci
title: remote::oci
---
# remote::oci
## Description
Oracle Cloud Infrastructure (OCI) Generative AI inference provider for accessing OCI's Generative AI Platform-as-a-Service models.
Provider documentation
https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `oci_auth_type` | `<class 'str'>` | No | instance_principal | OCI authentication type (must be one of: instance_principal, config_file) |
| `oci_region` | `<class 'str'>` | No | us-ashburn-1 | OCI region (e.g., us-ashburn-1) |
| `oci_compartment_id` | `<class 'str'>` | No | | OCI compartment ID for the Generative AI service |
| `oci_config_file_path` | `<class 'str'>` | No | ~/.oci/config | OCI config file path (required if oci_auth_type is config_file) |
| `oci_config_profile` | `<class 'str'>` | No | DEFAULT | OCI config profile (required if oci_auth_type is config_file) |
## Sample Configuration
```yaml
oci_auth_type: ${env.OCI_AUTH_TYPE:=instance_principal}
oci_config_file_path: ${env.OCI_CONFIG_FILE_PATH:=~/.oci/config}
oci_config_profile: ${env.OCI_CLI_PROFILE:=DEFAULT}
oci_region: ${env.OCI_REGION:=us-ashburn-1}
oci_compartment_id: ${env.OCI_COMPARTMENT_OCID:=}
```

View file

@ -16,7 +16,7 @@ Passthrough inference provider for connecting to any external inference service
|-------|------|----------|---------|-------------| |-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. | | `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider | | `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | API Key for the passthrouth endpoint | | `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | | The URL for the passthrough endpoint | | `url` | `<class 'str'>` | No | | The URL for the passthrough endpoint |
## Sample Configuration ## Sample Configuration

View file

@ -1,9 +1,14 @@
--- ---
title: OpenAI Compatibility title: OpenAI Implementation Guide
description: OpenAI API Compatibility description: Code examples and implementation details for OpenAI API compatibility
sidebar_label: OpenAI Compatibility sidebar_label: OpenAI Implementation
sidebar_position: 1 sidebar_position: 2
--- ---
# OpenAI Implementation Guide
This guide provides detailed code examples and implementation details for using OpenAI-compatible APIs with Llama Stack. For a comprehensive overview of OpenAI compatibility features, see our [OpenAI API Compatibility Guide](../api-openai/index.mdx).
## OpenAI API Compatibility ## OpenAI API Compatibility
### Server path ### Server path
@ -195,3 +200,9 @@ Lines of code unfurl
Logic whispers in the dark Logic whispers in the dark
Art in hidden form Art in hidden form
``` ```
## Additional Resources
- **[OpenAI API Compatibility Guide](../api-openai/index.mdx)** - Comprehensive overview of OpenAI compatibility features
- **[OpenAI Responses API Limitations](./openai_responses_limitations.mdx)** - Detailed limitations and known issues
- **[Provider Documentation](../index.mdx)** - Complete provider ecosystem overview

View file

@ -48,11 +48,9 @@ Both OpenAI and Llama Stack support a web-search built-in tool. The [OpenAI doc
> The type of the web search tool. One of `web_search` or `web_search_2025_08_26`. > The type of the web search tool. One of `web_search` or `web_search_2025_08_26`.
In contrast, the [Llama Stack documentation](https://llamastack.github.io/docs/api/create-a-new-open-ai-response) says that the allowed values for `type` for web search are `MOD1`, `MOD2` and `MOD3`. Llama Stack now supports both `web_search` and `web_search_2025_08_26` types, matching OpenAI's API. For backward compatibility, Llama Stack also supports `web_search_preview` and `web_search_preview_2025_03_11` types.
Is that correct? If so, what are the meanings of each of them? It might make sense for the allowed values for OpenAI map to some values for Llama Stack so that code written to the OpenAI specification
also work with Llama Stack.
The OpenAI web search tool also has fields for `filters` and `user_location` which are not documented as options for Llama Stack. If feasible, it would be good to support these too. The OpenAI web search tool also has fields for `filters` and `user_location` which are not yet implemented in Llama Stack. If feasible, it would be good to support these too.
--- ---

View file

@ -37,7 +37,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# NBVAL_SKIP\n", "# NBVAL_SKIP\n",
"!pip install -U llama-stack\n", "!pip install -U llama-stack llama-stack-client\n",
"llama stack list-deps fireworks | xargs -L1 uv pip install\n" "llama stack list-deps fireworks | xargs -L1 uv pip install\n"
] ]
}, },

View file

@ -44,7 +44,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# NBVAL_SKIP\n", "# NBVAL_SKIP\n",
"!pip install -U llama-stack" "!pip install -U llama-stack llama-stack-client\n"
] ]
}, },
{ {

View file

@ -74,6 +74,7 @@
"source": [ "source": [
"```bash\n", "```bash\n",
"uv sync --extra dev\n", "uv sync --extra dev\n",
"uv pip install -U llama-stack-client\n",
"uv pip install -e .\n", "uv pip install -e .\n",
"source .venv/bin/activate\n", "source .venv/bin/activate\n",
"```" "```"

View file

@ -170,7 +170,7 @@ def _get_endpoint_functions(
for webmethod in webmethods: for webmethod in webmethods:
print(f"Processing {colored(func_name, 'white')}...") print(f"Processing {colored(func_name, 'white')}...")
operation_name = func_name operation_name = func_name
if webmethod.method == "GET": if webmethod.method == "GET":
prefix = "get" prefix = "get"
elif webmethod.method == "DELETE": elif webmethod.method == "DELETE":
@ -196,16 +196,10 @@ def _get_endpoint_functions(
def _get_defining_class(member_fn: str, derived_cls: type) -> type: def _get_defining_class(member_fn: str, derived_cls: type) -> type:
"Find the class in which a member function is first defined in a class inheritance hierarchy." "Find the class in which a member function is first defined in a class inheritance hierarchy."
# This import must be dynamic here
from llama_stack.apis.tools import RAGToolRuntime, ToolRuntime
# iterate in reverse member resolution order to find most specific class first # iterate in reverse member resolution order to find most specific class first
for cls in reversed(inspect.getmro(derived_cls)): for cls in reversed(inspect.getmro(derived_cls)):
for name, _ in inspect.getmembers(cls, inspect.isfunction): for name, _ in inspect.getmembers(cls, inspect.isfunction):
if name == member_fn: if name == member_fn:
# HACK ALERT
if cls == RAGToolRuntime:
return ToolRuntime
return cls return cls
raise ValidationError( raise ValidationError(

View file

@ -57,6 +57,7 @@ const sidebars: SidebarsConfig = {
'distributions/importing_as_library', 'distributions/importing_as_library',
'distributions/configuration', 'distributions/configuration',
'distributions/starting_llama_stack_server', 'distributions/starting_llama_stack_server',
'distributions/llama_stack_ui',
{ {
type: 'category', type: 'category',
label: 'Self-Hosted Distributions', label: 'Self-Hosted Distributions',

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -24,13 +24,13 @@ classifiers = [
"Topic :: Scientific/Engineering :: Information Analysis", "Topic :: Scientific/Engineering :: Information Analysis",
] ]
dependencies = [ dependencies = [
"PyYAML>=6.0",
"aiohttp", "aiohttp",
"fastapi>=0.115.0,<1.0", # server "fastapi>=0.115.0,<1.0", # server
"fire", # for MCP in LLS client "fire", # for MCP in LLS client
"httpx", "httpx",
"jinja2>=3.1.6", "jinja2>=3.1.6",
"jsonschema", "jsonschema",
"llama-stack-client>=0.3.0",
"openai>=2.5.0", "openai>=2.5.0",
"prompt-toolkit", "prompt-toolkit",
"python-dotenv", "python-dotenv",
@ -52,11 +52,8 @@ dependencies = [
] ]
[project.optional-dependencies] [project.optional-dependencies]
ui = [ client = [
"streamlit", "llama-stack-client>=0.3.0", # Optional for library-only usage
"pandas",
"llama-stack-client>=0.3.0",
"streamlit-option-menu",
] ]
[dependency-groups] [dependency-groups]
@ -104,6 +101,7 @@ type_checking = [
"lm-format-enforcer", "lm-format-enforcer",
"mcp", "mcp",
"ollama", "ollama",
"llama-stack-client>=0.3.0",
] ]
# These are the dependencies required for running unit tests. # These are the dependencies required for running unit tests.
unit = [ unit = [
@ -114,7 +112,7 @@ unit = [
"aiosqlite", "aiosqlite",
"aiohttp", "aiohttp",
"psycopg2-binary>=2.9.0", "psycopg2-binary>=2.9.0",
"pypdf", "pypdf>=6.1.3",
"mcp", "mcp",
"chardet", "chardet",
"sqlalchemy", "sqlalchemy",
@ -137,7 +135,7 @@ test = [
"torchvision>=0.21.0", "torchvision>=0.21.0",
"chardet", "chardet",
"psycopg2-binary>=2.9.0", "psycopg2-binary>=2.9.0",
"pypdf", "pypdf>=6.1.3",
"mcp", "mcp",
"datasets>=4.0.0", "datasets>=4.0.0",
"autoevals", "autoevals",
@ -300,6 +298,7 @@ exclude = [
"^src/llama_stack/providers/remote/agents/sample/", "^src/llama_stack/providers/remote/agents/sample/",
"^src/llama_stack/providers/remote/datasetio/huggingface/", "^src/llama_stack/providers/remote/datasetio/huggingface/",
"^src/llama_stack/providers/remote/datasetio/nvidia/", "^src/llama_stack/providers/remote/datasetio/nvidia/",
"^src/llama_stack/providers/remote/inference/oci/",
"^src/llama_stack/providers/remote/inference/bedrock/", "^src/llama_stack/providers/remote/inference/bedrock/",
"^src/llama_stack/providers/remote/inference/nvidia/", "^src/llama_stack/providers/remote/inference/nvidia/",
"^src/llama_stack/providers/remote/inference/passthrough/", "^src/llama_stack/providers/remote/inference/passthrough/",

272
scripts/cleanup_recordings.py Executable file
View file

@ -0,0 +1,272 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Clean up unused test recordings based on CI test collection.
This script:
1. Reads CI matrix definitions from tests/integration/ci_matrix.json (default + scheduled overrides)
2. Uses pytest --collect-only with --json-report to gather all test IDs that run in CI
3. Compares against existing recordings to identify unused ones
4. Optionally deletes unused recordings
Usage:
# Dry run - see what would be deleted
./scripts/cleanup_recordings.py
# Save manifest of CI test IDs for inspection
./scripts/cleanup_recordings.py --manifest ci_tests.txt
# Actually delete unused recordings
./scripts/cleanup_recordings.py --delete
"""
import argparse
import json
import os
import subprocess
import tempfile
from collections import defaultdict
from pathlib import Path
# Repository root: this script lives one directory below it.
REPO_ROOT = Path(__file__).parent.parent

# Load the CI matrix from its JSON file once, at import time.
CI_MATRIX_FILE = REPO_ROOT / "tests/integration/ci_matrix.json"
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale-dependent default encoding.
with open(CI_MATRIX_FILE, encoding="utf-8") as f:
    _matrix_config = json.load(f)

# "default" is mandatory (a missing key raises KeyError); "schedules" is optional.
DEFAULT_CI_MATRIX: list[dict[str, str]] = _matrix_config["default"]
SCHEDULED_MATRICES: dict[str, list[dict[str, str]]] = _matrix_config.get("schedules", {})
def _unique_configs(entries):
    """Lazily yield one ``{"suite", "setup"}`` dict per distinct (suite, setup) pair.

    The first occurrence of each pair wins and input order is preserved. Any
    other keys present on an entry are dropped from the yielded dict.
    """
    emitted: set[tuple[str, str]] = set()
    for item in entries:
        pair = (item["suite"], item["setup"])
        if pair not in emitted:
            emitted.add(pair)
            yield {"suite": pair[0], "setup": pair[1]}
def iter_all_ci_configs() -> list[dict[str, str]]:
    """Return the de-duplicated CI configs: default matrix first, then every scheduled matrix."""
    # Flatten the per-schedule lists in the order the schedules appear in the config.
    scheduled = [cfg for group in SCHEDULED_MATRICES.values() for cfg in group]
    return list(_unique_configs([*DEFAULT_CI_MATRIX, *scheduled]))
def collect_ci_tests():
    """Gather the pytest node IDs collected for every CI (suite, setup) combination.

    For each config, ``./scripts/integration-tests.sh --collect-only`` is run from
    the repository root with ``PYTEST_ADDOPTS`` pointing the JSON report at a
    temporary file, and the node IDs found in that report are accumulated.

    Returns:
        A tuple ``(test_ids, configs)``: the set of collected node IDs and the
        list of configs that were processed.

    Raises:
        RuntimeError: If the collection command exits with a non-zero status.
    """
    collected = set()
    ci_configs = iter_all_ci_configs()

    for cfg in ci_configs:
        print(f"Collecting tests for suite={cfg['suite']}, setup={cfg['setup']}...")

        # Reserve a report path on disk; delete=False keeps the file around
        # after the handle is closed so the subprocess can write to it.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
            report_path = tmp.name

        try:
            # Environment for the collection run: JSON report options plus the
            # repo root prepended to PYTHONPATH.
            run_env = os.environ.copy()
            run_env["PYTEST_ADDOPTS"] = f"--json-report --json-report-file={report_path}"
            root = str(REPO_ROOT)
            inherited = run_env.get("PYTHONPATH", "")
            if inherited:
                run_env["PYTHONPATH"] = f"{root}{os.pathsep}{inherited}"
            else:
                run_env["PYTHONPATH"] = root

            command = [
                "./scripts/integration-tests.sh",
                "--collect-only",
                "--suite",
                cfg["suite"],
                "--setup",
                cfg["setup"],
            ]
            proc = subprocess.run(command, capture_output=True, text=True, cwd=REPO_ROOT, env=run_env)
            if proc.returncode != 0:
                raise RuntimeError(
                    "Test collection failed.\n"
                    f"Command: {' '.join(proc.args)}\n"
                    f"stdout:\n{proc.stdout}\n"
                    f"stderr:\n{proc.stderr}"
                )

            # Pull node IDs out of the report: each entry of "collectors" carries a
            # "result" list whose items may hold a "nodeid".
            try:
                with open(report_path) as fh:
                    report = json.load(fh)
                for collector in report.get("collectors", []):
                    collected.update(
                        entry["nodeid"] for entry in collector.get("result", []) if "nodeid" in entry
                    )
                print(f" Collected {len(collected)} test IDs so far")
            except (json.JSONDecodeError, FileNotFoundError) as err:
                # An unreadable report is only warned about; move on to the next config.
                print(f" Warning: Failed to parse JSON report: {err}")
        finally:
            # Always remove the temporary report file.
            if os.path.exists(report_path):
                os.unlink(report_path)

    print(f"\nTotal unique test IDs collected: {len(collected)}")
    return collected, ci_configs
def get_base_test_id(test_id: str) -> str:
    """Strip the parameterization suffix from a pytest node ID.

    Everything from the first ``[`` onward is removed; an ID without ``[`` is
    returned unchanged.

    Example:
        'tests/integration/inference/test_foo.py::test_bar[param1-param2]'
        -> 'tests/integration/inference/test_foo.py::test_bar'
    """
    base, _bracket, _params = test_id.partition("[")
    return base
def find_all_recordings():
    """Return a list of every ``recordings/*.json`` path found recursively under tests/integration."""
    integration_root = REPO_ROOT / "tests/integration"
    matches = integration_root.rglob("recordings/*.json")
    return list(matches)
def analyze_recordings(ci_test_ids, dry_run=True):
    """Classify every recording file and optionally delete the unused ones.

    Each recording is read as JSON and sorted into one of four buckets:

    * used    - its ``test_id`` exactly matches (parameterization included) an
                entry of ``ci_test_ids``
    * shared  - it has no ``test_id`` (or an empty one)
    * unused  - it has a ``test_id`` that is not in ``ci_test_ids``
    * error   - the file could not be read or parsed; such files are reported
                and never deleted

    Args:
        ci_test_ids: Collection of pytest node IDs that run in CI.
        dry_run: When True (the default) only report; when False delete the
            unused recordings.

    Returns:
        The number of unused recordings found.
    """
    all_recordings = find_all_recordings()
    print(f"\nTotal recording files: {len(all_recordings)}")

    # Categorize recordings
    used_recordings = []
    unused_recordings = []
    shared_recordings = []  # model-list endpoints without test_id
    parse_errors = []

    for json_file in all_recordings:
        try:
            with open(json_file) as f:
                data = json.load(f)
            test_id = data.get("test_id", "")
            if not test_id:
                # Shared/infrastructure recordings (model lists, etc)
                shared_recordings.append(json_file)
                continue
            # Match exact test_id (with full parameterization)
            if test_id in ci_test_ids:
                used_recordings.append(json_file)
            else:
                unused_recordings.append((json_file, test_id))
        except Exception as e:
            # Deliberately broad: any unreadable/odd file is reported below
            # instead of aborting the whole analysis.
            parse_errors.append((json_file, str(e)))

    # Print summary
    print("\nRecording Analysis:")
    print(f" Used in CI: {len(used_recordings)}")
    print(f" Shared (no ID): {len(shared_recordings)}")
    print(f" UNUSED: {len(unused_recordings)}")
    print(f" Parse errors: {len(parse_errors)}")

    if unused_recordings:
        print("\nUnused recordings by test:")
        # Group by base test ID so parameterized variants are listed together
        by_test = defaultdict(list)
        for file, test_id in unused_recordings:
            base = get_base_test_id(test_id)
            by_test[base].append(file)
        for base_test, files in sorted(by_test.items()):
            print(f"\n {base_test}")
            print(f" ({len(files)} recording(s))")
            # Show at most three files per test to keep the report short
            for f in files[:3]:
                print(f" - {f.relative_to(REPO_ROOT / 'tests/integration')}")
            if len(files) > 3:
                print(f" ... and {len(files) - 3} more")

    if parse_errors:
        print("\nParse errors:")
        for file, error in parse_errors[:5]:
            print(f" {file.relative_to(REPO_ROOT)}: {error}")
        if len(parse_errors) > 5:
            print(f" ... and {len(parse_errors) - 5} more")

    # Perform cleanup
    if dry_run:
        print("\n(Dry run - no files deleted)")
        print("\nTo delete these files, run with --delete")
    elif unused_recordings and not ci_test_ids:
        # Safety net: with no CI test IDs at all, every recording that has a
        # test_id looks unused. That almost certainly means collection produced
        # nothing, so deleting here would wipe all test recordings.
        print("\nRefusing to delete: no CI test IDs were collected, so every recording appears unused")
    else:
        print(f"\nDeleting {len(unused_recordings)} unused recordings...")
        for file, _ in unused_recordings:
            file.unlink()
            print(f" Deleted: {file.relative_to(REPO_ROOT / 'tests/integration')}")
        print("✅ Cleanup complete")

    return len(unused_recordings)
def main():
    """Command-line entry point: collect CI test IDs, then report or delete unused recordings.

    Flags:
        --delete    delete unused recordings instead of performing a dry run
        --manifest  path of a file to which the sorted CI test IDs are written
    """
    cli = argparse.ArgumentParser(
        description="Clean up unused test recordings based on CI test collection",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    cli.add_argument("--delete", action="store_true", help="Actually delete unused recordings (default is dry-run)")
    cli.add_argument("--manifest", help="Save collected test IDs to file (optional)")
    opts = cli.parse_args()

    banner = "=" * 60
    print(banner)
    print("Recording Cleanup Utility")
    print(banner)

    detected = iter_all_ci_configs()
    print(f"\nDetected CI configurations: {len(detected)}")
    for cfg in detected:
        print(f" - suite={cfg['suite']}, setup={cfg['setup']}")

    # Collect test IDs from CI configurations (the returned config list is not needed here)
    ci_test_ids, _ = collect_ci_tests()

    if opts.manifest:
        # One test ID per line, sorted for stable output
        with open(opts.manifest, "w") as out:
            out.writelines(f"{tid}\n" for tid in sorted(ci_test_ids))
        print(f"\nSaved test IDs to: {opts.manifest}")

    # Analyze and cleanup; deletion only happens when --delete was given
    unused_total = analyze_recordings(ci_test_ids, dry_run=not opts.delete)

    print("\n" + banner)
    if not opts.delete and unused_total > 0:
        print("Run with --delete to remove unused recordings")


if __name__ == "__main__":
    main()

61
scripts/generate_ci_matrix.py Executable file
View file

@ -0,0 +1,61 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Generate CI test matrix from ci_matrix.json with schedule/input overrides.
This script is used by .github/workflows/integration-tests.yml to generate
the test matrix dynamically based on the CI_MATRIX definition.
"""
import json
from pathlib import Path
# The matrix definition lives in the repository, one directory above this script's folder.
CI_MATRIX_FILE = Path(__file__).parent.parent / "tests/integration/ci_matrix.json"
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale-dependent default encoding.
with open(CI_MATRIX_FILE, encoding="utf-8") as f:
    matrix_config = json.load(f)

# "default" is mandatory (a missing key raises KeyError); "schedules" is optional.
DEFAULT_MATRIX = matrix_config["default"]
SCHEDULE_MATRICES: dict[str, list[dict[str, str]]] = matrix_config.get("schedules", {})
def generate_matrix(schedule="", test_setup=""):
    """
    Build the GitHub Actions matrix JSON for a scheduled, manual, or default run.

    Selection order:
      1. a non-empty ``schedule`` that is a key of SCHEDULE_MATRICES
      2. ``test_setup == "ollama-vision"`` (single vision entry)
      3. otherwise the default matrix from the JSON file

    Args:
        schedule: GitHub cron schedule string (e.g., "1 0 * * 0" for weekly)
        test_setup: Manual test setup input (e.g., "ollama-vision")

    Returns:
        Matrix configuration as JSON string, in the ``{"include": [...]}`` shape
    """

    def _as_include(entries):
        # GitHub Actions expects {"include": [...]} format
        return json.dumps({"include": entries})

    # Scheduled run with a matching matrix
    if schedule and schedule in SCHEDULE_MATRICES:
        return _as_include(SCHEDULE_MATRICES[schedule])

    # Manual input for specific setup
    if test_setup == "ollama-vision":
        return _as_include([{"suite": "vision", "setup": "ollama-vision"}])

    # Default: use JSON-defined matrix
    return _as_include(DEFAULT_MATRIX)
if __name__ == "__main__":
    # argparse is only needed when run as a script, so it is imported here.
    import argparse

    cli = argparse.ArgumentParser(description="Generate CI test matrix")
    cli.add_argument("--schedule", default="", help="GitHub schedule cron string")
    cli.add_argument("--test-setup", default="", help="Manual test setup input")
    opts = cli.parse_args()

    # Emit the matrix JSON on stdout for the workflow to consume.
    matrix_json = generate_matrix(opts.schedule, opts.test_setup)
    print(matrix_json)

View file

@ -162,6 +162,17 @@ if [[ "$COLLECT_ONLY" == false ]]; then
export LLAMA_STACK_TEST_STACK_CONFIG_TYPE="library_client" export LLAMA_STACK_TEST_STACK_CONFIG_TYPE="library_client"
echo "Setting stack config type: library_client" echo "Setting stack config type: library_client"
fi fi
# Set MCP host for in-process MCP server tests
# - For library client and server mode: localhost (both on same host)
# - For docker mode: host.docker.internal (container needs to reach host)
if [[ "$STACK_CONFIG" == docker:* ]]; then
export LLAMA_STACK_TEST_MCP_HOST="host.docker.internal"
echo "Setting MCP host: host.docker.internal (docker mode)"
else
export LLAMA_STACK_TEST_MCP_HOST="localhost"
echo "Setting MCP host: localhost (library/server mode)"
fi
fi fi
SETUP_ENV=$(PYTHONPATH=$THIS_DIR/.. python "$THIS_DIR/get_setup_env.py" --suite "$TEST_SUITE" --setup "$TEST_SETUP" --format bash) SETUP_ENV=$(PYTHONPATH=$THIS_DIR/.. python "$THIS_DIR/get_setup_env.py" --suite "$TEST_SUITE" --setup "$TEST_SETUP" --format bash)
@ -227,14 +238,16 @@ if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
echo "=== Starting Llama Stack Server ===" echo "=== Starting Llama Stack Server ==="
export LLAMA_STACK_LOG_WIDTH=120 export LLAMA_STACK_LOG_WIDTH=120
# Configure telemetry collector for server mode # Configure telemetry collector for server mode
# Use a fixed port for the OTEL collector so the server can connect to it # Use a fixed port for the OTEL collector so the server can connect to it
COLLECTOR_PORT=4317 COLLECTOR_PORT=4317
export LLAMA_STACK_TEST_COLLECTOR_PORT="${COLLECTOR_PORT}" export LLAMA_STACK_TEST_COLLECTOR_PORT="${COLLECTOR_PORT}"
export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:${COLLECTOR_PORT}" # Disabled: https://github.com/llamastack/llama-stack/issues/4089
export OTEL_EXPORTER_OTLP_PROTOCOL="http/protobuf" #export OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:${COLLECTOR_PORT}"
export OTEL_BSP_SCHEDULE_DELAY="200" export OTEL_EXPORTER_OTLP_PROTOCOL="http/protobuf"
export OTEL_BSP_EXPORT_TIMEOUT="2000" export OTEL_BSP_SCHEDULE_DELAY="200"
export OTEL_BSP_EXPORT_TIMEOUT="2000"
export OTEL_METRIC_EXPORT_INTERVAL="200"
# remove "server:" from STACK_CONFIG # remove "server:" from STACK_CONFIG
stack_config=$(echo "$STACK_CONFIG" | sed 's/^server://') stack_config=$(echo "$STACK_CONFIG" | sed 's/^server://')
@ -336,7 +349,12 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
DOCKER_ENV_VARS="" DOCKER_ENV_VARS=""
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_INFERENCE_MODE=$INFERENCE_MODE" DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_INFERENCE_MODE=$INFERENCE_MODE"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_STACK_CONFIG_TYPE=server" DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_STACK_CONFIG_TYPE=server"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:${COLLECTOR_PORT}" DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e LLAMA_STACK_TEST_MCP_HOST=${LLAMA_STACK_TEST_MCP_HOST:-host.docker.internal}"
# Disabled: https://github.com/llamastack/llama-stack/issues/4089
#DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:${COLLECTOR_PORT}"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_METRIC_EXPORT_INTERVAL=200"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_BSP_SCHEDULE_DELAY=200"
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OTEL_BSP_EXPORT_TIMEOUT=2000"
# Pass through API keys if they exist # Pass through API keys if they exist
[ -n "${TOGETHER_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e TOGETHER_API_KEY=$TOGETHER_API_KEY" [ -n "${TOGETHER_API_KEY:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e TOGETHER_API_KEY=$TOGETHER_API_KEY"
@ -349,6 +367,10 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
[ -n "${OLLAMA_URL:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OLLAMA_URL=$OLLAMA_URL" [ -n "${OLLAMA_URL:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e OLLAMA_URL=$OLLAMA_URL"
[ -n "${SAFETY_MODEL:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e SAFETY_MODEL=$SAFETY_MODEL" [ -n "${SAFETY_MODEL:-}" ] && DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e SAFETY_MODEL=$SAFETY_MODEL"
if [[ "$TEST_SETUP" == "vllm" ]]; then
DOCKER_ENV_VARS="$DOCKER_ENV_VARS -e VLLM_URL=http://localhost:8000/v1"
fi
# Determine the actual image name (may have localhost/ prefix) # Determine the actual image name (may have localhost/ prefix)
IMAGE_NAME=$(docker images --format "{{.Repository}}:{{.Tag}}" | grep "distribution-$DISTRO:dev$" | head -1) IMAGE_NAME=$(docker images --format "{{.Repository}}:{{.Tag}}" | grep "distribution-$DISTRO:dev$" | head -1)
if [[ -z "$IMAGE_NAME" ]]; then if [[ -z "$IMAGE_NAME" ]]; then
@ -361,8 +383,11 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
# Use regular port mapping instead # Use regular port mapping instead
NETWORK_MODE="" NETWORK_MODE=""
PORT_MAPPINGS="" PORT_MAPPINGS=""
ADD_HOST_FLAG=""
if [[ "$(uname)" != "Darwin" ]] && [[ "$(uname)" != *"MINGW"* ]]; then if [[ "$(uname)" != "Darwin" ]] && [[ "$(uname)" != *"MINGW"* ]]; then
NETWORK_MODE="--network host" NETWORK_MODE="--network host"
# On Linux with host network, also add host.docker.internal mapping for consistency
ADD_HOST_FLAG="--add-host=host.docker.internal:host-gateway"
else else
# On non-Linux (macOS, Windows), need explicit port mappings for both app and telemetry # On non-Linux (macOS, Windows), need explicit port mappings for both app and telemetry
PORT_MAPPINGS="-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT -p $COLLECTOR_PORT:$COLLECTOR_PORT" PORT_MAPPINGS="-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT -p $COLLECTOR_PORT:$COLLECTOR_PORT"
@ -371,6 +396,7 @@ if [[ "$STACK_CONFIG" == *"docker:"* && "$COLLECT_ONLY" == false ]]; then
docker run -d $NETWORK_MODE --name "$container_name" \ docker run -d $NETWORK_MODE --name "$container_name" \
$PORT_MAPPINGS \ $PORT_MAPPINGS \
$ADD_HOST_FLAG \
$DOCKER_ENV_VARS \ $DOCKER_ENV_VARS \
"$IMAGE_NAME" \ "$IMAGE_NAME" \
--port $LLAMA_STACK_PORT --port $LLAMA_STACK_PORT
@ -401,11 +427,6 @@ fi
echo "=== Running Integration Tests ===" echo "=== Running Integration Tests ==="
EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag" EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag"
# Additional exclusions for vllm setup
if [[ "$TEST_SETUP" == "vllm" ]]; then
EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls"
fi
PYTEST_PATTERN="not( $EXCLUDE_TESTS )" PYTEST_PATTERN="not( $EXCLUDE_TESTS )"
if [[ -n "$TEST_PATTERN" ]]; then if [[ -n "$TEST_PATTERN" ]]; then
PYTEST_PATTERN="${PYTEST_PATTERN} and $TEST_PATTERN" PYTEST_PATTERN="${PYTEST_PATTERN} and $TEST_PATTERN"

View file

@ -6,7 +6,7 @@
# the root directory of this source tree. # the root directory of this source tree.
set -e set -e
cd src/llama_stack/ui cd src/llama_stack_ui
if [ ! -d node_modules ] || [ ! -x node_modules/.bin/prettier ] || [ ! -x node_modules/.bin/eslint ]; then if [ ! -d node_modules ] || [ ! -x node_modules/.bin/prettier ] || [ ! -x node_modules/.bin/eslint ]; then
echo "UI dependencies not installed, skipping prettier/linter check" echo "UI dependencies not installed, skipping prettier/linter check"

View file

@ -3,8 +3,3 @@
# #
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from llama_stack.core.library_client import ( # noqa: F401
AsyncLlamaStackAsLibraryClient,
LlamaStackAsLibraryClient,
)

View file

@ -5,30 +5,13 @@
# the root directory of this source tree. # the root directory of this source tree.
from collections.abc import AsyncIterator from collections.abc import AsyncIterator
from datetime import datetime from typing import Annotated, Protocol, runtime_checkable
from enum import StrEnum
from typing import Annotated, Any, Literal, Protocol, runtime_checkable
from pydantic import BaseModel, ConfigDict, Field from pydantic import BaseModel
from llama_stack.apis.common.content_types import URL, ContentDelta, InterleavedContent from llama_stack.apis.common.responses import Order
from llama_stack.apis.common.responses import Order, PaginatedResponse from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.apis.inference import ( from llama_stack.schema_utils import ExtraBodyField, json_schema_type, webmethod
CompletionMessage,
ResponseFormat,
SamplingParams,
ToolCall,
ToolChoice,
ToolConfig,
ToolPromptFormat,
ToolResponse,
ToolResponseMessage,
UserMessage,
)
from llama_stack.apis.safety import SafetyViolation
from llama_stack.apis.tools import ToolDef
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
from llama_stack.schema_utils import ExtraBodyField, json_schema_type, register_schema, webmethod
from .openai_responses import ( from .openai_responses import (
ListOpenAIResponseInputItem, ListOpenAIResponseInputItem,
@ -57,658 +40,12 @@ class ResponseGuardrailSpec(BaseModel):
ResponseGuardrail = str | ResponseGuardrailSpec ResponseGuardrail = str | ResponseGuardrailSpec
class Attachment(BaseModel):
"""An attachment to an agent turn.
:param content: The content of the attachment.
:param mime_type: The MIME type of the attachment.
"""
content: InterleavedContent | URL
mime_type: str
class Document(BaseModel):
"""A document to be used by an agent.
:param content: The content of the document.
:param mime_type: The MIME type of the document.
"""
content: InterleavedContent | URL
mime_type: str
class StepCommon(BaseModel):
"""A common step in an agent turn.
:param turn_id: The ID of the turn.
:param step_id: The ID of the step.
:param started_at: The time the step started.
:param completed_at: The time the step completed.
"""
turn_id: str
step_id: str
started_at: datetime | None = None
completed_at: datetime | None = None
class StepType(StrEnum):
"""Type of the step in an agent turn.
:cvar inference: The step is an inference step that calls an LLM.
:cvar tool_execution: The step is a tool execution step that executes a tool call.
:cvar shield_call: The step is a shield call step that checks for safety violations.
:cvar memory_retrieval: The step is a memory retrieval step that retrieves context for vector dbs.
"""
inference = "inference"
tool_execution = "tool_execution"
shield_call = "shield_call"
memory_retrieval = "memory_retrieval"
@json_schema_type
class InferenceStep(StepCommon):
"""An inference step in an agent turn.
:param model_response: The response from the LLM.
"""
model_config = ConfigDict(protected_namespaces=())
step_type: Literal[StepType.inference] = StepType.inference
model_response: CompletionMessage
@json_schema_type
class ToolExecutionStep(StepCommon):
"""A tool execution step in an agent turn.
:param tool_calls: The tool calls to execute.
:param tool_responses: The tool responses from the tool calls.
"""
step_type: Literal[StepType.tool_execution] = StepType.tool_execution
tool_calls: list[ToolCall]
tool_responses: list[ToolResponse]
@json_schema_type
class ShieldCallStep(StepCommon):
"""A shield call step in an agent turn.
:param violation: The violation from the shield call.
"""
step_type: Literal[StepType.shield_call] = StepType.shield_call
violation: SafetyViolation | None
@json_schema_type
class MemoryRetrievalStep(StepCommon):
    """A memory retrieval step in an agent turn.

    :param vector_store_ids: The IDs of the vector databases to retrieve context from.
    :param inserted_context: The context retrieved from the vector databases.
    """

    # Fixed discriminator value for the `Step` union.
    step_type: Literal[StepType.memory_retrieval] = StepType.memory_retrieval
    # TODO: should this be List[str]?
    # NOTE(review): the name and docstring are plural, but the declared type is a single str.
    vector_store_ids: str
    inserted_context: InterleavedContent
# Union of every concrete step model; the `step_type` field is the discriminator
# that selects which member a given payload is validated as.
Step = Annotated[
    InferenceStep | ToolExecutionStep | ShieldCallStep | MemoryRetrievalStep,
    Field(discriminator="step_type"),
]
@json_schema_type
class Turn(BaseModel):
    """A single turn in an interaction with an Agentic System.

    :param turn_id: Unique identifier for the turn within a session
    :param session_id: Unique identifier for the conversation session
    :param input_messages: List of messages that initiated this turn
    :param steps: Ordered list of processing steps executed during this turn
    :param output_message: The model's generated response containing content and metadata
    :param output_attachments: (Optional) Files or media attached to the agent's response
    :param started_at: Timestamp when the turn began
    :param completed_at: (Optional) Timestamp when the turn finished, if completed
    """

    turn_id: str
    session_id: str
    input_messages: list[UserMessage | ToolResponseMessage]
    steps: list[Step]
    output_message: CompletionMessage
    # A fresh empty list is created per instance when the field is omitted.
    output_attachments: list[Attachment] | None = Field(default_factory=list)

    started_at: datetime
    completed_at: datetime | None = None
@json_schema_type
class Session(BaseModel):
    """A single session of an interaction with an Agentic System.

    :param session_id: Unique identifier for the conversation session
    :param session_name: Human-readable name for the session
    :param turns: List of all turns that have occurred in this session
    :param started_at: Timestamp when the session was created
    """

    session_id: str
    session_name: str
    turns: list[Turn]
    # All four fields are required; none declares a default.
    started_at: datetime
class AgentToolGroupWithArgs(BaseModel):
    """A tool group referenced by name together with a dictionary of arguments.

    :param name: Name of the tool group
    :param args: Arguments associated with the tool group
    """

    name: str
    # NOTE(review): how these arguments are consumed is not visible here -- confirm with the caller.
    args: dict[str, Any]
# A tool group is given either as a bare string or as an AgentToolGroupWithArgs.
AgentToolGroup = str | AgentToolGroupWithArgs
# The union is registered under the schema name "AgentTool", not "AgentToolGroup".
register_schema(AgentToolGroup, name="AgentTool")
class AgentConfigCommon(BaseModel):
    """Settings shared by the agent-level and per-turn configuration models.

    ``tool_choice`` and ``tool_prompt_format`` are deprecated in favour of
    ``tool_config``; :meth:`model_post_init` reconciles the two forms.
    """

    sampling_params: SamplingParams | None = Field(default_factory=SamplingParams)

    input_shields: list[str] | None = Field(default_factory=list)
    output_shields: list[str] | None = Field(default_factory=list)
    toolgroups: list[AgentToolGroup] | None = Field(default_factory=list)
    client_tools: list[ToolDef] | None = Field(default_factory=list)
    tool_choice: ToolChoice | None = Field(default=None, deprecated="use tool_config instead")
    tool_prompt_format: ToolPromptFormat | None = Field(default=None, deprecated="use tool_config instead")
    tool_config: ToolConfig | None = Field(default=None)

    max_infer_iters: int | None = 10

    def model_post_init(self, __context):
        """Reconcile the deprecated tool fields with ``tool_config``.

        Without a ``tool_config``, one is built from whichever deprecated fields
        are set. With a ``tool_config``, a deprecated field that is set and
        disagrees with it raises ``ValueError``.
        """
        if not self.tool_config:
            # Carry the deprecated values over into a new ToolConfig.
            legacy = {}
            if self.tool_choice:
                legacy["tool_choice"] = self.tool_choice
            if self.tool_prompt_format:
                legacy["tool_prompt_format"] = self.tool_prompt_format
            self.tool_config = ToolConfig(**legacy)
            return

        if self.tool_choice and self.tool_config.tool_choice != self.tool_choice:
            raise ValueError("tool_choice is deprecated. Use tool_choice in tool_config instead.")
        if self.tool_prompt_format and self.tool_config.tool_prompt_format != self.tool_prompt_format:
            raise ValueError("tool_prompt_format is deprecated. Use tool_prompt_format in tool_config instead.")
@json_schema_type
class AgentConfig(AgentConfigCommon):
    """Configuration for an agent.

    :param model: The model identifier to use for the agent
    :param instructions: The system instructions for the agent
    :param name: Optional name for the agent, used in telemetry and identification
    :param enable_session_persistence: Optional flag indicating whether session data has to be persisted
    :param response_format: Optional response format configuration
    """

    # `model` and `instructions` are the only required fields declared on this class.
    model: str
    instructions: str
    name: str | None = None
    enable_session_persistence: bool | None = False
    response_format: ResponseFormat | None = None
@json_schema_type
class Agent(BaseModel):
    """An agent instance with configuration and metadata.

    :param agent_id: Unique identifier for the agent
    :param agent_config: Configuration settings for the agent
    :param created_at: Timestamp when the agent was created
    """

    # All three fields are required; none declares a default.
    agent_id: str
    agent_config: AgentConfig
    created_at: datetime
class AgentConfigOverridablePerTurn(AgentConfigCommon):
    """The common agent settings plus an optional ``instructions`` field.

    :param instructions: (Optional) Instructions value; defaults to None
    """

    instructions: str | None = None
class AgentTurnResponseEventType(StrEnum):
    """Event types used as the ``event_type`` discriminator of agent turn response payloads.

    Each member's value is identical to its name.
    """

    step_start = "step_start"
    step_complete = "step_complete"
    step_progress = "step_progress"

    turn_start = "turn_start"
    turn_complete = "turn_complete"
    turn_awaiting_input = "turn_awaiting_input"
@json_schema_type
class AgentTurnResponseStepStartPayload(BaseModel):
    """Payload for step start events in agent turn responses.

    :param event_type: Type of event being reported
    :param step_type: Type of step being executed
    :param step_id: Unique identifier for the step within a turn
    :param metadata: (Optional) Additional metadata for the step
    """

    # Fixed discriminator value for the payload union.
    event_type: Literal[AgentTurnResponseEventType.step_start] = AgentTurnResponseEventType.step_start
    step_type: StepType
    step_id: str
    # A fresh empty dict is created per instance when the field is omitted.
    metadata: dict[str, Any] | None = Field(default_factory=dict)
@json_schema_type
class AgentTurnResponseStepCompletePayload(BaseModel):
    """Payload for step completion events in agent turn responses.

    :param event_type: Type of event being reported
    :param step_type: Type of step being executed
    :param step_id: Unique identifier for the step within a turn
    :param step_details: Complete details of the executed step
    """

    # Fixed discriminator value for the payload union.
    event_type: Literal[AgentTurnResponseEventType.step_complete] = AgentTurnResponseEventType.step_complete
    step_type: StepType
    step_id: str
    # `Step` is itself a union discriminated on `step_type`.
    step_details: Step
@json_schema_type
class AgentTurnResponseStepProgressPayload(BaseModel):
    """Payload for step progress events in agent turn responses.

    :param event_type: Type of event being reported
    :param step_type: Type of step being executed
    :param step_id: Unique identifier for the step within a turn
    :param delta: Incremental content changes during step execution
    """

    # NOTE(review): no field here starts with `model_`, so clearing the protected
    # namespaces may be a leftover -- confirm before removing.
    model_config = ConfigDict(protected_namespaces=())

    # Fixed discriminator value for the payload union.
    event_type: Literal[AgentTurnResponseEventType.step_progress] = AgentTurnResponseEventType.step_progress
    step_type: StepType
    step_id: str

    delta: ContentDelta
@json_schema_type
class AgentTurnResponseTurnStartPayload(BaseModel):
    """Payload for turn start events in agent turn responses.

    :param event_type: Type of event being reported
    :param turn_id: Unique identifier for the turn within a session
    """

    # Fixed discriminator value for the payload union.
    event_type: Literal[AgentTurnResponseEventType.turn_start] = AgentTurnResponseEventType.turn_start
    turn_id: str
@json_schema_type
class AgentTurnResponseTurnCompletePayload(BaseModel):
    """Payload for turn completion events in agent turn responses.

    :param event_type: Type of event being reported
    :param turn: Complete turn data including all steps and results
    """

    # Fixed discriminator value for the payload union.
    event_type: Literal[AgentTurnResponseEventType.turn_complete] = AgentTurnResponseEventType.turn_complete
    turn: Turn
@json_schema_type
class AgentTurnResponseTurnAwaitingInputPayload(BaseModel):
    """Payload for turn awaiting input events in agent turn responses.

    :param event_type: Type of event being reported
    :param turn: Turn data when waiting for external tool responses
    """

    # Fixed discriminator value for the payload union.
    event_type: Literal[AgentTurnResponseEventType.turn_awaiting_input] = AgentTurnResponseEventType.turn_awaiting_input
    turn: Turn
# Union of all six payload models; the `event_type` field is the discriminator
# that selects which member a given payload is validated as.
AgentTurnResponseEventPayload = Annotated[
    AgentTurnResponseStepStartPayload
    | AgentTurnResponseStepProgressPayload
    | AgentTurnResponseStepCompletePayload
    | AgentTurnResponseTurnStartPayload
    | AgentTurnResponseTurnCompletePayload
    | AgentTurnResponseTurnAwaitingInputPayload,
    Field(discriminator="event_type"),
]
# Register the union under an explicit schema name.
register_schema(AgentTurnResponseEventPayload, name="AgentTurnResponseEventPayload")
@json_schema_type
class AgentTurnResponseEvent(BaseModel):
    """An event in an agent turn response stream.

    :param payload: Event-specific payload containing event data
    """

    # One of the payload models, selected by its `event_type` discriminator.
    payload: AgentTurnResponseEventPayload
@json_schema_type
class AgentCreateResponse(BaseModel):
    """Response returned when creating a new agent.

    :param agent_id: Unique identifier for the created agent
    """

    # The only field of this response; required.
    agent_id: str
@json_schema_type
class AgentSessionCreateResponse(BaseModel):
    """Response returned when creating a new agent session.

    :param session_id: Unique identifier for the created session
    """

    # The only field of this response; required.
    session_id: str
@json_schema_type
class AgentTurnCreateRequest(AgentConfigOverridablePerTurn):
    """Request to create a new turn for an agent.

    :param agent_id: Unique identifier for the agent
    :param session_id: Unique identifier for the conversation session
    :param messages: List of messages to start the turn with
    :param documents: (Optional) List of documents to provide to the agent
    :param toolgroups: (Optional) List of tool groups to make available for this turn
    :param stream: (Optional) Whether to stream the response
    :param tool_config: (Optional) Tool configuration to override agent defaults
    """

    agent_id: str
    session_id: str

    # TODO: figure out how to simplify this and explain why ToolResponseMessage
    # needs to be accepted here (it carries function-call execution results
    # produced outside the system).
    messages: list[UserMessage | ToolResponseMessage]

    documents: list[Document] | None = None
    # Redeclared on this class; a fresh empty list is created when omitted.
    toolgroups: list[AgentToolGroup] | None = Field(default_factory=list)

    stream: bool | None = False
    tool_config: ToolConfig | None = None
@json_schema_type
class AgentTurnResumeRequest(BaseModel):
    """Request to resume an agent turn with tool responses.

    :param agent_id: Unique identifier for the agent
    :param session_id: Unique identifier for the conversation session
    :param turn_id: Unique identifier for the turn within a session
    :param tool_responses: List of tool responses to submit to continue the turn
    :param stream: (Optional) Whether to stream the response
    """

    agent_id: str
    session_id: str
    turn_id: str
    tool_responses: list[ToolResponse]
    # Defaults to False when omitted.
    stream: bool | None = False
@json_schema_type
class AgentTurnResponseStreamChunk(BaseModel):
    """Streamed agent turn completion response.

    :param event: Individual event in the agent turn response stream
    """

    # Each chunk wraps exactly one event.
    event: AgentTurnResponseEvent
@json_schema_type
class AgentStepResponse(BaseModel):
    """Response containing details of a specific agent step.

    :param step: The complete step data and execution details
    """

    # One of the concrete step models, selected by its `step_type` discriminator.
    step: Step
@runtime_checkable @runtime_checkable
class Agents(Protocol): class Agents(Protocol):
"""Agents """Agents
APIs for creating and interacting with agentic systems.""" APIs for creating and interacting with agentic systems."""
# Protocol stub: the body is `...`; route metadata is carried by @webmethod.
@webmethod(
    route="/agents",
    method="POST",
    descriptive_name="create_agent",
    level=LLAMA_STACK_API_V1ALPHA,
)
async def create_agent(
    self,
    agent_config: AgentConfig,
) -> AgentCreateResponse:
    """Create an agent with the given configuration.

    :param agent_config: The configuration for the agent.
    :returns: An AgentCreateResponse with the agent ID.
    """
    ...
# Protocol stub: the body is `...`; route metadata is carried by @webmethod.
@webmethod(
    route="/agents/{agent_id}/session/{session_id}/turn",
    method="POST",
    descriptive_name="create_agent_turn",
    level=LLAMA_STACK_API_V1ALPHA,
)
async def create_agent_turn(
    self,
    agent_id: str,
    session_id: str,
    messages: list[UserMessage | ToolResponseMessage],
    stream: bool | None = False,
    documents: list[Document] | None = None,
    toolgroups: list[AgentToolGroup] | None = None,
    tool_config: ToolConfig | None = None,
) -> Turn | AsyncIterator[AgentTurnResponseStreamChunk]:
    """Create a new turn for an agent.

    :param agent_id: The ID of the agent to create the turn for.
    :param session_id: The ID of the session to create the turn for.
    :param messages: List of messages to start the turn with.
    :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
    :param documents: (Optional) List of documents to create the turn with.
    :param toolgroups: (Optional) List of toolgroups to create the turn with, will be used in addition to the agent's config toolgroups for the request.
    :param tool_config: (Optional) The tool configuration to create the turn with, will be used to override the agent's tool_config.
    :returns: If stream=False, returns a Turn object.
        If stream=True, returns an SSE event stream of AgentTurnResponseStreamChunk.
    """
    ...
# Protocol stub: the body is `...`; route metadata is carried by @webmethod.
@webmethod(
    route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume",
    method="POST",
    descriptive_name="resume_agent_turn",
    level=LLAMA_STACK_API_V1ALPHA,
)
async def resume_agent_turn(
    self,
    agent_id: str,
    session_id: str,
    turn_id: str,
    tool_responses: list[ToolResponse],
    stream: bool | None = False,
) -> Turn | AsyncIterator[AgentTurnResponseStreamChunk]:
    """Resume an agent turn with executed tool call responses.

    When a Turn has the status `awaiting_input` due to pending input from client side tool calls, this endpoint can be used to submit the outputs from the tool calls once they are ready.

    :param agent_id: The ID of the agent to resume.
    :param session_id: The ID of the session to resume.
    :param turn_id: The ID of the turn to resume.
    :param tool_responses: The tool call responses to resume the turn with.
    :param stream: Whether to stream the response. Defaults to False.
    :returns: A Turn object if stream is False, otherwise an AsyncIterator of AgentTurnResponseStreamChunk objects.
    """
    ...
# Protocol stub: the body is `...`; route metadata is carried by @webmethod.
@webmethod(
    route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}",
    method="GET",
    level=LLAMA_STACK_API_V1ALPHA,
)
async def get_agents_turn(
    self,
    agent_id: str,
    session_id: str,
    turn_id: str,
) -> Turn:
    """Retrieve an agent turn by its ID.

    :param agent_id: The ID of the agent to get the turn for.
    :param session_id: The ID of the session to get the turn for.
    :param turn_id: The ID of the turn to get.
    :returns: A Turn.
    """
    ...
# Protocol stub: the body is `...`; route metadata is carried by @webmethod.
@webmethod(
    route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}",
    method="GET",
    level=LLAMA_STACK_API_V1ALPHA,
)
async def get_agents_step(
    self,
    agent_id: str,
    session_id: str,
    turn_id: str,
    step_id: str,
) -> AgentStepResponse:
    """Retrieve an agent step by its ID.

    :param agent_id: The ID of the agent to get the step for.
    :param session_id: The ID of the session to get the step for.
    :param turn_id: The ID of the turn to get the step for.
    :param step_id: The ID of the step to get.
    :returns: An AgentStepResponse.
    """
    ...
# Protocol stub: the body is `...`; route metadata is carried by @webmethod.
@webmethod(
    route="/agents/{agent_id}/session",
    method="POST",
    descriptive_name="create_agent_session",
    level=LLAMA_STACK_API_V1ALPHA,
)
async def create_agent_session(
    self,
    agent_id: str,
    session_name: str,
) -> AgentSessionCreateResponse:
    """Create a new session for an agent.

    :param agent_id: The ID of the agent to create the session for.
    :param session_name: The name of the session to create.
    :returns: An AgentSessionCreateResponse.
    """
    ...
# Protocol stub: the body is `...`; route metadata is carried by @webmethod.
# Note the parameter order: session_id precedes agent_id, unlike the turn/step getters.
@webmethod(
    route="/agents/{agent_id}/session/{session_id}",
    method="GET",
    level=LLAMA_STACK_API_V1ALPHA,
)
async def get_agents_session(
    self,
    session_id: str,
    agent_id: str,
    turn_ids: list[str] | None = None,
) -> Session:
    """Retrieve an agent session by its ID.

    :param session_id: The ID of the session to get.
    :param agent_id: The ID of the agent to get the session for.
    :param turn_ids: (Optional) List of turn IDs to filter the session by.
    :returns: A Session.
    """
    ...
# Protocol stub: the body is `...`; route metadata is carried by @webmethod.
# Note the parameter order: session_id precedes agent_id.
@webmethod(
    route="/agents/{agent_id}/session/{session_id}",
    method="DELETE",
    level=LLAMA_STACK_API_V1ALPHA,
)
async def delete_agents_session(
    self,
    session_id: str,
    agent_id: str,
) -> None:
    """Delete an agent session by its ID and its associated turns.

    :param session_id: The ID of the session to delete.
    :param agent_id: The ID of the agent to delete the session for.
    """
    ...
# Protocol stub: the body is `...`; route metadata is carried by @webmethod.
@webmethod(route="/agents/{agent_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA)
async def delete_agent(
    self,
    agent_id: str,
) -> None:
    """Delete an agent by its ID and its associated sessions and turns.

    :param agent_id: The ID of the agent to delete.
    """
    ...
# Protocol stub: the body is `...`; route metadata is carried by @webmethod.
@webmethod(route="/agents", method="GET", level=LLAMA_STACK_API_V1ALPHA)
async def list_agents(self, start_index: int | None = None, limit: int | None = None) -> PaginatedResponse:
    """List all agents.

    :param start_index: (Optional) The index to start the pagination from.
    :param limit: (Optional) The number of agents to return.
    :returns: A PaginatedResponse.
    """
    ...
# Protocol stub: the body is `...`; route metadata is carried by @webmethod.
@webmethod(route="/agents/{agent_id}", method="GET", level=LLAMA_STACK_API_V1ALPHA)
async def get_agent(self, agent_id: str) -> Agent:
    """Describe an agent by its ID.

    :param agent_id: ID of the agent.
    :returns: The Agent with the given ID.
    """
    ...
# Protocol stub: the body is `...`; route metadata is carried by @webmethod.
# Note the plural "/sessions" here, whereas the per-session routes use "/session".
@webmethod(route="/agents/{agent_id}/sessions", method="GET", level=LLAMA_STACK_API_V1ALPHA)
async def list_agent_sessions(
    self,
    agent_id: str,
    start_index: int | None = None,
    limit: int | None = None,
) -> PaginatedResponse:
    """List all session(s) of a given agent.

    :param agent_id: The ID of the agent to list sessions for.
    :param start_index: (Optional) The index to start the pagination from.
    :param limit: (Optional) The number of sessions to return.
    :returns: A PaginatedResponse.
    """
    ...
# We situate the OpenAI Responses API in the Agents API just like we did things # We situate the OpenAI Responses API in the Agents API just like we did things
# for Inference. The Responses API, in its intent, serves the same purpose as # for Inference. The Responses API, in its intent, serves the same purpose as
# the Agents API above -- it is essentially a lightweight "agentic loop" with # the Agents API above -- it is essentially a lightweight "agentic loop" with
@ -750,6 +87,7 @@ class Agents(Protocol):
"List of guardrails to apply during response generation. Guardrails provide safety and content moderation." "List of guardrails to apply during response generation. Guardrails provide safety and content moderation."
), ),
] = None, ] = None,
max_tool_calls: int | None = None,
) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]: ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
"""Create a model response. """Create a model response.
@ -760,6 +98,7 @@ class Agents(Protocol):
:param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation. :param conversation: (Optional) The ID of a conversation to add the response to. Must begin with 'conv_'. Input and output messages will be automatically added to the conversation.
:param include: (Optional) Additional fields to include in the response. :param include: (Optional) Additional fields to include in the response.
:param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications. :param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications.
:param max_tool_calls: (Optional) Max number of total calls to built-in tools that can be processed in a response.
:returns: An OpenAIResponseObject. :returns: An OpenAIResponseObject.
""" """
... ...

View file

@ -403,7 +403,7 @@ class OpenAIResponseText(BaseModel):
# Must match type Literals of OpenAIResponseInputToolWebSearch below # Must match type Literals of OpenAIResponseInputToolWebSearch below
WebSearchToolTypes = ["web_search", "web_search_preview", "web_search_preview_2025_03_11"] WebSearchToolTypes = ["web_search", "web_search_preview", "web_search_preview_2025_03_11", "web_search_2025_08_26"]
@json_schema_type @json_schema_type
@ -415,9 +415,12 @@ class OpenAIResponseInputToolWebSearch(BaseModel):
""" """
# Must match values of WebSearchToolTypes above # Must match values of WebSearchToolTypes above
type: Literal["web_search"] | Literal["web_search_preview"] | Literal["web_search_preview_2025_03_11"] = ( type: (
"web_search" Literal["web_search"]
) | Literal["web_search_preview"]
| Literal["web_search_preview_2025_03_11"]
| Literal["web_search_2025_08_26"]
) = "web_search"
# TODO: actually use search_context_size somewhere... # TODO: actually use search_context_size somewhere...
search_context_size: str | None = Field(default="medium", pattern="^low|medium|high$") search_context_size: str | None = Field(default="medium", pattern="^low|medium|high$")
# TODO: add user_location # TODO: add user_location
@ -591,6 +594,7 @@ class OpenAIResponseObject(BaseModel):
:param truncation: (Optional) Truncation strategy applied to the response :param truncation: (Optional) Truncation strategy applied to the response
:param usage: (Optional) Token usage information for the response :param usage: (Optional) Token usage information for the response
:param instructions: (Optional) System message inserted into the model's context :param instructions: (Optional) System message inserted into the model's context
:param max_tool_calls: (Optional) Max number of total calls to built-in tools that can be processed in a response
""" """
created_at: int created_at: int
@ -612,6 +616,7 @@ class OpenAIResponseObject(BaseModel):
truncation: str | None = None truncation: str | None = None
usage: OpenAIResponseUsage | None = None usage: OpenAIResponseUsage | None = None
instructions: str | None = None instructions: str | None = None
max_tool_calls: int | None = None
@json_schema_type @json_schema_type

View file

@ -74,7 +74,7 @@ class Benchmarks(Protocol):
""" """
... ...
@webmethod(route="/eval/benchmarks", method="POST", level=LLAMA_STACK_API_V1ALPHA) @webmethod(route="/eval/benchmarks", method="POST", level=LLAMA_STACK_API_V1ALPHA, deprecated=True)
async def register_benchmark( async def register_benchmark(
self, self,
benchmark_id: str, benchmark_id: str,
@ -95,7 +95,7 @@ class Benchmarks(Protocol):
""" """
... ...
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA) @webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE", level=LLAMA_STACK_API_V1ALPHA, deprecated=True)
async def unregister_benchmark(self, benchmark_id: str) -> None: async def unregister_benchmark(self, benchmark_id: str) -> None:
"""Unregister a benchmark. """Unregister a benchmark.

View file

@ -56,14 +56,6 @@ class ToolGroupNotFoundError(ResourceNotFoundError):
super().__init__(toolgroup_name, "Tool Group", "client.toolgroups.list()") super().__init__(toolgroup_name, "Tool Group", "client.toolgroups.list()")
class SessionNotFoundError(ValueError):
    """Raised when Llama Stack cannot find a referenced session or access is denied."""

    def __init__(self, session_name: str) -> None:
        """Build the error message from the name of the missing session."""
        super().__init__(f"Session '{session_name}' not found or access denied.")
class ModelTypeError(TypeError): class ModelTypeError(TypeError):
"""raised when a model is present but not the correct type""" """raised when a model is present but not the correct type"""

View file

@ -34,3 +34,44 @@ class PaginatedResponse(BaseModel):
data: list[dict[str, Any]] data: list[dict[str, Any]]
has_more: bool has_more: bool
url: str | None = None url: str | None = None
# This is a short-term solution to allow the inference API to return metrics.
# The ideal way to do this is to have a way for all response types to include metrics
# and for all metric events logged to the telemetry API to be included with the response.
# To do this, we will need to augment all response types with a metrics field.
# We have hit a blocker from the Stainless SDK that prevents us from doing this.
# The blocker is that if we were to augment the response types that have a data field
# in them like so:
# class ListModelsResponse(BaseModel):
#     metrics: Optional[List[MetricEvent]] = None
#     data: List[Models]
#     ...
# the client SDK would need to access the data through a .data field, which is not
# ergonomic. The Stainless SDK does support unwrapping the response type, but it
# requires the response type to have only a single field.
# We will need a way in the client SDK to signal that the metrics are needed,
# and if they are needed, the client SDK has to return the full response type
# without unwrapping it.
@json_schema_type
class MetricInResponse(BaseModel):
    """A metric value included in API responses.

    :param metric: The name of the metric
    :param value: The numeric value of the metric
    :param unit: (Optional) The unit of measurement for the metric value
    """

    metric: str
    # Accepts either an integer or a float.
    value: int | float
    unit: str | None = None
class MetricResponseMixin(BaseModel):
    """Mixin class for API responses that can include metrics.

    :param metrics: (Optional) List of metrics associated with the API response
    """

    # Defaults to None, so responses that carry no metrics need not set it.
    metrics: list[MetricInResponse] | None = None

View file

@ -0,0 +1,22 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
def telemetry_traceable(cls):
    """
    Flag a protocol class so it can be traced automatically when telemetry is enabled.

    The decorator only records metadata: it sets ``__marked_for_tracing__`` to
    ``True`` on the class and returns the same class object. It has no dependency
    on core; per the module's design, core routers apply the actual tracing at
    runtime if telemetry is enabled.

    Usage:
        @runtime_checkable
        @telemetry_traceable
        class MyProtocol(Protocol):
            ...
    """
    setattr(cls, "__marked_for_tracing__", True)
    return cls

View file

@ -103,17 +103,6 @@ class CompletionInputType(BaseModel):
type: Literal["completion_input"] = "completion_input" type: Literal["completion_input"] = "completion_input"
@json_schema_type
class AgentTurnInputType(BaseModel):
    """Parameter type for agent turn input.

    :param type: Discriminator type. Always "agent_turn_input"
    """

    # expects List[Message] for messages (may also include attachments?)
    # The literal below is the discriminator value used by the ParamType union.
    type: Literal["agent_turn_input"] = "agent_turn_input"
@json_schema_type @json_schema_type
class DialogType(BaseModel): class DialogType(BaseModel):
"""Parameter type for dialog data with semantic output labels. """Parameter type for dialog data with semantic output labels.
@ -135,8 +124,7 @@ ParamType = Annotated[
| JsonType | JsonType
| UnionType | UnionType
| ChatCompletionInputType | ChatCompletionInputType
| CompletionInputType | CompletionInputType,
| AgentTurnInputType,
Field(discriminator="type"), Field(discriminator="type"),
] ]
register_schema(ParamType, name="ParamType") register_schema(ParamType, name="ParamType")

View file

@ -6,26 +6,22 @@
from .conversations import ( from .conversations import (
Conversation, Conversation,
ConversationCreateRequest,
ConversationDeletedResource, ConversationDeletedResource,
ConversationItem, ConversationItem,
ConversationItemCreateRequest, ConversationItemCreateRequest,
ConversationItemDeletedResource, ConversationItemDeletedResource,
ConversationItemList, ConversationItemList,
Conversations, Conversations,
ConversationUpdateRequest,
Metadata, Metadata,
) )
__all__ = [ __all__ = [
"Conversation", "Conversation",
"ConversationCreateRequest",
"ConversationDeletedResource", "ConversationDeletedResource",
"ConversationItem", "ConversationItem",
"ConversationItemCreateRequest", "ConversationItemCreateRequest",
"ConversationItemDeletedResource", "ConversationItemDeletedResource",
"ConversationItemList", "ConversationItemList",
"Conversations", "Conversations",
"ConversationUpdateRequest",
"Metadata", "Metadata",
] ]

View file

@ -20,8 +20,8 @@ from llama_stack.apis.agents.openai_responses import (
OpenAIResponseOutputMessageMCPListTools, OpenAIResponseOutputMessageMCPListTools,
OpenAIResponseOutputMessageWebSearchToolCall, OpenAIResponseOutputMessageWebSearchToolCall,
) )
from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.version import LLAMA_STACK_API_V1 from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
Metadata = dict[str, str] Metadata = dict[str, str]
@ -102,32 +102,6 @@ register_schema(ConversationItem, name="ConversationItem")
# ] # ]
@json_schema_type
class ConversationCreateRequest(BaseModel):
    """Request body for creating a conversation."""

    # Optional; defaults to an empty list, and `max_length` caps it at 20 entries.
    items: list[ConversationItem] | None = Field(
        default=[],
        description="Initial items to include in the conversation context. You may add up to 20 items at a time.",
        max_length=20,
    )
    # Optional; defaults to an empty dict, and `max_length` caps it at 16 entries.
    metadata: Metadata | None = Field(
        default={},
        description="Set of 16 key-value pairs that can be attached to an object. Useful for storing additional information",
        max_length=16,
    )
@json_schema_type
class ConversationUpdateRequest(BaseModel):
    """Request body for updating a conversation."""

    # `...` marks the field as required.
    # NOTE(review): the description states limits on pair count and key/value
    # lengths, but no such constraint is declared on this field -- confirm where
    # they are enforced.
    metadata: Metadata = Field(
        ...,
        description="Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard. Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters.",
    )
@json_schema_type @json_schema_type
class ConversationDeletedResource(BaseModel): class ConversationDeletedResource(BaseModel):
"""Response for deleted conversation.""" """Response for deleted conversation."""
@ -183,7 +157,7 @@ class ConversationItemDeletedResource(BaseModel):
@runtime_checkable @runtime_checkable
@trace_protocol @telemetry_traceable
class Conversations(Protocol): class Conversations(Protocol):
"""Conversations """Conversations

View file

@ -146,7 +146,7 @@ class ListDatasetsResponse(BaseModel):
class Datasets(Protocol): class Datasets(Protocol):
@webmethod(route="/datasets", method="POST", level=LLAMA_STACK_API_V1BETA) @webmethod(route="/datasets", method="POST", level=LLAMA_STACK_API_V1BETA, deprecated=True)
async def register_dataset( async def register_dataset(
self, self,
purpose: DatasetPurpose, purpose: DatasetPurpose,
@ -235,7 +235,7 @@ class Datasets(Protocol):
""" """
... ...
@webmethod(route="/datasets/{dataset_id:path}", method="DELETE", level=LLAMA_STACK_API_V1BETA) @webmethod(route="/datasets/{dataset_id:path}", method="DELETE", level=LLAMA_STACK_API_V1BETA, deprecated=True)
async def unregister_dataset( async def unregister_dataset(
self, self,
dataset_id: str, dataset_id: str,

View file

@ -4,17 +4,16 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from typing import Annotated, Any, Literal, Protocol from typing import Any, Literal, Protocol
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from llama_stack.apis.agents import AgentConfig
from llama_stack.apis.common.job_types import Job from llama_stack.apis.common.job_types import Job
from llama_stack.apis.inference import SamplingParams, SystemMessage from llama_stack.apis.inference import SamplingParams, SystemMessage
from llama_stack.apis.scoring import ScoringResult from llama_stack.apis.scoring import ScoringResult
from llama_stack.apis.scoring_functions import ScoringFnParams from llama_stack.apis.scoring_functions import ScoringFnParams
from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA from llama_stack.apis.version import LLAMA_STACK_API_V1ALPHA
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod from llama_stack.schema_utils import json_schema_type, webmethod
@json_schema_type @json_schema_type
@ -32,19 +31,7 @@ class ModelCandidate(BaseModel):
system_message: SystemMessage | None = None system_message: SystemMessage | None = None
@json_schema_type EvalCandidate = ModelCandidate
class AgentCandidate(BaseModel):
    """An agent candidate for evaluation.

    :param config: The configuration for the agent candidate.
    """

    # Discriminator value for the EvalCandidate union.
    type: Literal["agent"] = "agent"
    config: AgentConfig
EvalCandidate = Annotated[ModelCandidate | AgentCandidate, Field(discriminator="type")]
register_schema(EvalCandidate, name="EvalCandidate")
@json_schema_type @json_schema_type

View file

@ -11,8 +11,8 @@ from fastapi import File, Form, Response, UploadFile
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from llama_stack.apis.common.responses import Order from llama_stack.apis.common.responses import Order
from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.version import LLAMA_STACK_API_V1 from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod from llama_stack.schema_utils import json_schema_type, webmethod
@ -102,7 +102,7 @@ class OpenAIFileDeleteResponse(BaseModel):
@runtime_checkable @runtime_checkable
@trace_protocol @telemetry_traceable
class Files(Protocol): class Files(Protocol):
"""Files """Files

View file

@ -1,43 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from termcolor import cprint
from llama_stack.apis.inference import (
ChatCompletionResponseEventType,
ChatCompletionResponseStreamChunk,
)
class LogEvent:
    """A printable log fragment: the text plus the colour and line terminator to print it with."""

    def __init__(
        self,
        content: str = "",
        end: str = "\n",
        color="white",
    ):
        # An explicit ``end=None`` falls back to a newline instead of being stored as None.
        self.end = end if end is not None else "\n"
        self.color = color
        self.content = content

    def print(self, flush=True):
        """Emit the fragment through ``cprint`` using the stored colour and terminator.

        :param flush: Forwarded to ``cprint`` as its ``flush`` keyword.
        """
        text = f"{self.content}"
        cprint(text, color=self.color, end=self.end, flush=flush)
class EventLogger:
    """Turns a stream of chat-completion results into ``LogEvent`` objects for display."""

    async def log(self, event_generator):
        """Asynchronously yield ``LogEvent`` objects for each item of ``event_generator``.

        Streaming chunks are mapped by event type: ``start`` produces the assistant
        prefix, ``progress`` produces the delta, ``complete`` produces an empty event
        (a bare newline when printed). Any other event type yields nothing.

        Items that are not ``ChatCompletionResponseStreamChunk`` instances are treated
        as whole responses: the prefix is emitted, followed by
        ``completion_message.content``.

        :param event_generator: Async iterable of stream chunks or complete responses.
        """
        async for chunk in event_generator:
            if not isinstance(chunk, ChatCompletionResponseStreamChunk):
                # Non-streaming path: one prefix, then the full message content.
                yield LogEvent("Assistant> ", color="cyan", end="")
                yield LogEvent(chunk.completion_message.content, color="yellow")
                continue

            event = chunk.event
            event_type = event.event_type
            if event_type == ChatCompletionResponseEventType.start:
                yield LogEvent("Assistant> ", color="cyan", end="")
            elif event_type == ChatCompletionResponseEventType.progress:
                yield LogEvent(event.delta, color="yellow", end="")
            elif event_type == ChatCompletionResponseEventType.complete:
                yield LogEvent("")

View file

@ -5,7 +5,7 @@
# the root directory of this source tree. # the root directory of this source tree.
from collections.abc import AsyncIterator from collections.abc import AsyncIterator
from enum import Enum from enum import Enum, StrEnum
from typing import ( from typing import (
Annotated, Annotated,
Any, Any,
@ -15,29 +15,18 @@ from typing import (
) )
from fastapi import Body from fastapi import Body
from pydantic import BaseModel, Field, field_validator from pydantic import BaseModel, Field
from typing_extensions import TypedDict from typing_extensions import TypedDict
from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent from llama_stack.apis.common.content_types import InterleavedContent
from llama_stack.apis.common.responses import Order from llama_stack.apis.common.responses import (
Order,
)
from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.models import Model from llama_stack.apis.models import Model
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
from llama_stack.core.telemetry.telemetry import MetricResponseMixin
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.models.llama.datatypes import (
BuiltinTool,
StopReason,
ToolCall,
ToolDefinition,
ToolPromptFormat,
)
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
register_schema(ToolCall)
register_schema(ToolDefinition)
from enum import StrEnum
@json_schema_type @json_schema_type
class GreedySamplingStrategy(BaseModel): class GreedySamplingStrategy(BaseModel):
@ -202,58 +191,6 @@ class ToolResponseMessage(BaseModel):
content: InterleavedContent content: InterleavedContent
@json_schema_type
class CompletionMessage(BaseModel):
    """A message containing the model's (assistant) response in a chat conversation.

    :param role: Must be "assistant" to identify this as the model's response
    :param content: The content of the model's response
    :param stop_reason: Reason why the model stopped generating. Options are:
        - `StopReason.end_of_turn`: The model finished generating the entire response.
        - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response.
        - `StopReason.out_of_tokens`: The model ran out of token budget.
    :param tool_calls: List of tool calls. Each tool call is a ToolCall object.
    """

    # Literal tag with a default, so the field is always "assistant".
    role: Literal["assistant"] = "assistant"
    content: InterleavedContent
    stop_reason: StopReason
    # The factory builds a fresh empty list per instance; an explicit None is also accepted.
    tool_calls: list[ToolCall] | None = Field(default_factory=lambda: [])
# Union of every chat message variant. The `discriminator="role"` setting makes
# pydantic select the concrete model from each payload's `role` value.
Message = Annotated[
    UserMessage | SystemMessage | ToolResponseMessage | CompletionMessage,
    Field(discriminator="role"),
]
# Registers the alias under the explicit name "Message"
# (presumably so generated schemas reference it by name -- confirm in schema_utils).
register_schema(Message, name="Message")
@json_schema_type
class ToolResponse(BaseModel):
    """Response from a tool invocation.

    :param call_id: Unique identifier for the tool call this response is for
    :param tool_name: Name of the tool that was invoked
    :param content: The response content from the tool
    :param metadata: (Optional) Additional metadata about the tool response
    """

    call_id: str
    tool_name: BuiltinTool | str
    content: InterleavedContent
    metadata: dict[str, Any] | None = None

    @field_validator("tool_name", mode="before")
    @classmethod
    def validate_field(cls, v):
        """Coerce a string ``tool_name`` to ``BuiltinTool`` when it matches one.

        Runs before standard validation. Non-string values pass through untouched,
        and strings that ``BuiltinTool`` rejects with ``ValueError`` are kept as
        plain strings.
        """
        if not isinstance(v, str):
            return v
        try:
            return BuiltinTool(v)
        except ValueError:
            # Not a built-in tool name: keep the caller's string as-is.
            return v
class ToolChoice(Enum): class ToolChoice(Enum):
"""Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model. """Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model.
@ -290,22 +227,6 @@ class ChatCompletionResponseEventType(Enum):
progress = "progress" progress = "progress"
@json_schema_type
class ChatCompletionResponseEvent(BaseModel):
    """An event during chat completion generation.

    :param event_type: Type of the event
    :param delta: Content generated since last event. This can be one or more tokens, or a tool call.
    :param logprobs: Optional log probabilities for generated tokens
    :param stop_reason: Optional reason why generation stopped, if complete
    """

    # Required on every event.
    event_type: ChatCompletionResponseEventType
    delta: ContentDelta
    # Optional; both default to None when omitted.
    logprobs: list[TokenLogProbs] | None = None
    stop_reason: StopReason | None = None
class ResponseFormatType(StrEnum): class ResponseFormatType(StrEnum):
"""Types of formats for structured (guided) decoding. """Types of formats for structured (guided) decoding.
@ -358,34 +279,6 @@ class CompletionRequest(BaseModel):
logprobs: LogProbConfig | None = None logprobs: LogProbConfig | None = None
@json_schema_type
class CompletionResponse(MetricResponseMixin):
    """Response from a completion request.

    :param content: The generated completion text
    :param stop_reason: Reason why generation stopped
    :param logprobs: Optional log probabilities for generated tokens
    """

    # NOTE(review): MetricResponseMixin is defined elsewhere; any fields it
    # contributes are not visible here -- confirm in core.telemetry.
    content: str
    stop_reason: StopReason
    # Defaults to None when log probabilities are not supplied.
    logprobs: list[TokenLogProbs] | None = None
@json_schema_type
class CompletionResponseStreamChunk(MetricResponseMixin):
    """A chunk of a streamed completion response.

    :param delta: New content generated since last chunk. This can be one or more tokens.
    :param stop_reason: Optional reason why generation stopped, if complete
    :param logprobs: Optional log probabilities for generated tokens
    """

    # The only required field.
    delta: str
    # Optional; both default to None when omitted.
    stop_reason: StopReason | None = None
    logprobs: list[TokenLogProbs] | None = None
class SystemMessageBehavior(Enum): class SystemMessageBehavior(Enum):
"""Config for how to override the default system prompt. """Config for how to override the default system prompt.
@ -399,70 +292,6 @@ class SystemMessageBehavior(Enum):
replace = "replace" replace = "replace"
@json_schema_type
class ToolConfig(BaseModel):
    """Configuration for tool use.

    :param tool_choice: (Optional) Whether tool use is automatic, required, or none. Can also specify a tool name to use a specific tool. Defaults to ToolChoice.auto.
    :param tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model.
        - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
        - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag.
        - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls.
    :param system_message_behavior: (Optional) Config for how to override the default system prompt.
        - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt.
        - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string
            '{{function_definitions}}' to indicate where the function definitions should be inserted.
    """

    tool_choice: ToolChoice | str | None = Field(default=ToolChoice.auto)
    tool_prompt_format: ToolPromptFormat | None = Field(default=None)
    system_message_behavior: SystemMessageBehavior | None = Field(default=SystemMessageBehavior.append)

    def model_post_init(self, __context: Any) -> None:
        """Replace a string ``tool_choice`` with the ``ToolChoice`` member of that name, if any.

        The lookup is by member *name* (``ToolChoice[...]``). A string that matches
        no member is left unchanged.
        """
        choice = self.tool_choice
        if not isinstance(choice, str):
            return
        try:
            self.tool_choice = ToolChoice[choice]
        except KeyError:
            # Not a ToolChoice member name: keep the raw string.
            pass
# This is an internally used class
@json_schema_type
class ChatCompletionRequest(BaseModel):
    """Bundle of the parameters for one chat completion call.

    Only ``model`` and ``messages`` are required; every other field has a default.
    """

    model: str
    messages: list[Message]
    # The factories below build a fresh default object per request instance.
    sampling_params: SamplingParams | None = Field(default_factory=SamplingParams)
    tools: list[ToolDefinition] | None = Field(default_factory=lambda: [])
    tool_config: ToolConfig | None = Field(default_factory=ToolConfig)
    response_format: ResponseFormat | None = None
    # Defaults to False.
    stream: bool | None = False
    logprobs: LogProbConfig | None = None
@json_schema_type
class ChatCompletionResponseStreamChunk(MetricResponseMixin):
    """A chunk of a streamed chat completion response.

    :param event: The event containing the new content
    """

    # Required; the chunk declares no other fields of its own.
    event: ChatCompletionResponseEvent
@json_schema_type
class ChatCompletionResponse(MetricResponseMixin):
    """Response from a chat completion request.

    :param completion_message: The complete response message
    :param logprobs: Optional log probabilities for generated tokens
    """

    completion_message: CompletionMessage
    # Defaults to None when log probabilities are not supplied.
    logprobs: list[TokenLogProbs] | None = None
@json_schema_type @json_schema_type
class EmbeddingsResponse(BaseModel): class EmbeddingsResponse(BaseModel):
"""Response containing generated embeddings. """Response containing generated embeddings.
@ -1160,7 +989,7 @@ class OpenAIEmbeddingsRequestWithExtraBody(BaseModel, extra="allow"):
@runtime_checkable @runtime_checkable
@trace_protocol @telemetry_traceable
class InferenceProvider(Protocol): class InferenceProvider(Protocol):
""" """
This protocol defines the interface that should be implemented by all inference providers. This protocol defines the interface that should be implemented by all inference providers.

View file

@ -76,7 +76,7 @@ class Inspect(Protocol):
List all available API routes with their methods and implementing providers. List all available API routes with their methods and implementing providers.
:param api_filter: Optional filter to control which routes are returned. Can be an API level ('v1', 'v1alpha', 'v1beta') to show non-deprecated routes at that level, or 'deprecated' to show deprecated routes across all levels. If not specified, returns only non-deprecated v1 routes. :param api_filter: Optional filter to control which routes are returned. Can be an API level ('v1', 'v1alpha', 'v1beta') to show non-deprecated routes at that level, or 'deprecated' to show deprecated routes across all levels. If not specified, returns all non-deprecated routes.
:returns: Response containing information about all available routes. :returns: Response containing information about all available routes.
""" """
... ...

View file

@ -9,9 +9,9 @@ from typing import Any, Literal, Protocol, runtime_checkable
from pydantic import BaseModel, ConfigDict, Field, field_validator from pydantic import BaseModel, ConfigDict, Field, field_validator
from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.resource import Resource, ResourceType from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1 from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod from llama_stack.schema_utils import json_schema_type, webmethod
@ -105,7 +105,7 @@ class OpenAIListModelsResponse(BaseModel):
@runtime_checkable @runtime_checkable
@trace_protocol @telemetry_traceable
class Models(Protocol): class Models(Protocol):
async def list_models(self) -> ListModelsResponse: async def list_models(self) -> ListModelsResponse:
"""List all models. """List all models.
@ -136,7 +136,7 @@ class Models(Protocol):
""" """
... ...
@webmethod(route="/models", method="POST", level=LLAMA_STACK_API_V1) @webmethod(route="/models", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
async def register_model( async def register_model(
self, self,
model_id: str, model_id: str,
@ -158,7 +158,7 @@ class Models(Protocol):
""" """
... ...
@webmethod(route="/models/{model_id:path}", method="DELETE", level=LLAMA_STACK_API_V1) @webmethod(route="/models/{model_id:path}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
async def unregister_model( async def unregister_model(
self, self,
model_id: str, model_id: str,

View file

@ -10,8 +10,8 @@ from typing import Protocol, runtime_checkable
from pydantic import BaseModel, Field, field_validator, model_validator from pydantic import BaseModel, Field, field_validator, model_validator
from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.version import LLAMA_STACK_API_V1 from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod from llama_stack.schema_utils import json_schema_type, webmethod
@ -92,7 +92,7 @@ class ListPromptsResponse(BaseModel):
@runtime_checkable @runtime_checkable
@trace_protocol @telemetry_traceable
class Prompts(Protocol): class Prompts(Protocol):
"""Prompts """Prompts

View file

@ -9,10 +9,10 @@ from typing import Any, Protocol, runtime_checkable
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.inference import OpenAIMessageParam from llama_stack.apis.inference import OpenAIMessageParam
from llama_stack.apis.shields import Shield from llama_stack.apis.shields import Shield
from llama_stack.apis.version import LLAMA_STACK_API_V1 from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod from llama_stack.schema_utils import json_schema_type, webmethod
@ -94,7 +94,7 @@ class ShieldStore(Protocol):
@runtime_checkable @runtime_checkable
@trace_protocol @telemetry_traceable
class Safety(Protocol): class Safety(Protocol):
"""Safety """Safety

View file

@ -178,7 +178,7 @@ class ScoringFunctions(Protocol):
""" """
... ...
@webmethod(route="/scoring-functions", method="POST", level=LLAMA_STACK_API_V1) @webmethod(route="/scoring-functions", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
async def register_scoring_function( async def register_scoring_function(
self, self,
scoring_fn_id: str, scoring_fn_id: str,
@ -199,7 +199,9 @@ class ScoringFunctions(Protocol):
""" """
... ...
@webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="DELETE", level=LLAMA_STACK_API_V1) @webmethod(
route="/scoring-functions/{scoring_fn_id:path}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True
)
async def unregister_scoring_function(self, scoring_fn_id: str) -> None: async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
"""Unregister a scoring function. """Unregister a scoring function.

View file

@ -8,9 +8,9 @@ from typing import Any, Literal, Protocol, runtime_checkable
from pydantic import BaseModel from pydantic import BaseModel
from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.resource import Resource, ResourceType from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1 from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod from llama_stack.schema_utils import json_schema_type, webmethod
@ -48,7 +48,7 @@ class ListShieldsResponse(BaseModel):
@runtime_checkable @runtime_checkable
@trace_protocol @telemetry_traceable
class Shields(Protocol): class Shields(Protocol):
@webmethod(route="/shields", method="GET", level=LLAMA_STACK_API_V1) @webmethod(route="/shields", method="GET", level=LLAMA_STACK_API_V1)
async def list_shields(self) -> ListShieldsResponse: async def list_shields(self) -> ListShieldsResponse:
@ -67,7 +67,7 @@ class Shields(Protocol):
""" """
... ...
@webmethod(route="/shields", method="POST", level=LLAMA_STACK_API_V1) @webmethod(route="/shields", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
async def register_shield( async def register_shield(
self, self,
shield_id: str, shield_id: str,
@ -85,7 +85,7 @@ class Shields(Protocol):
""" """
... ...
@webmethod(route="/shields/{identifier:path}", method="DELETE", level=LLAMA_STACK_API_V1) @webmethod(route="/shields/{identifier:path}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
async def unregister_shield(self, identifier: str) -> None: async def unregister_shield(self, identifier: str) -> None:
"""Unregister a shield. """Unregister a shield.

View file

@ -5,18 +5,13 @@
# the root directory of this source tree. # the root directory of this source tree.
from enum import Enum, StrEnum from enum import Enum, StrEnum
from typing import Annotated, Any, Literal, Protocol from typing import Annotated, Any, Literal
from pydantic import BaseModel, Field, field_validator from pydantic import BaseModel, Field, field_validator
from typing_extensions import runtime_checkable
from llama_stack.apis.common.content_types import URL, InterleavedContent from llama_stack.apis.common.content_types import URL, InterleavedContent
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod
@json_schema_type
class RRFRanker(BaseModel): class RRFRanker(BaseModel):
""" """
Reciprocal Rank Fusion (RRF) ranker configuration. Reciprocal Rank Fusion (RRF) ranker configuration.
@ -30,7 +25,6 @@ class RRFRanker(BaseModel):
impact_factor: float = Field(default=60.0, gt=0.0) # default of 60 for optimal performance impact_factor: float = Field(default=60.0, gt=0.0) # default of 60 for optimal performance
@json_schema_type
class WeightedRanker(BaseModel): class WeightedRanker(BaseModel):
""" """
Weighted ranker configuration that combines vector and keyword scores. Weighted ranker configuration that combines vector and keyword scores.
@ -55,10 +49,8 @@ Ranker = Annotated[
RRFRanker | WeightedRanker, RRFRanker | WeightedRanker,
Field(discriminator="type"), Field(discriminator="type"),
] ]
register_schema(Ranker, name="Ranker")
@json_schema_type
class RAGDocument(BaseModel): class RAGDocument(BaseModel):
""" """
A document to be used for document ingestion in the RAG Tool. A document to be used for document ingestion in the RAG Tool.
@ -75,7 +67,6 @@ class RAGDocument(BaseModel):
metadata: dict[str, Any] = Field(default_factory=dict) metadata: dict[str, Any] = Field(default_factory=dict)
@json_schema_type
class RAGQueryResult(BaseModel): class RAGQueryResult(BaseModel):
"""Result of a RAG query containing retrieved content and metadata. """Result of a RAG query containing retrieved content and metadata.
@ -87,7 +78,6 @@ class RAGQueryResult(BaseModel):
metadata: dict[str, Any] = Field(default_factory=dict) metadata: dict[str, Any] = Field(default_factory=dict)
@json_schema_type
class RAGQueryGenerator(Enum): class RAGQueryGenerator(Enum):
"""Types of query generators for RAG systems. """Types of query generators for RAG systems.
@ -101,7 +91,6 @@ class RAGQueryGenerator(Enum):
custom = "custom" custom = "custom"
@json_schema_type
class RAGSearchMode(StrEnum): class RAGSearchMode(StrEnum):
""" """
Search modes for RAG query retrieval: Search modes for RAG query retrieval:
@ -115,7 +104,6 @@ class RAGSearchMode(StrEnum):
HYBRID = "hybrid" HYBRID = "hybrid"
@json_schema_type
class DefaultRAGQueryGeneratorConfig(BaseModel): class DefaultRAGQueryGeneratorConfig(BaseModel):
"""Configuration for the default RAG query generator. """Configuration for the default RAG query generator.
@ -127,7 +115,6 @@ class DefaultRAGQueryGeneratorConfig(BaseModel):
separator: str = " " separator: str = " "
@json_schema_type
class LLMRAGQueryGeneratorConfig(BaseModel): class LLMRAGQueryGeneratorConfig(BaseModel):
"""Configuration for the LLM-based RAG query generator. """Configuration for the LLM-based RAG query generator.
@ -145,10 +132,8 @@ RAGQueryGeneratorConfig = Annotated[
DefaultRAGQueryGeneratorConfig | LLMRAGQueryGeneratorConfig, DefaultRAGQueryGeneratorConfig | LLMRAGQueryGeneratorConfig,
Field(discriminator="type"), Field(discriminator="type"),
] ]
register_schema(RAGQueryGeneratorConfig, name="RAGQueryGeneratorConfig")
@json_schema_type
class RAGQueryConfig(BaseModel): class RAGQueryConfig(BaseModel):
""" """
Configuration for the RAG query generation. Configuration for the RAG query generation.
@ -181,38 +166,3 @@ class RAGQueryConfig(BaseModel):
if len(v) == 0: if len(v) == 0:
raise ValueError("chunk_template must not be empty") raise ValueError("chunk_template must not be empty")
return v return v
# Structural interface for the RAG tool: implementations provide `insert` to index
# documents and `query` to retrieve context. Both are exposed as POST routes.
@runtime_checkable
@trace_protocol
class RAGToolRuntime(Protocol):
    @webmethod(route="/tool-runtime/rag-tool/insert", method="POST", level=LLAMA_STACK_API_V1)
    async def insert(
        self,
        documents: list[RAGDocument],
        vector_store_id: str,
        chunk_size_in_tokens: int = 512,
    ) -> None:
        """Index documents so they can be used by the RAG system.

        :param documents: List of documents to index in the RAG system
        :param vector_store_id: ID of the vector database to store the document embeddings
        :param chunk_size_in_tokens: (Optional) Size in tokens for document chunking during indexing
        """
        ...

    @webmethod(route="/tool-runtime/rag-tool/query", method="POST", level=LLAMA_STACK_API_V1)
    async def query(
        self,
        content: InterleavedContent,
        vector_store_ids: list[str],
        query_config: RAGQueryConfig | None = None,
    ) -> RAGQueryResult:
        """Query the RAG system for context; typically invoked by the agent.

        :param content: The query content to search for in the indexed documents
        :param vector_store_ids: List of vector database IDs to search within
        :param query_config: (Optional) Configuration parameters for the query operation
        :returns: RAGQueryResult containing the retrieved content and metadata
        """
        ...

View file

@ -11,13 +11,11 @@ from pydantic import BaseModel
from typing_extensions import runtime_checkable from typing_extensions import runtime_checkable
from llama_stack.apis.common.content_types import URL, InterleavedContent from llama_stack.apis.common.content_types import URL, InterleavedContent
from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.resource import Resource, ResourceType from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.apis.version import LLAMA_STACK_API_V1 from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod from llama_stack.schema_utils import json_schema_type, webmethod
from .rag_tool import RAGToolRuntime
@json_schema_type @json_schema_type
class ToolDef(BaseModel): class ToolDef(BaseModel):
@ -109,9 +107,9 @@ class ListToolDefsResponse(BaseModel):
@runtime_checkable @runtime_checkable
@trace_protocol @telemetry_traceable
class ToolGroups(Protocol): class ToolGroups(Protocol):
@webmethod(route="/toolgroups", method="POST", level=LLAMA_STACK_API_V1) @webmethod(route="/toolgroups", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
async def register_tool_group( async def register_tool_group(
self, self,
toolgroup_id: str, toolgroup_id: str,
@ -169,7 +167,7 @@ class ToolGroups(Protocol):
""" """
... ...
@webmethod(route="/toolgroups/{toolgroup_id:path}", method="DELETE", level=LLAMA_STACK_API_V1) @webmethod(route="/toolgroups/{toolgroup_id:path}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
async def unregister_toolgroup( async def unregister_toolgroup(
self, self,
toolgroup_id: str, toolgroup_id: str,
@ -191,12 +189,10 @@ class SpecialToolGroup(Enum):
@runtime_checkable @runtime_checkable
@trace_protocol @telemetry_traceable
class ToolRuntime(Protocol): class ToolRuntime(Protocol):
tool_store: ToolStore | None = None tool_store: ToolStore | None = None
rag_tool: RAGToolRuntime | None = None
# TODO: This needs to be renamed once OPEN API generator name conflict issue is fixed. # TODO: This needs to be renamed once OPEN API generator name conflict issue is fixed.
@webmethod(route="/tool-runtime/list-tools", method="GET", level=LLAMA_STACK_API_V1) @webmethod(route="/tool-runtime/list-tools", method="GET", level=LLAMA_STACK_API_V1)
async def list_runtime_tools( async def list_runtime_tools(

View file

@ -10,13 +10,13 @@
# the root directory of this source tree. # the root directory of this source tree.
from typing import Annotated, Any, Literal, Protocol, runtime_checkable from typing import Annotated, Any, Literal, Protocol, runtime_checkable
from fastapi import Body from fastapi import Body, Query
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from llama_stack.apis.common.tracing import telemetry_traceable
from llama_stack.apis.inference import InterleavedContent from llama_stack.apis.inference import InterleavedContent
from llama_stack.apis.vector_stores import VectorStore from llama_stack.apis.vector_stores import VectorStore
from llama_stack.apis.version import LLAMA_STACK_API_V1 from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.telemetry.trace_protocol import trace_protocol
from llama_stack.schema_utils import json_schema_type, webmethod from llama_stack.schema_utils import json_schema_type, webmethod
from llama_stack.strong_typing.schema import register_schema from llama_stack.strong_typing.schema import register_schema
@ -224,10 +224,16 @@ class VectorStoreContent(BaseModel):
:param type: Content type, currently only "text" is supported :param type: Content type, currently only "text" is supported
:param text: The actual text content :param text: The actual text content
:param embedding: Optional embedding vector for this content chunk
:param chunk_metadata: Optional chunk metadata
:param metadata: Optional user-defined metadata
""" """
type: Literal["text"] type: Literal["text"]
text: str text: str
embedding: list[float] | None = None
chunk_metadata: ChunkMetadata | None = None
metadata: dict[str, Any] | None = None
@json_schema_type @json_schema_type
@ -260,7 +266,7 @@ class VectorStoreSearchResponsePage(BaseModel):
""" """
object: str = "vector_store.search_results.page" object: str = "vector_store.search_results.page"
search_query: str search_query: list[str]
data: list[VectorStoreSearchResponse] data: list[VectorStoreSearchResponse]
has_more: bool = False has_more: bool = False
next_page: str | None = None next_page: str | None = None
@ -280,6 +286,22 @@ class VectorStoreDeleteResponse(BaseModel):
deleted: bool = True deleted: bool = True
@json_schema_type
class VectorStoreFileContentResponse(BaseModel):
    """Represents the parsed content of a vector store file.

    :param object: The object type, which is always `vector_store.file_content.page`
    :param data: Parsed content of the file
    :param has_more: Indicates if there are more content pages to fetch
    :param next_page: The token for the next page, if any
    """

    # Literal type plus matching default: the field is fixed to this one value.
    object: Literal["vector_store.file_content.page"] = "vector_store.file_content.page"
    data: list[VectorStoreContent]
    # Pagination fields; the defaults describe a single, final page.
    has_more: bool = False
    next_page: str | None = None
@json_schema_type @json_schema_type
class VectorStoreChunkingStrategyAuto(BaseModel): class VectorStoreChunkingStrategyAuto(BaseModel):
"""Automatic chunking strategy for vector store files. """Automatic chunking strategy for vector store files.
@ -395,22 +417,6 @@ class VectorStoreListFilesResponse(BaseModel):
has_more: bool = False has_more: bool = False
@json_schema_type
class VectorStoreFileContentsResponse(BaseModel):
    """Response from retrieving the contents of a vector store file.

    :param file_id: Unique identifier for the file
    :param filename: Name of the file
    :param attributes: Key-value attributes associated with the file
    :param content: List of content items from the file
    """

    # All four fields are required; none declares a default.
    file_id: str
    filename: str
    attributes: dict[str, Any]
    content: list[VectorStoreContent]
@json_schema_type @json_schema_type
class VectorStoreFileDeleteResponse(BaseModel): class VectorStoreFileDeleteResponse(BaseModel):
"""Response from deleting a vector store file. """Response from deleting a vector store file.
@ -478,7 +484,7 @@ class OpenAICreateVectorStoreRequestWithExtraBody(BaseModel, extra="allow"):
name: str | None = None name: str | None = None
file_ids: list[str] | None = None file_ids: list[str] | None = None
expires_after: dict[str, Any] | None = None expires_after: dict[str, Any] | None = None
chunking_strategy: dict[str, Any] | None = None chunking_strategy: VectorStoreChunkingStrategy | None = None
metadata: dict[str, Any] | None = None metadata: dict[str, Any] | None = None
@ -502,7 +508,7 @@ class VectorStoreTable(Protocol):
@runtime_checkable @runtime_checkable
@trace_protocol @telemetry_traceable
class VectorIO(Protocol): class VectorIO(Protocol):
vector_store_table: VectorStoreTable | None = None vector_store_table: VectorStoreTable | None = None
@ -732,12 +738,16 @@ class VectorIO(Protocol):
self, self,
vector_store_id: str, vector_store_id: str,
file_id: str, file_id: str,
) -> VectorStoreFileContentsResponse: include_embeddings: Annotated[bool | None, Query(default=False)] = False,
include_metadata: Annotated[bool | None, Query(default=False)] = False,
) -> VectorStoreFileContentResponse:
"""Retrieves the contents of a vector store file. """Retrieves the contents of a vector store file.
:param vector_store_id: The ID of the vector store containing the file to retrieve. :param vector_store_id: The ID of the vector store containing the file to retrieve.
:param file_id: The ID of the file to retrieve. :param file_id: The ID of the file to retrieve.
:returns: A list of InterleavedContent representing the file contents. :param include_embeddings: Whether to include embedding vectors in the response.
:param include_metadata: Whether to include chunk metadata in the response.
:returns: File contents, optionally with embeddings and metadata based on query parameters.
""" """
... ...

View file

@ -46,6 +46,10 @@ class StackListDeps(Subcommand):
def _run_stack_list_deps_command(self, args: argparse.Namespace) -> None: def _run_stack_list_deps_command(self, args: argparse.Namespace) -> None:
# always keep implementation completely silo-ed away from CLI so CLI # always keep implementation completely silo-ed away from CLI so CLI
# can be fast to load and reduces dependencies # can be fast to load and reduces dependencies
if not args.config and not args.providers:
self.parser.print_help()
self.parser.exit()
from ._list_deps import run_stack_list_deps_command from ._list_deps import run_stack_list_deps_command
return run_stack_list_deps_command(args) return run_stack_list_deps_command(args)

View file

@ -9,48 +9,69 @@ from pathlib import Path
from llama_stack.cli.subcommand import Subcommand from llama_stack.cli.subcommand import Subcommand
from llama_stack.cli.table import print_table from llama_stack.cli.table import print_table
from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR
class StackListBuilds(Subcommand): class StackListBuilds(Subcommand):
"""List built stacks in .llama/distributions directory""" """List available distributions (both built-in and custom)"""
def __init__(self, subparsers: argparse._SubParsersAction): def __init__(self, subparsers: argparse._SubParsersAction):
super().__init__() super().__init__()
self.parser = subparsers.add_parser( self.parser = subparsers.add_parser(
"list", "list",
prog="llama stack list", prog="llama stack list",
description="list the build stacks", description="list available distributions",
formatter_class=argparse.ArgumentDefaultsHelpFormatter, formatter_class=argparse.ArgumentDefaultsHelpFormatter,
) )
self._add_arguments() self._add_arguments()
self.parser.set_defaults(func=self._list_stack_command) self.parser.set_defaults(func=self._list_stack_command)
def _get_distribution_dirs(self) -> dict[str, Path]: def _get_distribution_dirs(self) -> dict[str, tuple[Path, str]]:
"""Return a dictionary of distribution names and their paths""" """Return a dictionary of distribution names and their paths with source type
distributions = {}
dist_dir = Path.home() / ".llama" / "distributions" Returns:
dict mapping distro name to (path, source_type) where source_type is 'built-in' or 'custom'
"""
distributions = {}
# Get built-in distributions from source code
distro_dir = Path(__file__).parent.parent.parent / "distributions"
if distro_dir.exists():
for stack_dir in distro_dir.iterdir():
if stack_dir.is_dir() and not stack_dir.name.startswith(".") and not stack_dir.name.startswith("__"):
distributions[stack_dir.name] = (stack_dir, "built-in")
# Get custom/run distributions from ~/.llama/distributions
# These override built-in ones if they have the same name
if DISTRIBS_BASE_DIR.exists():
for stack_dir in DISTRIBS_BASE_DIR.iterdir():
if stack_dir.is_dir() and not stack_dir.name.startswith("."):
# Clean up the name (remove llamastack- prefix if present)
name = stack_dir.name.replace("llamastack-", "")
distributions[name] = (stack_dir, "custom")
if dist_dir.exists():
for stack_dir in dist_dir.iterdir():
if stack_dir.is_dir():
distributions[stack_dir.name] = stack_dir
return distributions return distributions
def _list_stack_command(self, args: argparse.Namespace) -> None: def _list_stack_command(self, args: argparse.Namespace) -> None:
distributions = self._get_distribution_dirs() distributions = self._get_distribution_dirs()
if not distributions: if not distributions:
print("No stacks found in ~/.llama/distributions") print("No distributions found")
return return
headers = ["Stack Name", "Path"] headers = ["Stack Name", "Source", "Path", "Build Config", "Run Config"]
headers.extend(["Build Config", "Run Config"])
rows = [] rows = []
for name, path in distributions.items(): for name, (path, source_type) in sorted(distributions.items()):
row = [name, str(path)] row = [name, source_type, str(path)]
# Check for build and run config files # Check for build and run config files
build_config = "Yes" if (path / f"{name}-build.yaml").exists() else "No" # For built-in distributions, configs are named build.yaml and run.yaml
run_config = "Yes" if (path / f"{name}-run.yaml").exists() else "No" # For custom distributions, configs are named {name}-build.yaml and {name}-run.yaml
if source_type == "built-in":
build_config = "Yes" if (path / "build.yaml").exists() else "No"
run_config = "Yes" if (path / "run.yaml").exists() else "No"
else:
build_config = "Yes" if (path / f"{name}-build.yaml").exists() else "No"
run_config = "Yes" if (path / f"{name}-run.yaml").exists() else "No"
row.extend([build_config, run_config]) row.extend([build_config, run_config])
rows.append(row) rows.append(row)
print_table(rows, headers, separate_rows=True) print_table(rows, headers, separate_rows=True)

View file

@ -253,7 +253,7 @@ class StackRun(Subcommand):
) )
return return
ui_dir = REPO_ROOT / "llama_stack" / "ui" ui_dir = REPO_ROOT / "llama_stack_ui"
logs_dir = Path("~/.llama/ui/logs").expanduser() logs_dir = Path("~/.llama/ui/logs").expanduser()
try: try:
# Create logs directory if it doesn't exist # Create logs directory if it doesn't exist

View file

@ -4,7 +4,6 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import importlib.resources
import sys import sys
from pydantic import BaseModel from pydantic import BaseModel
@ -12,9 +11,6 @@ from termcolor import cprint
from llama_stack.core.datatypes import BuildConfig from llama_stack.core.datatypes import BuildConfig
from llama_stack.core.distribution import get_provider_registry from llama_stack.core.distribution import get_provider_registry
from llama_stack.core.external import load_external_apis
from llama_stack.core.utils.exec import run_command
from llama_stack.core.utils.image_types import LlamaStackImageType
from llama_stack.distributions.template import DistributionTemplate from llama_stack.distributions.template import DistributionTemplate
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.providers.datatypes import Api from llama_stack.providers.datatypes import Api
@ -101,64 +97,3 @@ def print_pip_install_help(config: BuildConfig):
for special_dep in special_deps: for special_dep in special_deps:
cprint(f"uv pip install {special_dep}", color="yellow", file=sys.stderr) cprint(f"uv pip install {special_dep}", color="yellow", file=sys.stderr)
print() print()
def build_image(
build_config: BuildConfig,
image_name: str,
distro_or_config: str,
run_config: str | None = None,
):
container_base = build_config.distribution_spec.container_image or "python:3.12-slim"
normal_deps, special_deps, external_provider_deps = get_provider_dependencies(build_config)
normal_deps += SERVER_DEPENDENCIES
if build_config.external_apis_dir:
external_apis = load_external_apis(build_config)
if external_apis:
for _, api_spec in external_apis.items():
normal_deps.extend(api_spec.pip_packages)
if build_config.image_type == LlamaStackImageType.CONTAINER.value:
script = str(importlib.resources.files("llama_stack") / "core/build_container.sh")
args = [
script,
"--distro-or-config",
distro_or_config,
"--image-name",
image_name,
"--container-base",
container_base,
"--normal-deps",
" ".join(normal_deps),
]
# When building from a config file (not a template), include the run config path in the
# build arguments
if run_config is not None:
args.extend(["--run-config", run_config])
else:
script = str(importlib.resources.files("llama_stack") / "core/build_venv.sh")
args = [
script,
"--env-name",
str(image_name),
"--normal-deps",
" ".join(normal_deps),
]
# Always pass both arguments, even if empty, to maintain consistent positional arguments
if special_deps:
args.extend(["--optional-deps", "#".join(special_deps)])
if external_provider_deps:
args.extend(
["--external-provider-deps", "#".join(external_provider_deps)]
) # the script will install external provider module, get its deps, and install those too.
return_code = run_command(args)
if return_code != 0:
log.error(
f"Failed to build target {image_name} with return code {return_code}",
)
return return_code

View file

@ -203,16 +203,11 @@ class ConversationServiceImpl(Conversations):
"item_data": item_dict, "item_data": item_dict,
} }
# TODO: Add support for upsert in sql_store, this will fail first if ID exists and then update await self.sql_store.upsert(
try: table="conversation_items",
await self.sql_store.insert(table="conversation_items", data=item_record) data=item_record,
except Exception: conflict_columns=["id"],
# If insert fails due to ID conflict, update existing record )
await self.sql_store.update(
table="conversation_items",
data={"created_at": created_at, "item_data": item_dict},
where={"id": item_id},
)
created_items.append(item_dict) created_items.append(item_dict)

View file

@ -15,7 +15,6 @@ from llama_stack.apis.inspect import (
RouteInfo, RouteInfo,
VersionInfo, VersionInfo,
) )
from llama_stack.apis.version import LLAMA_STACK_API_V1
from llama_stack.core.datatypes import StackRunConfig from llama_stack.core.datatypes import StackRunConfig
from llama_stack.core.external import load_external_apis from llama_stack.core.external import load_external_apis
from llama_stack.core.server.routes import get_all_api_routes from llama_stack.core.server.routes import get_all_api_routes
@ -46,8 +45,8 @@ class DistributionInspectImpl(Inspect):
# Helper function to determine if a route should be included based on api_filter # Helper function to determine if a route should be included based on api_filter
def should_include_route(webmethod) -> bool: def should_include_route(webmethod) -> bool:
if api_filter is None: if api_filter is None:
# Default: only non-deprecated v1 APIs # Default: only non-deprecated APIs
return not webmethod.deprecated and webmethod.level == LLAMA_STACK_API_V1 return not webmethod.deprecated
elif api_filter == "deprecated": elif api_filter == "deprecated":
# Special filter: show deprecated routes regardless of their actual level # Special filter: show deprecated routes regardless of their actual level
return bool(webmethod.deprecated) return bool(webmethod.deprecated)

View file

@ -18,14 +18,21 @@ from typing import Any, TypeVar, Union, get_args, get_origin
import httpx import httpx
import yaml import yaml
from fastapi import Response as FastAPIResponse from fastapi import Response as FastAPIResponse
from llama_stack_client import (
NOT_GIVEN, try:
APIResponse, from llama_stack_client import (
AsyncAPIResponse, NOT_GIVEN,
AsyncLlamaStackClient, APIResponse,
AsyncStream, AsyncAPIResponse,
LlamaStackClient, AsyncLlamaStackClient,
) AsyncStream,
LlamaStackClient,
)
except ImportError as e:
raise ImportError(
"llama-stack-client is not installed. Please install it with `uv pip install llama-stack[client]`."
) from e
from pydantic import BaseModel, TypeAdapter from pydantic import BaseModel, TypeAdapter
from rich.console import Console from rich.console import Console
from termcolor import cprint from termcolor import cprint
@ -382,6 +389,12 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
matched_func, path_params, route_path, webmethod = find_matching_route(options.method, path, self.route_impls) matched_func, path_params, route_path, webmethod = find_matching_route(options.method, path, self.route_impls)
body |= path_params body |= path_params
# Pass through params that aren't already handled as path params
if options.params:
extra_query_params = {k: v for k, v in options.params.items() if k not in path_params}
if extra_query_params:
body["extra_query"] = extra_query_params
body, field_names = self._handle_file_uploads(options, body) body, field_names = self._handle_file_uploads(options, body)
body = self._convert_body(matched_func, body, exclude_params=set(field_names)) body = self._convert_body(matched_func, body, exclude_params=set(field_names))

View file

@ -397,6 +397,18 @@ async def instantiate_provider(
impl.__provider_spec__ = provider_spec impl.__provider_spec__ = provider_spec
impl.__provider_config__ = config impl.__provider_config__ = config
# Apply tracing if telemetry is enabled and any base class has __marked_for_tracing__ marker
if run_config.telemetry.enabled:
traced_classes = [
base for base in reversed(impl.__class__.__mro__) if getattr(base, "__marked_for_tracing__", False)
]
if traced_classes:
from llama_stack.core.telemetry.trace_protocol import trace_protocol
for cls in traced_classes:
trace_protocol(cls)
protocols = api_protocol_map_for_compliance_check(run_config) protocols = api_protocol_map_for_compliance_check(run_config)
additional_protocols = additional_protocols_map() additional_protocols = additional_protocols_map()
# TODO: check compliance for special tool groups # TODO: check compliance for special tool groups

View file

@ -45,6 +45,7 @@ async def get_routing_table_impl(
raise ValueError(f"API {api.value} not found in router map") raise ValueError(f"API {api.value} not found in router map")
impl = api_to_tables[api.value](impls_by_provider_id, dist_registry, policy) impl = api_to_tables[api.value](impls_by_provider_id, dist_registry, policy)
await impl.initialize() await impl.initialize()
return impl return impl
@ -92,5 +93,6 @@ async def get_auto_router_impl(
api_to_dep_impl["safety_config"] = run_config.safety api_to_dep_impl["safety_config"] = run_config.safety
impl = api_to_routers[api.value](routing_table, **api_to_dep_impl) impl = api_to_routers[api.value](routing_table, **api_to_dep_impl)
await impl.initialize() await impl.initialize()
return impl return impl

View file

@ -190,7 +190,7 @@ class InferenceRouter(Inference):
response = await provider.openai_completion(params) response = await provider.openai_completion(params)
response.model = request_model_id response.model = request_model_id
if self.telemetry_enabled: if self.telemetry_enabled and response.usage is not None:
metrics = self._construct_metrics( metrics = self._construct_metrics(
prompt_tokens=response.usage.prompt_tokens, prompt_tokens=response.usage.prompt_tokens,
completion_tokens=response.usage.completion_tokens, completion_tokens=response.usage.completion_tokens,
@ -253,7 +253,7 @@ class InferenceRouter(Inference):
if self.store: if self.store:
asyncio.create_task(self.store.store_chat_completion(response, params.messages)) asyncio.create_task(self.store.store_chat_completion(response, params.messages))
if self.telemetry_enabled: if self.telemetry_enabled and response.usage is not None:
metrics = self._construct_metrics( metrics = self._construct_metrics(
prompt_tokens=response.usage.prompt_tokens, prompt_tokens=response.usage.prompt_tokens,
completion_tokens=response.usage.completion_tokens, completion_tokens=response.usage.completion_tokens,

View file

@ -6,7 +6,7 @@
from typing import Any from typing import Any
from llama_stack.apis.inference import Message from llama_stack.apis.inference import OpenAIMessageParam
from llama_stack.apis.safety import RunShieldResponse, Safety from llama_stack.apis.safety import RunShieldResponse, Safety
from llama_stack.apis.safety.safety import ModerationObject from llama_stack.apis.safety.safety import ModerationObject
from llama_stack.apis.shields import Shield from llama_stack.apis.shields import Shield
@ -52,7 +52,7 @@ class SafetyRouter(Safety):
async def run_shield( async def run_shield(
self, self,
shield_id: str, shield_id: str,
messages: list[Message], messages: list[OpenAIMessageParam],
params: dict[str, Any] = None, params: dict[str, Any] = None,
) -> RunShieldResponse: ) -> RunShieldResponse:
logger.debug(f"SafetyRouter.run_shield: {shield_id}") logger.debug(f"SafetyRouter.run_shield: {shield_id}")

View file

@ -8,14 +8,9 @@ from typing import Any
from llama_stack.apis.common.content_types import ( from llama_stack.apis.common.content_types import (
URL, URL,
InterleavedContent,
) )
from llama_stack.apis.tools import ( from llama_stack.apis.tools import (
ListToolDefsResponse, ListToolDefsResponse,
RAGDocument,
RAGQueryConfig,
RAGQueryResult,
RAGToolRuntime,
ToolRuntime, ToolRuntime,
) )
from llama_stack.log import get_logger from llama_stack.log import get_logger
@ -26,36 +21,6 @@ logger = get_logger(name=__name__, category="core::routers")
class ToolRuntimeRouter(ToolRuntime): class ToolRuntimeRouter(ToolRuntime):
class RagToolImpl(RAGToolRuntime):
def __init__(
self,
routing_table: ToolGroupsRoutingTable,
) -> None:
logger.debug("Initializing ToolRuntimeRouter.RagToolImpl")
self.routing_table = routing_table
async def query(
self,
content: InterleavedContent,
vector_store_ids: list[str],
query_config: RAGQueryConfig | None = None,
) -> RAGQueryResult:
logger.debug(f"ToolRuntimeRouter.RagToolImpl.query: {vector_store_ids}")
provider = await self.routing_table.get_provider_impl("knowledge_search")
return await provider.query(content, vector_store_ids, query_config)
async def insert(
self,
documents: list[RAGDocument],
vector_store_id: str,
chunk_size_in_tokens: int = 512,
) -> None:
logger.debug(
f"ToolRuntimeRouter.RagToolImpl.insert: {vector_store_id}, {len(documents)} documents, chunk_size={chunk_size_in_tokens}"
)
provider = await self.routing_table.get_provider_impl("insert_into_memory")
return await provider.insert(documents, vector_store_id, chunk_size_in_tokens)
def __init__( def __init__(
self, self,
routing_table: ToolGroupsRoutingTable, routing_table: ToolGroupsRoutingTable,
@ -63,11 +28,6 @@ class ToolRuntimeRouter(ToolRuntime):
logger.debug("Initializing ToolRuntimeRouter") logger.debug("Initializing ToolRuntimeRouter")
self.routing_table = routing_table self.routing_table = routing_table
# HACK ALERT this should be in sync with "get_all_api_endpoints()"
self.rag_tool = self.RagToolImpl(routing_table)
for method in ("query", "insert"):
setattr(self, f"rag_tool.{method}", getattr(self.rag_tool, method))
async def initialize(self) -> None: async def initialize(self) -> None:
logger.debug("ToolRuntimeRouter.initialize") logger.debug("ToolRuntimeRouter.initialize")
pass pass

View file

@ -20,9 +20,11 @@ from llama_stack.apis.vector_io import (
SearchRankingOptions, SearchRankingOptions,
VectorIO, VectorIO,
VectorStoreChunkingStrategy, VectorStoreChunkingStrategy,
VectorStoreChunkingStrategyStatic,
VectorStoreChunkingStrategyStaticConfig,
VectorStoreDeleteResponse, VectorStoreDeleteResponse,
VectorStoreFileBatchObject, VectorStoreFileBatchObject,
VectorStoreFileContentsResponse, VectorStoreFileContentResponse,
VectorStoreFileDeleteResponse, VectorStoreFileDeleteResponse,
VectorStoreFileObject, VectorStoreFileObject,
VectorStoreFilesListInBatchResponse, VectorStoreFilesListInBatchResponse,
@ -167,6 +169,13 @@ class VectorIORouter(VectorIO):
if embedding_dimension is not None: if embedding_dimension is not None:
params.model_extra["embedding_dimension"] = embedding_dimension params.model_extra["embedding_dimension"] = embedding_dimension
# Set chunking strategy explicitly if not provided
if params.chunking_strategy is None or params.chunking_strategy.type == "auto":
# actualize the chunking strategy to static
params.chunking_strategy = VectorStoreChunkingStrategyStatic(
static=VectorStoreChunkingStrategyStaticConfig()
)
return await provider.openai_create_vector_store(params) return await provider.openai_create_vector_store(params)
async def openai_list_vector_stores( async def openai_list_vector_stores(
@ -238,6 +247,13 @@ class VectorIORouter(VectorIO):
metadata: dict[str, Any] | None = None, metadata: dict[str, Any] | None = None,
) -> VectorStoreObject: ) -> VectorStoreObject:
logger.debug(f"VectorIORouter.openai_update_vector_store: {vector_store_id}") logger.debug(f"VectorIORouter.openai_update_vector_store: {vector_store_id}")
# Check if provider_id is being changed (not supported)
if metadata and "provider_id" in metadata:
current_store = await self.routing_table.get_object_by_identifier("vector_store", vector_store_id)
if current_store and current_store.provider_id != metadata["provider_id"]:
raise ValueError("provider_id cannot be changed after vector store creation")
provider = await self.routing_table.get_provider_impl(vector_store_id) provider = await self.routing_table.get_provider_impl(vector_store_id)
return await provider.openai_update_vector_store( return await provider.openai_update_vector_store(
vector_store_id=vector_store_id, vector_store_id=vector_store_id,
@ -283,6 +299,8 @@ class VectorIORouter(VectorIO):
chunking_strategy: VectorStoreChunkingStrategy | None = None, chunking_strategy: VectorStoreChunkingStrategy | None = None,
) -> VectorStoreFileObject: ) -> VectorStoreFileObject:
logger.debug(f"VectorIORouter.openai_attach_file_to_vector_store: {vector_store_id}, {file_id}") logger.debug(f"VectorIORouter.openai_attach_file_to_vector_store: {vector_store_id}, {file_id}")
if chunking_strategy is None or chunking_strategy.type == "auto":
chunking_strategy = VectorStoreChunkingStrategyStatic(static=VectorStoreChunkingStrategyStaticConfig())
provider = await self.routing_table.get_provider_impl(vector_store_id) provider = await self.routing_table.get_provider_impl(vector_store_id)
return await provider.openai_attach_file_to_vector_store( return await provider.openai_attach_file_to_vector_store(
vector_store_id=vector_store_id, vector_store_id=vector_store_id,
@ -327,12 +345,19 @@ class VectorIORouter(VectorIO):
self, self,
vector_store_id: str, vector_store_id: str,
file_id: str, file_id: str,
) -> VectorStoreFileContentsResponse: include_embeddings: bool | None = False,
logger.debug(f"VectorIORouter.openai_retrieve_vector_store_file_contents: {vector_store_id}, {file_id}") include_metadata: bool | None = False,
provider = await self.routing_table.get_provider_impl(vector_store_id) ) -> VectorStoreFileContentResponse:
return await provider.openai_retrieve_vector_store_file_contents( logger.debug(
f"VectorIORouter.openai_retrieve_vector_store_file_contents: {vector_store_id}, {file_id}, "
f"include_embeddings={include_embeddings}, include_metadata={include_metadata}"
)
return await self.routing_table.openai_retrieve_vector_store_file_contents(
vector_store_id=vector_store_id, vector_store_id=vector_store_id,
file_id=file_id, file_id=file_id,
include_embeddings=include_embeddings,
include_metadata=include_metadata,
) )
async def openai_update_vector_store_file( async def openai_update_vector_store_file(

View file

@ -15,7 +15,7 @@ from llama_stack.apis.vector_io.vector_io import (
SearchRankingOptions, SearchRankingOptions,
VectorStoreChunkingStrategy, VectorStoreChunkingStrategy,
VectorStoreDeleteResponse, VectorStoreDeleteResponse,
VectorStoreFileContentsResponse, VectorStoreFileContentResponse,
VectorStoreFileDeleteResponse, VectorStoreFileDeleteResponse,
VectorStoreFileObject, VectorStoreFileObject,
VectorStoreFileStatus, VectorStoreFileStatus,
@ -195,12 +195,17 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
self, self,
vector_store_id: str, vector_store_id: str,
file_id: str, file_id: str,
) -> VectorStoreFileContentsResponse: include_embeddings: bool | None = False,
include_metadata: bool | None = False,
) -> VectorStoreFileContentResponse:
await self.assert_action_allowed("read", "vector_store", vector_store_id) await self.assert_action_allowed("read", "vector_store", vector_store_id)
provider = await self.get_provider_impl(vector_store_id) provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_retrieve_vector_store_file_contents( return await provider.openai_retrieve_vector_store_file_contents(
vector_store_id=vector_store_id, vector_store_id=vector_store_id,
file_id=file_id, file_id=file_id,
include_embeddings=include_embeddings,
include_metadata=include_metadata,
) )
async def openai_update_vector_store_file( async def openai_update_vector_store_file(

View file

@ -13,7 +13,6 @@ from aiohttp import hdrs
from starlette.routing import Route from starlette.routing import Route
from llama_stack.apis.datatypes import Api, ExternalApiSpec from llama_stack.apis.datatypes import Api, ExternalApiSpec
from llama_stack.apis.tools import RAGToolRuntime, SpecialToolGroup
from llama_stack.core.resolver import api_protocol_map from llama_stack.core.resolver import api_protocol_map
from llama_stack.schema_utils import WebMethod from llama_stack.schema_utils import WebMethod
@ -25,33 +24,16 @@ RouteImpls = dict[str, PathImpl]
RouteMatch = tuple[EndpointFunc, PathParams, str, WebMethod] RouteMatch = tuple[EndpointFunc, PathParams, str, WebMethod]
def toolgroup_protocol_map():
return {
SpecialToolGroup.rag_tool: RAGToolRuntime,
}
def get_all_api_routes( def get_all_api_routes(
external_apis: dict[Api, ExternalApiSpec] | None = None, external_apis: dict[Api, ExternalApiSpec] | None = None,
) -> dict[Api, list[tuple[Route, WebMethod]]]: ) -> dict[Api, list[tuple[Route, WebMethod]]]:
apis = {} apis = {}
protocols = api_protocol_map(external_apis) protocols = api_protocol_map(external_apis)
toolgroup_protocols = toolgroup_protocol_map()
for api, protocol in protocols.items(): for api, protocol in protocols.items():
routes = [] routes = []
protocol_methods = inspect.getmembers(protocol, predicate=inspect.isfunction) protocol_methods = inspect.getmembers(protocol, predicate=inspect.isfunction)
# HACK ALERT
if api == Api.tool_runtime:
for tool_group in SpecialToolGroup:
sub_protocol = toolgroup_protocols[tool_group]
sub_protocol_methods = inspect.getmembers(sub_protocol, predicate=inspect.isfunction)
for name, method in sub_protocol_methods:
if not hasattr(method, "__webmethod__"):
continue
protocol_methods.append((f"{tool_group.value}.{name}", method))
for name, method in protocol_methods: for name, method in protocol_methods:
# Get all webmethods for this method (supports multiple decorators) # Get all webmethods for this method (supports multiple decorators)
webmethods = getattr(method, "__webmethods__", []) webmethods = getattr(method, "__webmethods__", [])

Some files were not shown because too many files have changed in this diff Show more