Signed-off-by: Bill Murdock <bmurdock@redhat.com>
Bill Murdock 2025-10-06 16:19:57 -04:00
commit e77b7a127c
854 changed files with 165238 additions and 99099 deletions

.github/CODEOWNERS vendored

@@ -2,4 +2,4 @@
 # These owners will be the default owners for everything in
 # the repo. Unless a later match takes precedence,
-* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1
+* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1 @franciscojavierarceo

.github/TRIAGERS.md vendored

@@ -1,2 +1 @@
 # This file documents Triage members in the Llama Stack community
-@franciscojavierarceo


@@ -12,6 +12,7 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
 | Integration Tests (Replay) | [integration-tests.yml](integration-tests.yml) | Run the integration test suites from tests/integration in replay mode |
 | Vector IO Integration Tests | [integration-vector-io-tests.yml](integration-vector-io-tests.yml) | Run the integration test suite with various VectorIO providers |
 | Pre-commit | [pre-commit.yml](pre-commit.yml) | Run pre-commit checks |
+| Pre-commit Bot | [precommit-trigger.yml](precommit-trigger.yml) | Pre-commit bot for PR |
 | Test Llama Stack Build | [providers-build.yml](providers-build.yml) | Test llama stack build |
 | Python Package Build Test | [python-build-test.yml](python-build-test.yml) | Test building the llama-stack PyPI project |
 | Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Run the integration test suite from tests/integration |


@@ -42,18 +42,27 @@ jobs:
 run-replay-mode-tests:
 runs-on: ubuntu-latest
-name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.setup, matrix.python-version, matrix.client-version, matrix.suite) }}
+name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.config.setup, matrix.python-version, matrix.client-version, matrix.config.suite) }}
 strategy:
 fail-fast: false
 matrix:
 client-type: [library, server]
-# Use vllm on weekly schedule, otherwise use test-setup input (defaults to ollama)
-setup: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-setup || 'ollama')) }}
 # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
 python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
 client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
-suite: [base, vision]
+# Define (setup, suite) pairs - they are always matched and cannot be independent
+# Weekly schedule (Sun 1 AM): vllm+base
+# Input test-setup=ollama-vision: ollama-vision+vision
+# Default (including test-setup=ollama): both ollama+base and ollama-vision+vision
+config: >-
+  ${{
+  github.event.schedule == '1 0 * * 0'
+  && fromJSON('[{"setup": "vllm", "suite": "base"}]')
+  || github.event.inputs.test-setup == 'ollama-vision'
+  && fromJSON('[{"setup": "ollama-vision", "suite": "vision"}]')
+  || fromJSON('[{"setup": "ollama", "suite": "base"}, {"setup": "ollama-vision", "suite": "vision"}]')
+  }}
 steps:
 - name: Checkout repository
@@ -64,14 +73,14 @@ jobs:
 with:
 python-version: ${{ matrix.python-version }}
 client-version: ${{ matrix.client-version }}
-setup: ${{ matrix.setup }}
-suite: ${{ matrix.suite }}
+setup: ${{ matrix.config.setup }}
+suite: ${{ matrix.config.suite }}
 inference-mode: 'replay'
 - name: Run tests
 uses: ./.github/actions/run-and-record-tests
 with:
 stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
-setup: ${{ matrix.setup }}
+setup: ${{ matrix.config.setup }}
 inference-mode: 'replay'
-suite: ${{ matrix.suite }}
+suite: ${{ matrix.config.suite }}

.github/workflows/precommit-trigger.yml vendored Normal file

@@ -0,0 +1,227 @@
name: Pre-commit Bot
run-name: Pre-commit bot for PR #${{ github.event.issue.number }}
on:
issue_comment:
types: [created]
jobs:
pre-commit:
# Only run on pull request comments
if: github.event.issue.pull_request && contains(github.event.comment.body, '@github-actions run precommit')
runs-on: ubuntu-latest
permissions:
contents: write
pull-requests: write
steps:
- name: Check comment author and get PR details
id: check_author
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
// Get PR details
const pr = await github.rest.pulls.get({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: context.issue.number
});
// Check if commenter has write access or is the PR author
const commenter = context.payload.comment.user.login;
const prAuthor = pr.data.user.login;
let hasPermission = false;
// Check if commenter is PR author
if (commenter === prAuthor) {
hasPermission = true;
console.log(`Comment author ${commenter} is the PR author`);
} else {
// Check if commenter has write/admin access
try {
const permission = await github.rest.repos.getCollaboratorPermissionLevel({
owner: context.repo.owner,
repo: context.repo.repo,
username: commenter
});
const level = permission.data.permission;
hasPermission = ['write', 'admin', 'maintain'].includes(level);
console.log(`Comment author ${commenter} has permission: ${level}`);
} catch (error) {
console.log(`Could not check permissions for ${commenter}: ${error.message}`);
}
}
if (!hasPermission) {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: `❌ @${commenter} You don't have permission to trigger pre-commit. Only PR authors or repository collaborators can run this command.`
});
core.setFailed(`User ${commenter} does not have permission`);
return;
}
// Save PR info for later steps
core.setOutput('pr_number', context.issue.number);
core.setOutput('pr_head_ref', pr.data.head.ref);
core.setOutput('pr_head_sha', pr.data.head.sha);
core.setOutput('pr_head_repo', pr.data.head.repo.full_name);
core.setOutput('pr_base_ref', pr.data.base.ref);
core.setOutput('is_fork', pr.data.head.repo.full_name !== context.payload.repository.full_name);
core.setOutput('authorized', 'true');
- name: React to comment
if: steps.check_author.outputs.authorized == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.reactions.createForIssueComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: context.payload.comment.id,
content: 'rocket'
});
- name: Comment starting
if: steps.check_author.outputs.authorized == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `⏳ Running pre-commit hooks on PR #${{ steps.check_author.outputs.pr_number }}...`
});
- name: Checkout PR branch (same-repo)
if: steps.check_author.outputs.authorized == 'true' && steps.check_author.outputs.is_fork == 'false'
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
ref: ${{ steps.check_author.outputs.pr_head_ref }}
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
- name: Checkout PR branch (fork)
if: steps.check_author.outputs.authorized == 'true' && steps.check_author.outputs.is_fork == 'true'
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: ${{ steps.check_author.outputs.pr_head_repo }}
ref: ${{ steps.check_author.outputs.pr_head_ref }}
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
- name: Verify checkout
if: steps.check_author.outputs.authorized == 'true'
run: |
echo "Current SHA: $(git rev-parse HEAD)"
echo "Expected SHA: ${{ steps.check_author.outputs.pr_head_sha }}"
if [[ "$(git rev-parse HEAD)" != "${{ steps.check_author.outputs.pr_head_sha }}" ]]; then
echo "::error::Checked out SHA does not match expected SHA"
exit 1
fi
- name: Set up Python
if: steps.check_author.outputs.authorized == 'true'
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: '3.12'
cache: pip
cache-dependency-path: |
**/requirements*.txt
.pre-commit-config.yaml
- name: Set up Node.js
if: steps.check_author.outputs.authorized == 'true'
uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5.0.0
with:
node-version: '20'
cache: 'npm'
cache-dependency-path: 'llama_stack/ui/'
- name: Install npm dependencies
if: steps.check_author.outputs.authorized == 'true'
run: npm ci
working-directory: llama_stack/ui
- name: Run pre-commit
if: steps.check_author.outputs.authorized == 'true'
id: precommit
uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
continue-on-error: true
env:
SKIP: no-commit-to-branch
RUFF_OUTPUT_FORMAT: github
- name: Check for changes
if: steps.check_author.outputs.authorized == 'true'
id: changes
run: |
if ! git diff --exit-code || [ -n "$(git ls-files --others --exclude-standard)" ]; then
echo "has_changes=true" >> $GITHUB_OUTPUT
echo "Changes detected after pre-commit"
else
echo "has_changes=false" >> $GITHUB_OUTPUT
echo "No changes after pre-commit"
fi
- name: Commit and push changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'true'
run: |
git config --local user.email "github-actions[bot]@users.noreply.github.com"
git config --local user.name "github-actions[bot]"
git add -A
git commit -m "style: apply pre-commit fixes
🤖 Applied by @github-actions bot via pre-commit workflow"
# Push changes
git push origin HEAD:${{ steps.check_author.outputs.pr_head_ref }}
- name: Comment success with changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'true'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `✅ Pre-commit hooks completed successfully!\n\n🔧 Changes have been committed and pushed to the PR branch.`
});
- name: Comment success without changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'false' && steps.precommit.outcome == 'success'
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `✅ Pre-commit hooks passed!\n\n✨ No changes needed - your code is already formatted correctly.`
});
- name: Comment failure
if: failure()
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: ${{ steps.check_author.outputs.pr_number }},
body: `❌ Pre-commit workflow failed!\n\nPlease check the [workflow logs](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}) for details.`
});


@@ -112,7 +112,7 @@ jobs:
 fi
 entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
 echo "Entrypoint: $entrypoint"
-if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
+if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then
 echo "Entrypoint is not correct"
 exit 1
 fi
@@ -150,7 +150,7 @@ jobs:
 fi
 entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
 echo "Entrypoint: $entrypoint"
-if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
+if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then
 echo "Entrypoint is not correct"
 exit 1
 fi


@@ -24,7 +24,7 @@ jobs:
 uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
 - name: Install uv
-uses: astral-sh/setup-uv@b75a909f75acd358c2196fb9a5f1299a9a8868a4 # v6.7.0
+uses: astral-sh/setup-uv@d0cc045d04ccac9d8b7881df0226f9e82c39688e # v6.8.0
 with:
 python-version: ${{ matrix.python-version }}
 activate-environment: true


@@ -7,7 +7,7 @@
 [![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
 [![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)
-[**Quick Start**](https://llamastack.github.io/latest/getting_started/index.html) | [**Documentation**](https://llamastack.github.io/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
+[**Quick Start**](https://llamastack.github.io/docs/getting_started/quickstart) | [**Documentation**](https://llamastack.github.io/docs) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
 ### ✨🎉 Llama 4 Support 🎉✨


@@ -187,21 +187,21 @@ Configure telemetry behavior using environment variables:
 - **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string)
 - **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `console,sqlite`)
-## Visualization with Jaeger
-The `otel_trace` sink works with any service compatible with the OpenTelemetry collector. Traces and metrics use separate endpoints but can share the same collector.
-### Starting Jaeger
-Start a Jaeger instance with OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686:
+### Quick Setup: Complete Telemetry Stack
+Use the automated setup script to launch the complete telemetry stack (Jaeger, OpenTelemetry Collector, Prometheus, and Grafana):
 ```bash
-docker run --pull always --rm --name jaeger \
-  -p 16686:16686 -p 4318:4318 \
-  jaegertracing/jaeger:2.1.0
+./scripts/telemetry/setup_telemetry.sh
 ```
-Once running, you can visualize traces by navigating to [http://localhost:16686/](http://localhost:16686/).
+This sets up:
+- **Jaeger UI**: http://localhost:16686 (traces visualization)
+- **Prometheus**: http://localhost:9090 (metrics)
+- **Grafana**: http://localhost:3000 (dashboards with auto-configured data sources)
+- **OTEL Collector**: http://localhost:4318 (OTLP endpoint)
+Once running, you can visualize traces by navigating to [Grafana](http://localhost:3000/) and logging in with username `admin` and password `admin`.
 ## Querying Metrics
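As a quick sanity check that the OTLP endpoint above is reachable, here is a hedged Python sketch that emits a single test span to the collector; it assumes the `opentelemetry-sdk` and `opentelemetry-exporter-otlp-proto-http` packages are installed and is not part of this commit.

```python
# Minimal smoke test: send one span to the OTEL Collector started by
# setup_telemetry.sh, then look for it in Jaeger or Grafana.
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

provider = TracerProvider(resource=Resource.create({"service.name": "telemetry-smoke-test"}))
provider.add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4318/v1/traces"))
)
trace.set_tracer_provider(provider)

with trace.get_tracer(__name__).start_as_current_span("smoke-test"):
    pass  # span body intentionally empty

provider.shutdown()  # flush the span to the collector before exiting
```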


@@ -357,7 +357,7 @@ server:
 8. Run the server:
 ```bash
-python -m llama_stack.core.server.server --yaml-config ~/.llama/run-byoa.yaml
+llama stack run ~/.llama/run-byoa.yaml
 ```
 9. Test the API:


@@ -170,7 +170,7 @@ spec:
 - name: llama-stack
 image: localhost/llama-stack-run-k8s:latest
 imagePullPolicy: IfNotPresent
-command: ["python", "-m", "llama_stack.core.server.server", "--config", "/app/config.yaml"]
+command: ["llama", "stack", "run", "/app/config.yaml"]
 ports:
 - containerPort: 5000
 volumeMounts:


@@ -52,7 +52,7 @@ spec:
 value: "${SAFETY_MODEL}"
 - name: TAVILY_SEARCH_API_KEY
 value: "${TAVILY_SEARCH_API_KEY}"
-command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8321"]
+command: ["llama", "stack", "run", "/etc/config/stack_run_config.yaml", "--port", "8321"]
 ports:
 - containerPort: 8321
 volumeMounts:


@@ -1,4 +1,7 @@
 ---
+description: "Files
+
+This API is used to upload documents that can be used with other Llama Stack APIs."
 sidebar_label: Files
 title: Files
 ---
@@ -7,4 +10,8 @@ title: Files
 ## Overview
+Files
+
+This API is used to upload documents that can be used with other Llama Stack APIs.
+
 This section contains documentation for all available providers for the **files** API.


@@ -1,5 +1,7 @@
 ---
-description: "Llama Stack Inference API for generating completions, chat completions, and embeddings.
+description: "Inference
+
+Llama Stack Inference API for generating completions, chat completions, and embeddings.
 This API provides the raw interface to the underlying models. Two kinds of models are supported:
 - LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
@@ -12,7 +14,9 @@ title: Inference
 ## Overview
-Llama Stack Inference API for generating completions, chat completions, and embeddings.
+Inference
+
+Llama Stack Inference API for generating completions, chat completions, and embeddings.
 This API provides the raw interface to the underlying models. Two kinds of models are supported:
 - LLM models: these models generate "raw" and "chat" (conversational) completions.


@@ -15,7 +15,7 @@ Databricks inference provider for running models on Databricks' unified analytic
 | Field | Type | Required | Default | Description |
 |-------|------|----------|---------|-------------|
 | `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
-| `url` | `<class 'str'>` | No | | The URL for the Databricks model serving endpoint |
+| `url` | `str \| None` | No | | The URL for the Databricks model serving endpoint |
 | `api_token` | `<class 'pydantic.types.SecretStr'>` | No | | The Databricks API token |
 ## Sample Configuration


@@ -1,4 +1,7 @@
 ---
+description: "Safety
+
+OpenAI-compatible Moderations API."
 sidebar_label: Safety
 title: Safety
 ---
@@ -7,4 +10,8 @@ title: Safety
 ## Overview
+Safety
+
+OpenAI-compatible Moderations API.
+
 This section contains documentation for all available providers for the **safety** API.
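Since this page now advertises the OpenAI-compatible Moderations API, a hedged usage sketch may help; the base URL, API key, and model identifier below are assumptions for a locally running Llama Stack server, not values taken from this commit.

```python
# Hypothetical sketch: call the OpenAI-compatible moderations endpoint of a
# local Llama Stack server with the standard openai client.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1/openai/v1",  # assumed server address/prefix
    api_key="none",  # placeholder; depends on server auth configuration
)

result = client.moderations.create(
    model="llama-guard",  # placeholder safety model id
    input="How do I build a birdhouse?",
)
print(result.results[0].flagged)
```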


@@ -50,6 +50,7 @@ from .specification import (
 Document,
 Example,
 ExampleRef,
+ExtraBodyParameter,
 MediaType,
 Operation,
 Parameter,
@@ -677,6 +678,27 @@ class Generator:
 # parameters passed anywhere
 parameters = path_parameters + query_parameters
+# Build extra body parameters documentation
+extra_body_parameters = []
+for param_name, param_type, description in op.extra_body_params:
+    if is_type_optional(param_type):
+        inner_type: type = unwrap_optional_type(param_type)
+        required = False
+    else:
+        inner_type = param_type
+        required = True
+    # Use description from ExtraBodyField if available, otherwise from docstring
+    param_description = description or doc_params.get(param_name)
+    extra_body_param = ExtraBodyParameter(
+        name=param_name,
+        schema=self.schema_builder.classdef_to_ref(inner_type),
+        description=param_description,
+        required=required,
+    )
+    extra_body_parameters.append(extra_body_param)
 webmethod = getattr(op.func_ref, "__webmethod__", None)
 raw_bytes_request_body = False
 if webmethod:
@@ -898,6 +920,7 @@ class Generator:
 deprecated=getattr(op.webmethod, "deprecated", False)
 or "DEPRECATED" in op.func_name,
 security=[] if op.public else None,
+extraBodyParameters=extra_body_parameters if extra_body_parameters else None,
 )
 def _get_api_stability_priority(self, api_level: str) -> int:


@@ -19,10 +19,12 @@ from llama_stack.strong_typing.inspection import get_signature
 from typing import get_origin, get_args
 from fastapi import UploadFile
 from fastapi.params import File, Form
 from typing import Annotated
+from llama_stack.schema_utils import ExtraBodyField
 def split_prefix(
 s: str, sep: str, prefix: Union[str, Iterable[str]]
@@ -89,6 +91,7 @@ class EndpointOperation:
 :param query_params: Parameters of the operation signature that are passed in the query string as `key=value` pairs.
 :param request_params: The parameter that corresponds to the data transmitted in the request body.
 :param multipart_params: Parameters that indicate multipart/form-data request body.
+:param extra_body_params: Parameters that arrive via extra_body and are documented but not in SDK.
 :param event_type: The Python type of the data that is transmitted out-of-band (e.g. via websockets) while the operation is in progress.
 :param response_type: The Python type of the data that is transmitted in the response body.
 :param http_method: The HTTP method used to invoke the endpoint such as POST, GET or PUT.
@@ -106,6 +109,7 @@ class EndpointOperation:
 query_params: List[OperationParameter]
 request_params: Optional[OperationParameter]
 multipart_params: List[OperationParameter]
+extra_body_params: List[tuple[str, type, str | None]]
 event_type: Optional[type]
 response_type: type
 http_method: HTTPMethod
@@ -265,6 +269,7 @@ def get_endpoint_operations(
 query_params = []
 request_params = []
 multipart_params = []
+extra_body_params = []
 for param_name, parameter in signature.parameters.items():
 param_type = _get_annotation_type(parameter.annotation, func_ref)
@@ -279,6 +284,13 @@ def get_endpoint_operations(
 f"parameter '{param_name}' in function '{func_name}' has no type annotation"
 )
+# Check if this is an extra_body parameter
+is_extra_body, extra_body_desc = _is_extra_body_param(param_type)
+if is_extra_body:
+    # Store in a separate list for documentation
+    extra_body_params.append((param_name, param_type, extra_body_desc))
+    continue  # Skip adding to request_params
 is_multipart = _is_multipart_param(param_type)
 if prefix in ["get", "delete"]:
@@ -351,6 +363,7 @@ def get_endpoint_operations(
 query_params=query_params,
 request_params=request_params,
 multipart_params=multipart_params,
+extra_body_params=extra_body_params,
 event_type=event_type,
 response_type=response_type,
 http_method=http_method,
@@ -403,7 +416,7 @@ def get_endpoint_events(endpoint: type) -> Dict[str, type]:
 def _is_multipart_param(param_type: type) -> bool:
 """
 Check if a parameter type indicates multipart form data.
 Returns True if the type is:
 - UploadFile
 - Annotated[UploadFile, File()]
@@ -413,19 +426,38 @@ def _is_multipart_param(param_type: type) -> bool:
 """
 if param_type is UploadFile:
 return True
 # Check for Annotated types
 origin = get_origin(param_type)
 if origin is None:
 return False
 if origin is Annotated:
 args = get_args(param_type)
 if len(args) < 2:
 return False
 # Check the annotations for File() or Form()
 for annotation in args[1:]:
 if isinstance(annotation, (File, Form)):
 return True
 return False
+def _is_extra_body_param(param_type: type) -> tuple[bool, str | None]:
+    """
+    Check if parameter is marked as coming from extra_body.
+
+    Returns:
+        (is_extra_body, description): Tuple of boolean and optional description
+    """
+    origin = get_origin(param_type)
+    if origin is Annotated:
+        args = get_args(param_type)
+        for annotation in args[1:]:
+            if isinstance(annotation, ExtraBodyField):
+                return True, annotation.description
+            # Also check by type name for cases where import matters
+            if type(annotation).__name__ == 'ExtraBodyField':
+                return True, getattr(annotation, 'description', None)
+    return False, None
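To make the new extra-body plumbing concrete, here is a minimal, self-contained sketch of the declaration pattern the generator looks for. Only the `Annotated[..., ExtraBodyField(...)]` shape and the `.description` attribute come from this diff; the stand-in `ExtraBodyField` class, the endpoint name, and its parameters are illustrative assumptions, not llama-stack code.

```python
# Hypothetical sketch of an extra-body parameter declaration and how it is
# detected (mirroring _is_extra_body_param above).
from typing import Annotated, get_args, get_origin


class ExtraBodyField:
    """Stand-in for llama_stack.schema_utils.ExtraBodyField; the generator only
    relies on the marker type and its .description attribute."""

    def __init__(self, description: str | None = None) -> None:
        self.description = description


async def create_response(
    input: str,
    model: str,
    # Documented under x-llama-stack-extra-body-params, excluded from the
    # generated request-body schema and from SDK signatures.
    shields: Annotated[
        list[str] | None,
        ExtraBodyField("List of shields to apply during response generation."),
    ] = None,
) -> dict:
    return {"input": input, "model": model, "shields": shields}


def is_extra_body_param(param_type: type) -> tuple[bool, str | None]:
    # Same logic as _is_extra_body_param: look for an ExtraBodyField marker in
    # the Annotated[...] metadata and surface its description.
    if get_origin(param_type) is Annotated:
        for annotation in get_args(param_type)[1:]:
            if isinstance(annotation, ExtraBodyField):
                return True, annotation.description
    return False, None


print(is_extra_body_param(create_response.__annotations__["shields"]))
# -> (True, 'List of shields to apply during response generation.')
print(is_extra_body_param(create_response.__annotations__["model"]))
# -> (False, None)
```

In the generated spec this surfaces as the `x-llama-stack-extra-body-params` extension shown in the JSON and YAML diffs below.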


@@ -106,6 +106,15 @@ class Parameter:
 example: Optional[Any] = None
+@dataclass
+class ExtraBodyParameter:
+    """Represents a parameter that arrives via extra_body in the request."""
+
+    name: str
+    schema: SchemaOrRef
+    description: Optional[str] = None
+    required: Optional[bool] = None
 @dataclass
 class Operation:
 responses: Dict[str, Union[Response, ResponseRef]]
@@ -118,6 +127,7 @@ class Operation:
 callbacks: Optional[Dict[str, "Callback"]] = None
 security: Optional[List["SecurityRequirement"]] = None
 deprecated: Optional[bool] = None
+extraBodyParameters: Optional[List[ExtraBodyParameter]] = None
 @dataclass


@@ -52,6 +52,17 @@ class Specification:
 if display_name:
 tag["x-displayName"] = display_name
+# Handle operations to rename extraBodyParameters -> x-llama-stack-extra-body-params
+paths = json_doc.get("paths", {})
+for path_item in paths.values():
+    if isinstance(path_item, dict):
+        for method in ["get", "post", "put", "delete", "patch"]:
+            operation = path_item.get(method)
+            if operation and isinstance(operation, dict):
+                extra_body_params = operation.pop("extraBodyParameters", None)
+                if extra_body_params:
+                    operation["x-llama-stack-extra-body-params"] = extra_body_params
 return json_doc
 def get_json_string(self, pretty_print: bool = False) -> str:


@ -1443,8 +1443,8 @@
"tags": [ "tags": [
"Inference" "Inference"
], ],
"summary": "List all chat completions.", "summary": "List chat completions.",
"description": "List all chat completions.", "description": "List chat completions.",
"parameters": [ "parameters": [
{ {
"name": "after", "name": "after",
@ -1520,8 +1520,8 @@
"tags": [ "tags": [
"Inference" "Inference"
], ],
"summary": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.", "summary": "Create chat completions.",
"description": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.", "description": "Create chat completions.\nGenerate an OpenAI-compatible chat completion for the given messages using the specified model.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -1565,8 +1565,8 @@
"tags": [ "tags": [
"Inference" "Inference"
], ],
"summary": "Describe a chat completion by its ID.", "summary": "Get chat completion.",
"description": "Describe a chat completion by its ID.", "description": "Get chat completion.\nDescribe a chat completion by its ID.",
"parameters": [ "parameters": [
{ {
"name": "completion_id", "name": "completion_id",
@ -1610,8 +1610,8 @@
"tags": [ "tags": [
"Inference" "Inference"
], ],
"summary": "Generate an OpenAI-compatible completion for the given prompt using the specified model.", "summary": "Create completion.",
"description": "Generate an OpenAI-compatible completion for the given prompt using the specified model.", "description": "Create completion.\nGenerate an OpenAI-compatible completion for the given prompt using the specified model.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -1655,8 +1655,8 @@
"tags": [ "tags": [
"Inference" "Inference"
], ],
"summary": "Generate OpenAI-compatible embeddings for the given input using the specified model.", "summary": "Create embeddings.",
"description": "Generate OpenAI-compatible embeddings for the given input using the specified model.", "description": "Create embeddings.\nGenerate OpenAI-compatible embeddings for the given input using the specified model.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -1700,8 +1700,8 @@
"tags": [ "tags": [
"Files" "Files"
], ],
"summary": "Returns a list of files that belong to the user's organization.", "summary": "List files.",
"description": "Returns a list of files that belong to the user's organization.", "description": "List files.\nReturns a list of files that belong to the user's organization.",
"parameters": [ "parameters": [
{ {
"name": "after", "name": "after",
@ -1770,8 +1770,8 @@
"tags": [ "tags": [
"Files" "Files"
], ],
"summary": "Upload a file that can be used across various endpoints.", "summary": "Upload file.",
"description": "Upload a file that can be used across various endpoints.\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.", "description": "Upload file.\nUpload a file that can be used across various endpoints.\n\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -1831,8 +1831,8 @@
"tags": [ "tags": [
"Files" "Files"
], ],
"summary": "Returns information about a specific file.", "summary": "Retrieve file.",
"description": "Returns information about a specific file.", "description": "Retrieve file.\nReturns information about a specific file.",
"parameters": [ "parameters": [
{ {
"name": "file_id", "name": "file_id",
@ -1874,8 +1874,8 @@
"tags": [ "tags": [
"Files" "Files"
], ],
"summary": "Delete a file.", "summary": "Delete file.",
"description": "Delete a file.", "description": "Delete file.",
"parameters": [ "parameters": [
{ {
"name": "file_id", "name": "file_id",
@ -1919,8 +1919,8 @@
"tags": [ "tags": [
"Files" "Files"
], ],
"summary": "Returns the contents of the specified file.", "summary": "Retrieve file content.",
"description": "Returns the contents of the specified file.", "description": "Retrieve file content.\nReturns the contents of the specified file.",
"parameters": [ "parameters": [
{ {
"name": "file_id", "name": "file_id",
@ -1999,8 +1999,8 @@
"tags": [ "tags": [
"Safety" "Safety"
], ],
"summary": "Classifies if text and/or image inputs are potentially harmful.", "summary": "Create moderation.",
"description": "Classifies if text and/or image inputs are potentially harmful.", "description": "Create moderation.\nClassifies if text and/or image inputs are potentially harmful.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -2044,8 +2044,8 @@
"tags": [ "tags": [
"Agents" "Agents"
], ],
"summary": "List all OpenAI responses.", "summary": "List all responses.",
"description": "List all OpenAI responses.", "description": "List all responses.",
"parameters": [ "parameters": [
{ {
"name": "after", "name": "after",
@ -2119,8 +2119,8 @@
"tags": [ "tags": [
"Agents" "Agents"
], ],
"summary": "Create a new OpenAI response.", "summary": "Create a model response.",
"description": "Create a new OpenAI response.", "description": "Create a model response.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@@ -2132,7 +2132,27 @@
 },
 "required": true
 },
-"deprecated": true
+"deprecated": true,
+"x-llama-stack-extra-body-params": [
+  {
+    "name": "shields",
+    "schema": {
+      "type": "array",
+      "items": {
+        "oneOf": [
+          {
+            "type": "string"
+          },
+          {
+            "$ref": "#/components/schemas/ResponseShieldSpec"
+          }
+        ]
+      }
+    },
+    "description": "List of shields to apply during response generation. Shields provide safety and content moderation.",
+    "required": false
+  }
+]
 }
 },
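Because `shields` is documented under `x-llama-stack-extra-body-params` rather than in the request schema, an OpenAI-compatible SDK has no named argument for it; here is a hedged client-side sketch of passing it through the SDK's `extra_body` escape hatch (the base URL, model id, and shield identifier are placeholders, not values from this commit).

```python
# Hypothetical client-side sketch, assuming a Llama Stack server on localhost:8321.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1/openai/v1",  # assumed OpenAI-compatible prefix
    api_key="none",  # placeholder; depends on server auth configuration
)

response = client.responses.create(
    model="llama3.2:3b",  # placeholder model id
    input="Tell me a one-sentence bedtime story about a unicorn.",
    # Not a named SDK argument: merged into the JSON request body as a top-level field.
    extra_body={"shields": ["my-shield-id"]},  # placeholder shield id
)
print(response.output_text)
```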
"/v1/openai/v1/responses/{response_id}": { "/v1/openai/v1/responses/{response_id}": {
@ -2164,8 +2184,8 @@
"tags": [ "tags": [
"Agents" "Agents"
], ],
"summary": "Retrieve an OpenAI response by its ID.", "summary": "Get a model response.",
"description": "Retrieve an OpenAI response by its ID.", "description": "Get a model response.",
"parameters": [ "parameters": [
{ {
"name": "response_id", "name": "response_id",
@ -2207,8 +2227,8 @@
"tags": [ "tags": [
"Agents" "Agents"
], ],
"summary": "Delete an OpenAI response by its ID.", "summary": "Delete a response.",
"description": "Delete an OpenAI response by its ID.", "description": "Delete a response.",
"parameters": [ "parameters": [
{ {
"name": "response_id", "name": "response_id",
@ -2252,8 +2272,8 @@
"tags": [ "tags": [
"Agents" "Agents"
], ],
"summary": "List input items for a given OpenAI response.", "summary": "List input items.",
"description": "List input items for a given OpenAI response.", "description": "List input items.",
"parameters": [ "parameters": [
{ {
"name": "response_id", "name": "response_id",
@@ -9521,6 +9541,21 @@
 "title": "OpenAIResponseText",
 "description": "Text response configuration for OpenAI responses."
 },
+"ResponseShieldSpec": {
+  "type": "object",
+  "properties": {
+    "type": {
+      "type": "string",
+      "description": "The type/identifier of the shield."
+    }
+  },
+  "additionalProperties": false,
+  "required": [
+    "type"
+  ],
+  "title": "ResponseShieldSpec",
+  "description": "Specification for a shield to apply during response generation."
+},
 "OpenAIResponseInputTool": {
"oneOf": [ "oneOf": [
{ {
@@ -13331,12 +13366,13 @@
 },
 {
 "name": "Files",
-"description": ""
+"description": "This API is used to upload documents that can be used with other Llama Stack APIs.",
+"x-displayName": "Files"
 },
 {
 "name": "Inference",
-"description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
+"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
-"x-displayName": "Llama Stack Inference API for generating completions, chat completions, and embeddings."
+"x-displayName": "Inference"
 },
 {
 "name": "Models",
@@ -13348,7 +13384,8 @@
 },
 {
 "name": "Safety",
-"description": ""
+"description": "OpenAI-compatible Moderations API.",
+"x-displayName": "Safety"
 },
 {
 "name": "Telemetry",


@ -1033,8 +1033,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inference - Inference
summary: List all chat completions. summary: List chat completions.
description: List all chat completions. description: List chat completions.
parameters: parameters:
- name: after - name: after
in: query in: query
@ -1087,10 +1087,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inference - Inference
summary: >- summary: Create chat completions.
Generate an OpenAI-compatible chat completion for the given messages using
the specified model.
description: >- description: >-
Create chat completions.
Generate an OpenAI-compatible chat completion for the given messages using Generate an OpenAI-compatible chat completion for the given messages using
the specified model. the specified model.
parameters: [] parameters: []
@ -1122,8 +1122,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inference - Inference
summary: Describe a chat completion by its ID. summary: Get chat completion.
description: Describe a chat completion by its ID. description: >-
Get chat completion.
Describe a chat completion by its ID.
parameters: parameters:
- name: completion_id - name: completion_id
in: path in: path
@ -1153,10 +1156,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inference - Inference
summary: >- summary: Create completion.
Generate an OpenAI-compatible completion for the given prompt using the specified
model.
description: >- description: >-
Create completion.
Generate an OpenAI-compatible completion for the given prompt using the specified Generate an OpenAI-compatible completion for the given prompt using the specified
model. model.
parameters: [] parameters: []
@ -1189,10 +1192,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inference - Inference
summary: >- summary: Create embeddings.
Generate OpenAI-compatible embeddings for the given input using the specified
model.
description: >- description: >-
Create embeddings.
Generate OpenAI-compatible embeddings for the given input using the specified Generate OpenAI-compatible embeddings for the given input using the specified
model. model.
parameters: [] parameters: []
@ -1225,9 +1228,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Files - Files
summary: >- summary: List files.
Returns a list of files that belong to the user's organization.
description: >- description: >-
List files.
Returns a list of files that belong to the user's organization. Returns a list of files that belong to the user's organization.
parameters: parameters:
- name: after - name: after
@ -1285,11 +1289,13 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Files - Files
summary: >- summary: Upload file.
Upload a file that can be used across various endpoints.
description: >- description: >-
Upload file.
Upload a file that can be used across various endpoints. Upload a file that can be used across various endpoints.
The file upload should be a multipart form request with: The file upload should be a multipart form request with:
- file: The File object (not file name) to be uploaded. - file: The File object (not file name) to be uploaded.
@ -1338,9 +1344,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Files - Files
summary: >- summary: Retrieve file.
Returns information about a specific file.
description: >- description: >-
Retrieve file.
Returns information about a specific file. Returns information about a specific file.
parameters: parameters:
- name: file_id - name: file_id
@ -1372,8 +1379,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Files - Files
summary: Delete a file. summary: Delete file.
description: Delete a file. description: Delete file.
parameters: parameters:
- name: file_id - name: file_id
in: path in: path
@ -1405,9 +1412,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Files - Files
summary: >- summary: Retrieve file content.
Returns the contents of the specified file.
description: >- description: >-
Retrieve file content.
Returns the contents of the specified file. Returns the contents of the specified file.
parameters: parameters:
- name: file_id - name: file_id
@ -1464,9 +1472,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Safety - Safety
summary: >- summary: Create moderation.
Classifies if text and/or image inputs are potentially harmful.
description: >- description: >-
Create moderation.
Classifies if text and/or image inputs are potentially harmful. Classifies if text and/or image inputs are potentially harmful.
parameters: [] parameters: []
requestBody: requestBody:
@ -1497,8 +1506,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Agents - Agents
summary: List all OpenAI responses. summary: List all responses.
description: List all OpenAI responses. description: List all responses.
parameters: parameters:
- name: after - name: after
in: query in: query
@ -1549,8 +1558,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Agents - Agents
summary: Create a new OpenAI response. summary: Create a model response.
description: Create a new OpenAI response. description: Create a model response.
parameters: [] parameters: []
requestBody: requestBody:
content: content:
@@ -1559,6 +1568,18 @@ paths:
 $ref: '#/components/schemas/CreateOpenaiResponseRequest'
 required: true
 deprecated: true
+x-llama-stack-extra-body-params:
+  - name: shields
+    schema:
+      type: array
+      items:
+        oneOf:
+          - type: string
+          - $ref: '#/components/schemas/ResponseShieldSpec'
+    description: >-
+      List of shields to apply during response generation. Shields provide safety
+      and content moderation.
+    required: false
 /v1/openai/v1/responses/{response_id}:
get: get:
responses: responses:
@ -1580,8 +1601,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Agents - Agents
summary: Retrieve an OpenAI response by its ID. summary: Get a model response.
description: Retrieve an OpenAI response by its ID. description: Get a model response.
parameters: parameters:
- name: response_id - name: response_id
in: path in: path
@ -1611,8 +1632,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Agents - Agents
summary: Delete an OpenAI response by its ID. summary: Delete a response.
description: Delete an OpenAI response by its ID. description: Delete a response.
parameters: parameters:
- name: response_id - name: response_id
in: path in: path
@ -1642,10 +1663,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Agents - Agents
summary: >- summary: List input items.
List input items for a given OpenAI response. description: List input items.
description: >-
List input items for a given OpenAI response.
parameters: parameters:
- name: response_id - name: response_id
in: path in: path
@@ -7076,6 +7095,18 @@ components:
 title: OpenAIResponseText
 description: >-
 Text response configuration for OpenAI responses.
+ResponseShieldSpec:
+  type: object
+  properties:
+    type:
+      type: string
+      description: The type/identifier of the shield.
+  additionalProperties: false
+  required:
+    - type
+  title: ResponseShieldSpec
+  description: >-
+    Specification for a shield to apply during response generation.
 OpenAIResponseInputTool:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch' - $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch'
@@ -9987,9 +10018,16 @@ tags:
 x-displayName: >-
 Llama Stack Evaluation API for running evaluations on model and agent candidates.
 - name: Files
-description: ''
+description: >-
+  This API is used to upload documents that can be used with other Llama Stack
+  APIs.
+x-displayName: Files
 - name: Inference
 description: >-
+  Llama Stack Inference API for generating completions, chat completions, and
+  embeddings.
 This API provides the raw interface to the underlying models. Two kinds of models
 are supported:
@@ -9997,15 +10035,14 @@ tags:
 - Embedding models: these models generate embeddings to be used for semantic
 search.
-x-displayName: >-
-  Llama Stack Inference API for generating completions, chat completions, and
-  embeddings.
+x-displayName: Inference
 - name: Models
 description: ''
 - name: PostTraining (Coming Soon)
 description: ''
 - name: Safety
-description: ''
+description: OpenAI-compatible Moderations API.
+x-displayName: Safety
 - name: Telemetry
 description: ''
 - name: VectorIO


@ -69,8 +69,8 @@
"tags": [ "tags": [
"Inference" "Inference"
], ],
"summary": "List all chat completions.", "summary": "List chat completions.",
"description": "List all chat completions.", "description": "List chat completions.",
"parameters": [ "parameters": [
{ {
"name": "after", "name": "after",
@ -146,8 +146,8 @@
"tags": [ "tags": [
"Inference" "Inference"
], ],
"summary": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.", "summary": "Create chat completions.",
"description": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.", "description": "Create chat completions.\nGenerate an OpenAI-compatible chat completion for the given messages using the specified model.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -191,8 +191,8 @@
"tags": [ "tags": [
"Inference" "Inference"
], ],
"summary": "Describe a chat completion by its ID.", "summary": "Get chat completion.",
"description": "Describe a chat completion by its ID.", "description": "Get chat completion.\nDescribe a chat completion by its ID.",
"parameters": [ "parameters": [
{ {
"name": "completion_id", "name": "completion_id",
@ -236,8 +236,8 @@
"tags": [ "tags": [
"Inference" "Inference"
], ],
"summary": "Generate an OpenAI-compatible completion for the given prompt using the specified model.", "summary": "Create completion.",
"description": "Generate an OpenAI-compatible completion for the given prompt using the specified model.", "description": "Create completion.\nGenerate an OpenAI-compatible completion for the given prompt using the specified model.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -758,8 +758,8 @@
"tags": [ "tags": [
"Inference" "Inference"
], ],
"summary": "Generate OpenAI-compatible embeddings for the given input using the specified model.", "summary": "Create embeddings.",
"description": "Generate OpenAI-compatible embeddings for the given input using the specified model.", "description": "Create embeddings.\nGenerate OpenAI-compatible embeddings for the given input using the specified model.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -803,8 +803,8 @@
"tags": [ "tags": [
"Files" "Files"
], ],
"summary": "Returns a list of files that belong to the user's organization.", "summary": "List files.",
"description": "Returns a list of files that belong to the user's organization.", "description": "List files.\nReturns a list of files that belong to the user's organization.",
"parameters": [ "parameters": [
{ {
"name": "after", "name": "after",
@ -873,8 +873,8 @@
"tags": [ "tags": [
"Files" "Files"
], ],
"summary": "Upload a file that can be used across various endpoints.", "summary": "Upload file.",
"description": "Upload a file that can be used across various endpoints.\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.", "description": "Upload file.\nUpload a file that can be used across various endpoints.\n\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -934,8 +934,8 @@
"tags": [ "tags": [
"Files" "Files"
], ],
"summary": "Returns information about a specific file.", "summary": "Retrieve file.",
"description": "Returns information about a specific file.", "description": "Retrieve file.\nReturns information about a specific file.",
"parameters": [ "parameters": [
{ {
"name": "file_id", "name": "file_id",
@ -977,8 +977,8 @@
"tags": [ "tags": [
"Files" "Files"
], ],
"summary": "Delete a file.", "summary": "Delete file.",
"description": "Delete a file.", "description": "Delete file.",
"parameters": [ "parameters": [
{ {
"name": "file_id", "name": "file_id",
@ -1022,8 +1022,8 @@
"tags": [ "tags": [
"Files" "Files"
], ],
"summary": "Returns the contents of the specified file.", "summary": "Retrieve file content.",
"description": "Returns the contents of the specified file.", "description": "Retrieve file content.\nReturns the contents of the specified file.",
"parameters": [ "parameters": [
{ {
"name": "file_id", "name": "file_id",
@ -1067,8 +1067,8 @@
"tags": [ "tags": [
"Inspect" "Inspect"
], ],
"summary": "Get the current health status of the service.", "summary": "Get health status.",
"description": "Get the current health status of the service.", "description": "Get health status.\nGet the current health status of the service.",
"parameters": [], "parameters": [],
"deprecated": false "deprecated": false
} }
@ -1102,8 +1102,8 @@
"tags": [ "tags": [
"Inspect" "Inspect"
], ],
"summary": "List all available API routes with their methods and implementing providers.", "summary": "List routes.",
"description": "List all available API routes with their methods and implementing providers.", "description": "List routes.\nList all available API routes with their methods and implementing providers.",
"parameters": [], "parameters": [],
"deprecated": false "deprecated": false
} }
@ -1170,8 +1170,8 @@
"tags": [ "tags": [
"Models" "Models"
], ],
"summary": "Register a model.", "summary": "Register model.",
"description": "Register a model.", "description": "Register model.\nRegister a model.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -1215,8 +1215,8 @@
"tags": [ "tags": [
"Models" "Models"
], ],
"summary": "Get a model by its identifier.", "summary": "Get model.",
"description": "Get a model by its identifier.", "description": "Get model.\nGet a model by its identifier.",
"parameters": [ "parameters": [
{ {
"name": "model_id", "name": "model_id",
@ -1251,8 +1251,8 @@
"tags": [ "tags": [
"Models" "Models"
], ],
"summary": "Unregister a model.", "summary": "Unregister model.",
"description": "Unregister a model.", "description": "Unregister model.\nUnregister a model.",
"parameters": [ "parameters": [
{ {
"name": "model_id", "name": "model_id",
@ -1296,8 +1296,8 @@
"tags": [ "tags": [
"Safety" "Safety"
], ],
"summary": "Classifies if text and/or image inputs are potentially harmful.", "summary": "Create moderation.",
"description": "Classifies if text and/or image inputs are potentially harmful.", "description": "Create moderation.\nClassifies if text and/or image inputs are potentially harmful.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -1374,8 +1374,8 @@
"tags": [ "tags": [
"Prompts" "Prompts"
], ],
"summary": "Create a new prompt.", "summary": "Create prompt.",
"description": "Create a new prompt.", "description": "Create prompt.\nCreate a new prompt.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -1419,8 +1419,8 @@
"tags": [ "tags": [
"Prompts" "Prompts"
], ],
"summary": "Get a prompt by its identifier and optional version.", "summary": "Get prompt.",
"description": "Get a prompt by its identifier and optional version.", "description": "Get prompt.\nGet a prompt by its identifier and optional version.",
"parameters": [ "parameters": [
{ {
"name": "prompt_id", "name": "prompt_id",
@ -1471,8 +1471,8 @@
"tags": [ "tags": [
"Prompts" "Prompts"
], ],
"summary": "Update an existing prompt (increments version).", "summary": "Update prompt.",
"description": "Update an existing prompt (increments version).", "description": "Update prompt.\nUpdate an existing prompt (increments version).",
"parameters": [ "parameters": [
{ {
"name": "prompt_id", "name": "prompt_id",
@ -1517,8 +1517,8 @@
"tags": [ "tags": [
"Prompts" "Prompts"
], ],
"summary": "Delete a prompt.", "summary": "Delete prompt.",
"description": "Delete a prompt.", "description": "Delete prompt.\nDelete a prompt.",
"parameters": [ "parameters": [
{ {
"name": "prompt_id", "name": "prompt_id",
@ -1562,8 +1562,8 @@
"tags": [ "tags": [
"Prompts" "Prompts"
], ],
"summary": "Set which version of a prompt should be the default in get_prompt (latest).", "summary": "Set prompt version.",
"description": "Set which version of a prompt should be the default in get_prompt (latest).", "description": "Set prompt version.\nSet which version of a prompt should be the default in get_prompt (latest).",
"parameters": [ "parameters": [
{ {
"name": "prompt_id", "name": "prompt_id",
@ -1617,8 +1617,8 @@
"tags": [ "tags": [
"Prompts" "Prompts"
], ],
"summary": "List all versions of a specific prompt.", "summary": "List prompt versions.",
"description": "List all versions of a specific prompt.", "description": "List prompt versions.\nList all versions of a specific prompt.",
"parameters": [ "parameters": [
{ {
"name": "prompt_id", "name": "prompt_id",
@ -1662,8 +1662,8 @@
"tags": [ "tags": [
"Providers" "Providers"
], ],
"summary": "List all available providers.", "summary": "List providers.",
"description": "List all available providers.", "description": "List providers.\nList all available providers.",
"parameters": [], "parameters": [],
"deprecated": false "deprecated": false
} }
@ -1697,8 +1697,8 @@
"tags": [ "tags": [
"Providers" "Providers"
], ],
"summary": "Get detailed information about a specific provider.", "summary": "Get provider.",
"description": "Get detailed information about a specific provider.", "description": "Get provider.\nGet detailed information about a specific provider.",
"parameters": [ "parameters": [
{ {
"name": "provider_id", "name": "provider_id",
@ -1742,8 +1742,8 @@
"tags": [ "tags": [
"Agents" "Agents"
], ],
"summary": "List all OpenAI responses.", "summary": "List all responses.",
"description": "List all OpenAI responses.", "description": "List all responses.",
"parameters": [ "parameters": [
{ {
"name": "after", "name": "after",
@ -1817,8 +1817,8 @@
"tags": [ "tags": [
"Agents" "Agents"
], ],
"summary": "Create a new OpenAI response.", "summary": "Create a model response.",
"description": "Create a new OpenAI response.", "description": "Create a model response.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -1830,7 +1830,27 @@
}, },
"required": true "required": true
}, },
"deprecated": false "deprecated": false,
"x-llama-stack-extra-body-params": [
{
"name": "shields",
"schema": {
"type": "array",
"items": {
"oneOf": [
{
"type": "string"
},
{
"$ref": "#/components/schemas/ResponseShieldSpec"
}
]
}
},
"description": "List of shields to apply during response generation. Shields provide safety and content moderation.",
"required": false
}
]
} }
}, },
"/v1/responses/{response_id}": { "/v1/responses/{response_id}": {
@ -1862,8 +1882,8 @@
"tags": [ "tags": [
"Agents" "Agents"
], ],
"summary": "Retrieve an OpenAI response by its ID.", "summary": "Get a model response.",
"description": "Retrieve an OpenAI response by its ID.", "description": "Get a model response.",
"parameters": [ "parameters": [
{ {
"name": "response_id", "name": "response_id",
@ -1905,8 +1925,8 @@
"tags": [ "tags": [
"Agents" "Agents"
], ],
"summary": "Delete an OpenAI response by its ID.", "summary": "Delete a response.",
"description": "Delete an OpenAI response by its ID.", "description": "Delete a response.",
"parameters": [ "parameters": [
{ {
"name": "response_id", "name": "response_id",
@ -1950,8 +1970,8 @@
"tags": [ "tags": [
"Agents" "Agents"
], ],
"summary": "List input items for a given OpenAI response.", "summary": "List input items.",
"description": "List input items for a given OpenAI response.", "description": "List input items.",
"parameters": [ "parameters": [
{ {
"name": "response_id", "name": "response_id",
@ -2043,8 +2063,8 @@
"tags": [ "tags": [
"Safety" "Safety"
], ],
"summary": "Run a shield.", "summary": "Run shield.",
"description": "Run a shield.", "description": "Run shield.\nRun a shield.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -4176,8 +4196,8 @@
"tags": [ "tags": [
"Inspect" "Inspect"
], ],
"summary": "Get the version of the service.", "summary": "Get version.",
"description": "Get the version of the service.", "description": "Get version.\nGet the version of the service.",
"parameters": [], "parameters": [],
"deprecated": false "deprecated": false
} }
@ -7616,6 +7636,21 @@
"title": "OpenAIResponseText", "title": "OpenAIResponseText",
"description": "Text response configuration for OpenAI responses." "description": "Text response configuration for OpenAI responses."
}, },
"ResponseShieldSpec": {
"type": "object",
"properties": {
"type": {
"type": "string",
"description": "The type/identifier of the shield."
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "ResponseShieldSpec",
"description": "Specification for a shield to apply during response generation."
},
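Aside: a minimal sketch of how a client might supply the new shields extra-body parameter documented above, together with the ResponseShieldSpec shape. Forwarding it through extra_body of an OpenAI-compatible SDK is an assumption; the model id, shield id, and server URL are illustrative only.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

response = client.responses.create(
    model="llama3.2:3b",
    input="Summarize the incident report in one sentence.",
    extra_body={
        # Each entry is either a bare shield identifier or a ResponseShieldSpec object.
        "shields": ["llama-guard", {"type": "llama-guard"}],
    },
)
print(response.output_text)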
"OpenAIResponseInputTool": { "OpenAIResponseInputTool": {
"oneOf": [ "oneOf": [
{ {
@ -12879,16 +12914,18 @@
}, },
{ {
"name": "Files", "name": "Files",
"description": "" "description": "This API is used to upload documents that can be used with other Llama Stack APIs.",
"x-displayName": "Files"
}, },
{ {
"name": "Inference", "name": "Inference",
"description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.", "description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Llama Stack Inference API for generating completions, chat completions, and embeddings." "x-displayName": "Inference"
}, },
{ {
"name": "Inspect", "name": "Inspect",
"description": "" "description": "APIs for inspecting the Llama Stack service, including health status, available API routes with methods and implementing providers.",
"x-displayName": "Inspect"
}, },
{ {
"name": "Models", "name": "Models",
@ -12896,17 +12933,18 @@
}, },
{ {
"name": "Prompts", "name": "Prompts",
"description": "", "description": "Protocol for prompt management operations.",
"x-displayName": "Protocol for prompt management operations." "x-displayName": "Prompts"
}, },
{ {
"name": "Providers", "name": "Providers",
"description": "", "description": "Providers API for inspecting, listing, and modifying providers and their configurations.",
"x-displayName": "Providers API for inspecting, listing, and modifying providers and their configurations." "x-displayName": "Providers"
}, },
{ {
"name": "Safety", "name": "Safety",
"description": "" "description": "OpenAI-compatible Moderations API.",
"x-displayName": "Safety"
}, },
{ {
"name": "Scoring", "name": "Scoring",
View file
@ -33,8 +33,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inference - Inference
summary: List all chat completions. summary: List chat completions.
description: List all chat completions. description: List chat completions.
parameters: parameters:
- name: after - name: after
in: query in: query
@ -87,10 +87,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inference - Inference
summary: >- summary: Create chat completions.
Generate an OpenAI-compatible chat completion for the given messages using
the specified model.
description: >- description: >-
Create chat completions.
Generate an OpenAI-compatible chat completion for the given messages using Generate an OpenAI-compatible chat completion for the given messages using
the specified model. the specified model.
parameters: [] parameters: []
@ -122,8 +122,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inference - Inference
summary: Describe a chat completion by its ID. summary: Get chat completion.
description: Describe a chat completion by its ID. description: >-
Get chat completion.
Describe a chat completion by its ID.
parameters: parameters:
- name: completion_id - name: completion_id
in: path in: path
@ -153,10 +156,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inference - Inference
summary: >- summary: Create completion.
Generate an OpenAI-compatible completion for the given prompt using the specified
model.
description: >- description: >-
Create completion.
Generate an OpenAI-compatible completion for the given prompt using the specified Generate an OpenAI-compatible completion for the given prompt using the specified
model. model.
parameters: [] parameters: []
@ -603,10 +606,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inference - Inference
summary: >- summary: Create embeddings.
Generate OpenAI-compatible embeddings for the given input using the specified
model.
description: >- description: >-
Create embeddings.
Generate OpenAI-compatible embeddings for the given input using the specified Generate OpenAI-compatible embeddings for the given input using the specified
model. model.
parameters: [] parameters: []
@ -639,9 +642,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Files - Files
summary: >- summary: List files.
Returns a list of files that belong to the user's organization.
description: >- description: >-
List files.
Returns a list of files that belong to the user's organization. Returns a list of files that belong to the user's organization.
parameters: parameters:
- name: after - name: after
@ -699,11 +703,13 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Files - Files
summary: >- summary: Upload file.
Upload a file that can be used across various endpoints.
description: >- description: >-
Upload file.
Upload a file that can be used across various endpoints. Upload a file that can be used across various endpoints.
The file upload should be a multipart form request with: The file upload should be a multipart form request with:
- file: The File object (not file name) to be uploaded. - file: The File object (not file name) to be uploaded.
@ -752,9 +758,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Files - Files
summary: >- summary: Retrieve file.
Returns information about a specific file.
description: >- description: >-
Retrieve file.
Returns information about a specific file. Returns information about a specific file.
parameters: parameters:
- name: file_id - name: file_id
@ -786,8 +793,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Files - Files
summary: Delete a file. summary: Delete file.
description: Delete a file. description: Delete file.
parameters: parameters:
- name: file_id - name: file_id
in: path in: path
@ -819,9 +826,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Files - Files
summary: >- summary: Retrieve file content.
Returns the contents of the specified file.
description: >- description: >-
Retrieve file content.
Returns the contents of the specified file. Returns the contents of the specified file.
parameters: parameters:
- name: file_id - name: file_id
@ -854,9 +862,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inspect - Inspect
summary: >- summary: Get health status.
Get the current health status of the service.
description: >- description: >-
Get health status.
Get the current health status of the service. Get the current health status of the service.
parameters: [] parameters: []
deprecated: false deprecated: false
@ -882,9 +891,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inspect - Inspect
summary: >- summary: List routes.
List all available API routes with their methods and implementing providers.
description: >- description: >-
List routes.
List all available API routes with their methods and implementing providers. List all available API routes with their methods and implementing providers.
parameters: [] parameters: []
deprecated: false deprecated: false
@ -933,8 +943,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Models - Models
summary: Register a model. summary: Register model.
description: Register a model. description: >-
Register model.
Register a model.
parameters: [] parameters: []
requestBody: requestBody:
content: content:
@ -964,8 +977,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Models - Models
summary: Get a model by its identifier. summary: Get model.
description: Get a model by its identifier. description: >-
Get model.
Get a model by its identifier.
parameters: parameters:
- name: model_id - name: model_id
in: path in: path
@ -990,8 +1006,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Models - Models
summary: Unregister a model. summary: Unregister model.
description: Unregister a model. description: >-
Unregister model.
Unregister a model.
parameters: parameters:
- name: model_id - name: model_id
in: path in: path
@ -1022,9 +1041,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Safety - Safety
summary: >- summary: Create moderation.
Classifies if text and/or image inputs are potentially harmful.
description: >- description: >-
Create moderation.
Classifies if text and/or image inputs are potentially harmful. Classifies if text and/or image inputs are potentially harmful.
parameters: [] parameters: []
requestBody: requestBody:
@ -1080,8 +1100,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Prompts - Prompts
summary: Create a new prompt. summary: Create prompt.
description: Create a new prompt. description: >-
Create prompt.
Create a new prompt.
parameters: [] parameters: []
requestBody: requestBody:
content: content:
@ -1111,9 +1134,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Prompts - Prompts
summary: >- summary: Get prompt.
Get a prompt by its identifier and optional version.
description: >- description: >-
Get prompt.
Get a prompt by its identifier and optional version. Get a prompt by its identifier and optional version.
parameters: parameters:
- name: prompt_id - name: prompt_id
@ -1151,9 +1175,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Prompts - Prompts
summary: >- summary: Update prompt.
Update an existing prompt (increments version).
description: >- description: >-
Update prompt.
Update an existing prompt (increments version). Update an existing prompt (increments version).
parameters: parameters:
- name: prompt_id - name: prompt_id
@ -1185,8 +1210,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Prompts - Prompts
summary: Delete a prompt. summary: Delete prompt.
description: Delete a prompt. description: >-
Delete prompt.
Delete a prompt.
parameters: parameters:
- name: prompt_id - name: prompt_id
in: path in: path
@ -1217,9 +1245,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Prompts - Prompts
summary: >- summary: Set prompt version.
Set which version of a prompt should be the default in get_prompt (latest).
description: >- description: >-
Set prompt version.
Set which version of a prompt should be the default in get_prompt (latest). Set which version of a prompt should be the default in get_prompt (latest).
parameters: parameters:
- name: prompt_id - name: prompt_id
@ -1257,8 +1286,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Prompts - Prompts
summary: List all versions of a specific prompt. summary: List prompt versions.
description: List all versions of a specific prompt. description: >-
List prompt versions.
List all versions of a specific prompt.
parameters: parameters:
- name: prompt_id - name: prompt_id
in: path in: path
@ -1290,8 +1322,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Providers - Providers
summary: List all available providers. summary: List providers.
description: List all available providers. description: >-
List providers.
List all available providers.
parameters: [] parameters: []
deprecated: false deprecated: false
/v1/providers/{provider_id}: /v1/providers/{provider_id}:
@ -1316,9 +1351,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Providers - Providers
summary: >- summary: Get provider.
Get detailed information about a specific provider.
description: >- description: >-
Get provider.
Get detailed information about a specific provider. Get detailed information about a specific provider.
parameters: parameters:
- name: provider_id - name: provider_id
@ -1349,8 +1385,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Agents - Agents
summary: List all OpenAI responses. summary: List all responses.
description: List all OpenAI responses. description: List all responses.
parameters: parameters:
- name: after - name: after
in: query in: query
@ -1401,8 +1437,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Agents - Agents
summary: Create a new OpenAI response. summary: Create a model response.
description: Create a new OpenAI response. description: Create a model response.
parameters: [] parameters: []
requestBody: requestBody:
content: content:
@ -1411,6 +1447,18 @@ paths:
$ref: '#/components/schemas/CreateOpenaiResponseRequest' $ref: '#/components/schemas/CreateOpenaiResponseRequest'
required: true required: true
deprecated: false deprecated: false
x-llama-stack-extra-body-params:
- name: shields
schema:
type: array
items:
oneOf:
- type: string
- $ref: '#/components/schemas/ResponseShieldSpec'
description: >-
List of shields to apply during response generation. Shields provide safety
and content moderation.
required: false
/v1/responses/{response_id}: /v1/responses/{response_id}:
get: get:
responses: responses:
@ -1432,8 +1480,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Agents - Agents
summary: Retrieve an OpenAI response by its ID. summary: Get a model response.
description: Retrieve an OpenAI response by its ID. description: Get a model response.
parameters: parameters:
- name: response_id - name: response_id
in: path in: path
@ -1463,8 +1511,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Agents - Agents
summary: Delete an OpenAI response by its ID. summary: Delete a response.
description: Delete an OpenAI response by its ID. description: Delete a response.
parameters: parameters:
- name: response_id - name: response_id
in: path in: path
@ -1494,10 +1542,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Agents - Agents
summary: >- summary: List input items.
List input items for a given OpenAI response. description: List input items.
description: >-
List input items for a given OpenAI response.
parameters: parameters:
- name: response_id - name: response_id
in: path in: path
@ -1566,8 +1612,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Safety - Safety
summary: Run a shield. summary: Run shield.
description: Run a shield. description: >-
Run shield.
Run a shield.
parameters: [] parameters: []
requestBody: requestBody:
content: content:
@ -3123,8 +3172,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inspect - Inspect
summary: Get the version of the service. summary: Get version.
description: Get the version of the service. description: >-
Get version.
Get the version of the service.
parameters: [] parameters: []
deprecated: false deprecated: false
jsonSchemaDialect: >- jsonSchemaDialect: >-
@ -5739,6 +5791,18 @@ components:
title: OpenAIResponseText title: OpenAIResponseText
description: >- description: >-
Text response configuration for OpenAI responses. Text response configuration for OpenAI responses.
ResponseShieldSpec:
type: object
properties:
type:
type: string
description: The type/identifier of the shield.
additionalProperties: false
required:
- type
title: ResponseShieldSpec
description: >-
Specification for a shield to apply during response generation.
OpenAIResponseInputTool: OpenAIResponseInputTool:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch' - $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch'
@ -9725,9 +9789,16 @@ tags:
x-displayName: >- x-displayName: >-
Protocol for conversation management operations. Protocol for conversation management operations.
- name: Files - name: Files
description: '' description: >-
This API is used to upload documents that can be used with other Llama Stack
APIs.
x-displayName: Files
- name: Inference - name: Inference
description: >- description: >-
Llama Stack Inference API for generating completions, chat completions, and
embeddings.
This API provides the raw interface to the underlying models. Two kinds of models This API provides the raw interface to the underlying models. Two kinds of models
are supported: are supported:
@ -9735,23 +9806,25 @@ tags:
- Embedding models: these models generate embeddings to be used for semantic - Embedding models: these models generate embeddings to be used for semantic
search. search.
x-displayName: >- x-displayName: Inference
Llama Stack Inference API for generating completions, chat completions, and
embeddings.
- name: Inspect - name: Inspect
description: '' description: >-
APIs for inspecting the Llama Stack service, including health status, available
API routes with methods and implementing providers.
x-displayName: Inspect
- name: Models - name: Models
description: '' description: ''
- name: Prompts - name: Prompts
description: '' description: >-
x-displayName: >-
Protocol for prompt management operations. Protocol for prompt management operations.
x-displayName: Prompts
- name: Providers - name: Providers
description: '' description: >-
x-displayName: >-
Providers API for inspecting, listing, and modifying providers and their configurations. Providers API for inspecting, listing, and modifying providers and their configurations.
x-displayName: Providers
- name: Safety - name: Safety
description: '' description: OpenAI-compatible Moderations API.
x-displayName: Safety
- name: Scoring - name: Scoring
description: '' description: ''
- name: ScoringFunctions - name: ScoringFunctions
View file
@ -69,8 +69,8 @@
"tags": [ "tags": [
"Inference" "Inference"
], ],
"summary": "List all chat completions.", "summary": "List chat completions.",
"description": "List all chat completions.", "description": "List chat completions.",
"parameters": [ "parameters": [
{ {
"name": "after", "name": "after",
@ -146,8 +146,8 @@
"tags": [ "tags": [
"Inference" "Inference"
], ],
"summary": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.", "summary": "Create chat completions.",
"description": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.", "description": "Create chat completions.\nGenerate an OpenAI-compatible chat completion for the given messages using the specified model.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -191,8 +191,8 @@
"tags": [ "tags": [
"Inference" "Inference"
], ],
"summary": "Describe a chat completion by its ID.", "summary": "Get chat completion.",
"description": "Describe a chat completion by its ID.", "description": "Get chat completion.\nDescribe a chat completion by its ID.",
"parameters": [ "parameters": [
{ {
"name": "completion_id", "name": "completion_id",
@ -236,8 +236,8 @@
"tags": [ "tags": [
"Inference" "Inference"
], ],
"summary": "Generate an OpenAI-compatible completion for the given prompt using the specified model.", "summary": "Create completion.",
"description": "Generate an OpenAI-compatible completion for the given prompt using the specified model.", "description": "Create completion.\nGenerate an OpenAI-compatible completion for the given prompt using the specified model.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -758,8 +758,8 @@
"tags": [ "tags": [
"Inference" "Inference"
], ],
"summary": "Generate OpenAI-compatible embeddings for the given input using the specified model.", "summary": "Create embeddings.",
"description": "Generate OpenAI-compatible embeddings for the given input using the specified model.", "description": "Create embeddings.\nGenerate OpenAI-compatible embeddings for the given input using the specified model.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -803,8 +803,8 @@
"tags": [ "tags": [
"Files" "Files"
], ],
"summary": "Returns a list of files that belong to the user's organization.", "summary": "List files.",
"description": "Returns a list of files that belong to the user's organization.", "description": "List files.\nReturns a list of files that belong to the user's organization.",
"parameters": [ "parameters": [
{ {
"name": "after", "name": "after",
@ -873,8 +873,8 @@
"tags": [ "tags": [
"Files" "Files"
], ],
"summary": "Upload a file that can be used across various endpoints.", "summary": "Upload file.",
"description": "Upload a file that can be used across various endpoints.\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.", "description": "Upload file.\nUpload a file that can be used across various endpoints.\n\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -934,8 +934,8 @@
"tags": [ "tags": [
"Files" "Files"
], ],
"summary": "Returns information about a specific file.", "summary": "Retrieve file.",
"description": "Returns information about a specific file.", "description": "Retrieve file.\nReturns information about a specific file.",
"parameters": [ "parameters": [
{ {
"name": "file_id", "name": "file_id",
@ -977,8 +977,8 @@
"tags": [ "tags": [
"Files" "Files"
], ],
"summary": "Delete a file.", "summary": "Delete file.",
"description": "Delete a file.", "description": "Delete file.",
"parameters": [ "parameters": [
{ {
"name": "file_id", "name": "file_id",
@ -1022,8 +1022,8 @@
"tags": [ "tags": [
"Files" "Files"
], ],
"summary": "Returns the contents of the specified file.", "summary": "Retrieve file content.",
"description": "Returns the contents of the specified file.", "description": "Retrieve file content.\nReturns the contents of the specified file.",
"parameters": [ "parameters": [
{ {
"name": "file_id", "name": "file_id",
@ -1067,8 +1067,8 @@
"tags": [ "tags": [
"Inspect" "Inspect"
], ],
"summary": "Get the current health status of the service.", "summary": "Get health status.",
"description": "Get the current health status of the service.", "description": "Get health status.\nGet the current health status of the service.",
"parameters": [], "parameters": [],
"deprecated": false "deprecated": false
} }
@ -1102,8 +1102,8 @@
"tags": [ "tags": [
"Inspect" "Inspect"
], ],
"summary": "List all available API routes with their methods and implementing providers.", "summary": "List routes.",
"description": "List all available API routes with their methods and implementing providers.", "description": "List routes.\nList all available API routes with their methods and implementing providers.",
"parameters": [], "parameters": [],
"deprecated": false "deprecated": false
} }
@ -1170,8 +1170,8 @@
"tags": [ "tags": [
"Models" "Models"
], ],
"summary": "Register a model.", "summary": "Register model.",
"description": "Register a model.", "description": "Register model.\nRegister a model.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -1215,8 +1215,8 @@
"tags": [ "tags": [
"Models" "Models"
], ],
"summary": "Get a model by its identifier.", "summary": "Get model.",
"description": "Get a model by its identifier.", "description": "Get model.\nGet a model by its identifier.",
"parameters": [ "parameters": [
{ {
"name": "model_id", "name": "model_id",
@ -1251,8 +1251,8 @@
"tags": [ "tags": [
"Models" "Models"
], ],
"summary": "Unregister a model.", "summary": "Unregister model.",
"description": "Unregister a model.", "description": "Unregister model.\nUnregister a model.",
"parameters": [ "parameters": [
{ {
"name": "model_id", "name": "model_id",
@ -1296,8 +1296,8 @@
"tags": [ "tags": [
"Safety" "Safety"
], ],
"summary": "Classifies if text and/or image inputs are potentially harmful.", "summary": "Create moderation.",
"description": "Classifies if text and/or image inputs are potentially harmful.", "description": "Create moderation.\nClassifies if text and/or image inputs are potentially harmful.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -1374,8 +1374,8 @@
"tags": [ "tags": [
"Prompts" "Prompts"
], ],
"summary": "Create a new prompt.", "summary": "Create prompt.",
"description": "Create a new prompt.", "description": "Create prompt.\nCreate a new prompt.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -1419,8 +1419,8 @@
"tags": [ "tags": [
"Prompts" "Prompts"
], ],
"summary": "Get a prompt by its identifier and optional version.", "summary": "Get prompt.",
"description": "Get a prompt by its identifier and optional version.", "description": "Get prompt.\nGet a prompt by its identifier and optional version.",
"parameters": [ "parameters": [
{ {
"name": "prompt_id", "name": "prompt_id",
@ -1471,8 +1471,8 @@
"tags": [ "tags": [
"Prompts" "Prompts"
], ],
"summary": "Update an existing prompt (increments version).", "summary": "Update prompt.",
"description": "Update an existing prompt (increments version).", "description": "Update prompt.\nUpdate an existing prompt (increments version).",
"parameters": [ "parameters": [
{ {
"name": "prompt_id", "name": "prompt_id",
@ -1517,8 +1517,8 @@
"tags": [ "tags": [
"Prompts" "Prompts"
], ],
"summary": "Delete a prompt.", "summary": "Delete prompt.",
"description": "Delete a prompt.", "description": "Delete prompt.\nDelete a prompt.",
"parameters": [ "parameters": [
{ {
"name": "prompt_id", "name": "prompt_id",
@ -1562,8 +1562,8 @@
"tags": [ "tags": [
"Prompts" "Prompts"
], ],
"summary": "Set which version of a prompt should be the default in get_prompt (latest).", "summary": "Set prompt version.",
"description": "Set which version of a prompt should be the default in get_prompt (latest).", "description": "Set prompt version.\nSet which version of a prompt should be the default in get_prompt (latest).",
"parameters": [ "parameters": [
{ {
"name": "prompt_id", "name": "prompt_id",
@ -1617,8 +1617,8 @@
"tags": [ "tags": [
"Prompts" "Prompts"
], ],
"summary": "List all versions of a specific prompt.", "summary": "List prompt versions.",
"description": "List all versions of a specific prompt.", "description": "List prompt versions.\nList all versions of a specific prompt.",
"parameters": [ "parameters": [
{ {
"name": "prompt_id", "name": "prompt_id",
@ -1662,8 +1662,8 @@
"tags": [ "tags": [
"Providers" "Providers"
], ],
"summary": "List all available providers.", "summary": "List providers.",
"description": "List all available providers.", "description": "List providers.\nList all available providers.",
"parameters": [], "parameters": [],
"deprecated": false "deprecated": false
} }
@ -1697,8 +1697,8 @@
"tags": [ "tags": [
"Providers" "Providers"
], ],
"summary": "Get detailed information about a specific provider.", "summary": "Get provider.",
"description": "Get detailed information about a specific provider.", "description": "Get provider.\nGet detailed information about a specific provider.",
"parameters": [ "parameters": [
{ {
"name": "provider_id", "name": "provider_id",
@ -1742,8 +1742,8 @@
"tags": [ "tags": [
"Agents" "Agents"
], ],
"summary": "List all OpenAI responses.", "summary": "List all responses.",
"description": "List all OpenAI responses.", "description": "List all responses.",
"parameters": [ "parameters": [
{ {
"name": "after", "name": "after",
@ -1817,8 +1817,8 @@
"tags": [ "tags": [
"Agents" "Agents"
], ],
"summary": "Create a new OpenAI response.", "summary": "Create a model response.",
"description": "Create a new OpenAI response.", "description": "Create a model response.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -1830,7 +1830,27 @@
}, },
"required": true "required": true
}, },
"deprecated": false "deprecated": false,
"x-llama-stack-extra-body-params": [
{
"name": "shields",
"schema": {
"type": "array",
"items": {
"oneOf": [
{
"type": "string"
},
{
"$ref": "#/components/schemas/ResponseShieldSpec"
}
]
}
},
"description": "List of shields to apply during response generation. Shields provide safety and content moderation.",
"required": false
}
]
} }
}, },
"/v1/responses/{response_id}": { "/v1/responses/{response_id}": {
@ -1862,8 +1882,8 @@
"tags": [ "tags": [
"Agents" "Agents"
], ],
"summary": "Retrieve an OpenAI response by its ID.", "summary": "Get a model response.",
"description": "Retrieve an OpenAI response by its ID.", "description": "Get a model response.",
"parameters": [ "parameters": [
{ {
"name": "response_id", "name": "response_id",
@ -1905,8 +1925,8 @@
"tags": [ "tags": [
"Agents" "Agents"
], ],
"summary": "Delete an OpenAI response by its ID.", "summary": "Delete a response.",
"description": "Delete an OpenAI response by its ID.", "description": "Delete a response.",
"parameters": [ "parameters": [
{ {
"name": "response_id", "name": "response_id",
@ -1950,8 +1970,8 @@
"tags": [ "tags": [
"Agents" "Agents"
], ],
"summary": "List input items for a given OpenAI response.", "summary": "List input items.",
"description": "List input items for a given OpenAI response.", "description": "List input items.",
"parameters": [ "parameters": [
{ {
"name": "response_id", "name": "response_id",
@ -2043,8 +2063,8 @@
"tags": [ "tags": [
"Safety" "Safety"
], ],
"summary": "Run a shield.", "summary": "Run shield.",
"description": "Run a shield.", "description": "Run shield.\nRun a shield.",
"parameters": [], "parameters": [],
"requestBody": { "requestBody": {
"content": { "content": {
@ -4176,8 +4196,8 @@
"tags": [ "tags": [
"Inspect" "Inspect"
], ],
"summary": "Get the version of the service.", "summary": "Get version.",
"description": "Get the version of the service.", "description": "Get version.\nGet the version of the service.",
"parameters": [], "parameters": [],
"deprecated": false "deprecated": false
} }
@ -9625,6 +9645,21 @@
"title": "OpenAIResponseText", "title": "OpenAIResponseText",
"description": "Text response configuration for OpenAI responses." "description": "Text response configuration for OpenAI responses."
}, },
"ResponseShieldSpec": {
"type": "object",
"properties": {
"type": {
"type": "string",
"description": "The type/identifier of the shield."
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "ResponseShieldSpec",
"description": "Specification for a shield to apply during response generation."
},
"OpenAIResponseInputTool": { "OpenAIResponseInputTool": {
"oneOf": [ "oneOf": [
{ {
@ -18452,16 +18487,18 @@
}, },
{ {
"name": "Files", "name": "Files",
"description": "" "description": "This API is used to upload documents that can be used with other Llama Stack APIs.",
"x-displayName": "Files"
}, },
{ {
"name": "Inference", "name": "Inference",
"description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.", "description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Llama Stack Inference API for generating completions, chat completions, and embeddings." "x-displayName": "Inference"
}, },
{ {
"name": "Inspect", "name": "Inspect",
"description": "" "description": "APIs for inspecting the Llama Stack service, including health status, available API routes with methods and implementing providers.",
"x-displayName": "Inspect"
}, },
{ {
"name": "Models", "name": "Models",
@ -18473,17 +18510,18 @@
}, },
{ {
"name": "Prompts", "name": "Prompts",
"description": "", "description": "Protocol for prompt management operations.",
"x-displayName": "Protocol for prompt management operations." "x-displayName": "Prompts"
}, },
{ {
"name": "Providers", "name": "Providers",
"description": "", "description": "Providers API for inspecting, listing, and modifying providers and their configurations.",
"x-displayName": "Providers API for inspecting, listing, and modifying providers and their configurations." "x-displayName": "Providers"
}, },
{ {
"name": "Safety", "name": "Safety",
"description": "" "description": "OpenAI-compatible Moderations API.",
"x-displayName": "Safety"
}, },
{ {
"name": "Scoring", "name": "Scoring",
View file
@ -36,8 +36,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inference - Inference
summary: List all chat completions. summary: List chat completions.
description: List all chat completions. description: List chat completions.
parameters: parameters:
- name: after - name: after
in: query in: query
@ -90,10 +90,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inference - Inference
summary: >- summary: Create chat completions.
Generate an OpenAI-compatible chat completion for the given messages using
the specified model.
description: >- description: >-
Create chat completions.
Generate an OpenAI-compatible chat completion for the given messages using Generate an OpenAI-compatible chat completion for the given messages using
the specified model. the specified model.
parameters: [] parameters: []
@ -125,8 +125,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inference - Inference
summary: Describe a chat completion by its ID. summary: Get chat completion.
description: Describe a chat completion by its ID. description: >-
Get chat completion.
Describe a chat completion by its ID.
parameters: parameters:
- name: completion_id - name: completion_id
in: path in: path
@ -156,10 +159,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inference - Inference
summary: >- summary: Create completion.
Generate an OpenAI-compatible completion for the given prompt using the specified
model.
description: >- description: >-
Create completion.
Generate an OpenAI-compatible completion for the given prompt using the specified Generate an OpenAI-compatible completion for the given prompt using the specified
model. model.
parameters: [] parameters: []
@ -606,10 +609,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inference - Inference
summary: >- summary: Create embeddings.
Generate OpenAI-compatible embeddings for the given input using the specified
model.
description: >- description: >-
Create embeddings.
Generate OpenAI-compatible embeddings for the given input using the specified Generate OpenAI-compatible embeddings for the given input using the specified
model. model.
parameters: [] parameters: []
@ -642,9 +645,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Files - Files
summary: >- summary: List files.
Returns a list of files that belong to the user's organization.
description: >- description: >-
List files.
Returns a list of files that belong to the user's organization. Returns a list of files that belong to the user's organization.
parameters: parameters:
- name: after - name: after
@ -702,11 +706,13 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Files - Files
summary: >- summary: Upload file.
Upload a file that can be used across various endpoints.
description: >- description: >-
Upload file.
Upload a file that can be used across various endpoints. Upload a file that can be used across various endpoints.
The file upload should be a multipart form request with: The file upload should be a multipart form request with:
- file: The File object (not file name) to be uploaded. - file: The File object (not file name) to be uploaded.
@ -755,9 +761,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Files - Files
summary: >- summary: Retrieve file.
Returns information about a specific file.
description: >- description: >-
Retrieve file.
Returns information about a specific file. Returns information about a specific file.
parameters: parameters:
- name: file_id - name: file_id
@ -789,8 +796,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Files - Files
summary: Delete a file. summary: Delete file.
description: Delete a file. description: Delete file.
parameters: parameters:
- name: file_id - name: file_id
in: path in: path
@ -822,9 +829,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Files - Files
summary: >- summary: Retrieve file content.
Returns the contents of the specified file.
description: >- description: >-
Retrieve file content.
Returns the contents of the specified file. Returns the contents of the specified file.
parameters: parameters:
- name: file_id - name: file_id
@ -857,9 +865,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inspect - Inspect
summary: >- summary: Get health status.
Get the current health status of the service.
description: >- description: >-
Get health status.
Get the current health status of the service. Get the current health status of the service.
parameters: [] parameters: []
deprecated: false deprecated: false
@ -885,9 +894,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inspect - Inspect
summary: >- summary: List routes.
List all available API routes with their methods and implementing providers.
description: >- description: >-
List routes.
List all available API routes with their methods and implementing providers. List all available API routes with their methods and implementing providers.
parameters: [] parameters: []
deprecated: false deprecated: false
@ -936,8 +946,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Models - Models
summary: Register a model. summary: Register model.
description: Register a model. description: >-
Register model.
Register a model.
parameters: [] parameters: []
requestBody: requestBody:
content: content:
@ -967,8 +980,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Models - Models
summary: Get a model by its identifier. summary: Get model.
description: Get a model by its identifier. description: >-
Get model.
Get a model by its identifier.
parameters: parameters:
- name: model_id - name: model_id
in: path in: path
@ -993,8 +1009,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Models - Models
summary: Unregister a model. summary: Unregister model.
description: Unregister a model. description: >-
Unregister model.
Unregister a model.
parameters: parameters:
- name: model_id - name: model_id
in: path in: path
@ -1025,9 +1044,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Safety - Safety
summary: >- summary: Create moderation.
Classifies if text and/or image inputs are potentially harmful.
description: >- description: >-
Create moderation.
Classifies if text and/or image inputs are potentially harmful. Classifies if text and/or image inputs are potentially harmful.
parameters: [] parameters: []
requestBody: requestBody:
@ -1083,8 +1103,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Prompts - Prompts
summary: Create a new prompt. summary: Create prompt.
description: Create a new prompt. description: >-
Create prompt.
Create a new prompt.
parameters: [] parameters: []
requestBody: requestBody:
content: content:
@ -1114,9 +1137,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Prompts - Prompts
summary: >- summary: Get prompt.
Get a prompt by its identifier and optional version.
description: >- description: >-
Get prompt.
Get a prompt by its identifier and optional version. Get a prompt by its identifier and optional version.
parameters: parameters:
- name: prompt_id - name: prompt_id
@ -1154,9 +1178,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Prompts - Prompts
summary: >- summary: Update prompt.
Update an existing prompt (increments version).
description: >- description: >-
Update prompt.
Update an existing prompt (increments version). Update an existing prompt (increments version).
parameters: parameters:
- name: prompt_id - name: prompt_id
@ -1188,8 +1213,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Prompts - Prompts
summary: Delete a prompt. summary: Delete prompt.
description: Delete a prompt. description: >-
Delete prompt.
Delete a prompt.
parameters: parameters:
- name: prompt_id - name: prompt_id
in: path in: path
@ -1220,9 +1248,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Prompts - Prompts
summary: >- summary: Set prompt version.
Set which version of a prompt should be the default in get_prompt (latest).
description: >- description: >-
Set prompt version.
Set which version of a prompt should be the default in get_prompt (latest). Set which version of a prompt should be the default in get_prompt (latest).
parameters: parameters:
- name: prompt_id - name: prompt_id
@ -1260,8 +1289,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Prompts - Prompts
summary: List all versions of a specific prompt. summary: List prompt versions.
description: List all versions of a specific prompt. description: >-
List prompt versions.
List all versions of a specific prompt.
parameters: parameters:
- name: prompt_id - name: prompt_id
in: path in: path
@ -1293,8 +1325,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Providers - Providers
summary: List all available providers. summary: List providers.
description: List all available providers. description: >-
List providers.
List all available providers.
parameters: [] parameters: []
deprecated: false deprecated: false
/v1/providers/{provider_id}: /v1/providers/{provider_id}:
@ -1319,9 +1354,10 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Providers - Providers
summary: >- summary: Get provider.
Get detailed information about a specific provider.
description: >- description: >-
Get provider.
Get detailed information about a specific provider. Get detailed information about a specific provider.
parameters: parameters:
- name: provider_id - name: provider_id
@ -1352,8 +1388,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Agents - Agents
summary: List all OpenAI responses. summary: List all responses.
description: List all OpenAI responses. description: List all responses.
parameters: parameters:
- name: after - name: after
in: query in: query
@ -1404,8 +1440,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Agents - Agents
summary: Create a new OpenAI response. summary: Create a model response.
description: Create a new OpenAI response. description: Create a model response.
parameters: [] parameters: []
requestBody: requestBody:
content: content:
@ -1414,6 +1450,18 @@ paths:
$ref: '#/components/schemas/CreateOpenaiResponseRequest' $ref: '#/components/schemas/CreateOpenaiResponseRequest'
required: true required: true
deprecated: false deprecated: false
x-llama-stack-extra-body-params:
- name: shields
schema:
type: array
items:
oneOf:
- type: string
- $ref: '#/components/schemas/ResponseShieldSpec'
description: >-
List of shields to apply during response generation. Shields provide safety
and content moderation.
required: false
/v1/responses/{response_id}: /v1/responses/{response_id}:
get: get:
responses: responses:
@ -1435,8 +1483,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Agents - Agents
summary: Retrieve an OpenAI response by its ID. summary: Get a model response.
description: Retrieve an OpenAI response by its ID. description: Get a model response.
parameters: parameters:
- name: response_id - name: response_id
in: path in: path
@ -1466,8 +1514,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Agents - Agents
summary: Delete an OpenAI response by its ID. summary: Delete a response.
description: Delete an OpenAI response by its ID. description: Delete a response.
parameters: parameters:
- name: response_id - name: response_id
in: path in: path
@ -1497,10 +1545,8 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Agents - Agents
summary: >- summary: List input items.
List input items for a given OpenAI response. description: List input items.
description: >-
List input items for a given OpenAI response.
parameters: parameters:
- name: response_id - name: response_id
in: path in: path
@ -1569,8 +1615,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Safety - Safety
summary: Run a shield. summary: Run shield.
description: Run a shield. description: >-
Run shield.
Run a shield.
parameters: [] parameters: []
requestBody: requestBody:
content: content:
@ -3126,8 +3175,11 @@ paths:
$ref: '#/components/responses/DefaultError' $ref: '#/components/responses/DefaultError'
tags: tags:
- Inspect - Inspect
summary: Get the version of the service. summary: Get version.
description: Get the version of the service. description: >-
Get version.
Get the version of the service.
parameters: [] parameters: []
deprecated: false deprecated: false
/v1beta/datasetio/append-rows/{dataset_id}: /v1beta/datasetio/append-rows/{dataset_id}:
@ -7184,6 +7236,18 @@ components:
title: OpenAIResponseText title: OpenAIResponseText
description: >- description: >-
Text response configuration for OpenAI responses. Text response configuration for OpenAI responses.
ResponseShieldSpec:
type: object
properties:
type:
type: string
description: The type/identifier of the shield.
additionalProperties: false
required:
- type
title: ResponseShieldSpec
description: >-
Specification for a shield to apply during response generation.
OpenAIResponseInputTool: OpenAIResponseInputTool:
oneOf: oneOf:
- $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch' - $ref: '#/components/schemas/OpenAIResponseInputToolWebSearch'
@ -13771,9 +13835,16 @@ tags:
x-displayName: >- x-displayName: >-
Llama Stack Evaluation API for running evaluations on model and agent candidates. Llama Stack Evaluation API for running evaluations on model and agent candidates.
- name: Files - name: Files
description: '' description: >-
This API is used to upload documents that can be used with other Llama Stack
APIs.
x-displayName: Files
- name: Inference - name: Inference
description: >- description: >-
Llama Stack Inference API for generating completions, chat completions, and
embeddings.
This API provides the raw interface to the underlying models. Two kinds of models This API provides the raw interface to the underlying models. Two kinds of models
are supported: are supported:
@ -13781,25 +13852,27 @@ tags:
- Embedding models: these models generate embeddings to be used for semantic - Embedding models: these models generate embeddings to be used for semantic
search. search.
x-displayName: >- x-displayName: Inference
Llama Stack Inference API for generating completions, chat completions, and
embeddings.
- name: Inspect - name: Inspect
description: '' description: >-
APIs for inspecting the Llama Stack service, including health status, available
API routes with methods and implementing providers.
x-displayName: Inspect
- name: Models - name: Models
description: '' description: ''
- name: PostTraining (Coming Soon) - name: PostTraining (Coming Soon)
description: '' description: ''
- name: Prompts - name: Prompts
description: '' description: >-
x-displayName: >-
Protocol for prompt management operations. Protocol for prompt management operations.
x-displayName: Prompts
- name: Providers - name: Providers
description: '' description: >-
x-displayName: >-
Providers API for inspecting, listing, and modifying providers and their configurations. Providers API for inspecting, listing, and modifying providers and their configurations.
x-displayName: Providers
- name: Safety - name: Safety
description: '' description: OpenAI-compatible Moderations API.
x-displayName: Safety
- name: Scoring - name: Scoring
description: '' description: ''
- name: ScoringFunctions - name: ScoringFunctions
@ -28,7 +28,7 @@ from llama_stack.apis.inference import (
from llama_stack.apis.safety import SafetyViolation from llama_stack.apis.safety import SafetyViolation
from llama_stack.apis.tools import ToolDef from llama_stack.apis.tools import ToolDef
from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA from llama_stack.apis.version import LLAMA_STACK_API_V1, LLAMA_STACK_API_V1ALPHA
from llama_stack.schema_utils import json_schema_type, register_schema, webmethod from llama_stack.schema_utils import ExtraBodyField, json_schema_type, register_schema, webmethod
from .openai_responses import ( from .openai_responses import (
ListOpenAIResponseInputItem, ListOpenAIResponseInputItem,
@ -42,6 +42,20 @@ from .openai_responses import (
) )
@json_schema_type
class ResponseShieldSpec(BaseModel):
"""Specification for a shield to apply during response generation.
:param type: The type/identifier of the shield.
"""
type: str
# TODO: more fields to be added for shield configuration
ResponseShield = str | ResponseShieldSpec
class Attachment(BaseModel): class Attachment(BaseModel):
"""An attachment to an agent turn. """An attachment to an agent turn.
@ -783,7 +797,7 @@ class Agents(Protocol):
self, self,
response_id: str, response_id: str,
) -> OpenAIResponseObject: ) -> OpenAIResponseObject:
"""Retrieve an OpenAI response by its ID. """Get a model response.
:param response_id: The ID of the OpenAI response to retrieve. :param response_id: The ID of the OpenAI response to retrieve.
:returns: An OpenAIResponseObject. :returns: An OpenAIResponseObject.
@ -805,13 +819,20 @@ class Agents(Protocol):
tools: list[OpenAIResponseInputTool] | None = None, tools: list[OpenAIResponseInputTool] | None = None,
include: list[str] | None = None, include: list[str] | None = None,
max_infer_iters: int | None = 10, # this is an extension to the OpenAI API max_infer_iters: int | None = 10, # this is an extension to the OpenAI API
shields: Annotated[
list[ResponseShield] | None,
ExtraBodyField(
"List of shields to apply during response generation. Shields provide safety and content moderation."
),
] = None,
) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]: ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
"""Create a new OpenAI response. """Create a model response.
:param input: Input message(s) to create the response. :param input: Input message(s) to create the response.
:param model: The underlying LLM used for completions. :param model: The underlying LLM used for completions.
:param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses. :param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
:param include: (Optional) Additional fields to include in the response. :param include: (Optional) Additional fields to include in the response.
:param shields: (Optional) List of shields to apply during response generation. Can be shield IDs (strings) or shield specifications.
:returns: An OpenAIResponseObject. :returns: An OpenAIResponseObject.
""" """
... ...
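The new `shields` argument is declared with `ExtraBodyField`, so clients send it through the request's extra body rather than as a first-class Responses parameter. A minimal client-side sketch (illustrative, not part of this commit), assuming a Llama Stack server on localhost:8321 and a hypothetical shield id; as the agents hunks further down show, the meta-reference provider currently rejects the field with NotImplementedError, so this demonstrates the wire format only.

```python
# Illustrative sketch -- not part of this commit. Server URL, model id, and the
# "llama-guard" shield id are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

resp = client.responses.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    input="Tell me a short story.",
    # Llama Stack extension: travels in the extra body, not the standard payload.
    extra_body={"shields": ["llama-guard"]},
)
print(resp.id)
```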
@ -825,7 +846,7 @@ class Agents(Protocol):
model: str | None = None, model: str | None = None,
order: Order | None = Order.desc, order: Order | None = Order.desc,
) -> ListOpenAIResponseObject: ) -> ListOpenAIResponseObject:
"""List all OpenAI responses. """List all responses.
:param after: The ID of the last response to return. :param after: The ID of the last response to return.
:param limit: The number of responses to return. :param limit: The number of responses to return.
@ -848,7 +869,7 @@ class Agents(Protocol):
limit: int | None = 20, limit: int | None = 20,
order: Order | None = Order.desc, order: Order | None = Order.desc,
) -> ListOpenAIResponseInputItem: ) -> ListOpenAIResponseInputItem:
"""List input items for a given OpenAI response. """List input items.
:param response_id: The ID of the response to retrieve input items for. :param response_id: The ID of the response to retrieve input items for.
:param after: An item ID to list items after, used for pagination. :param after: An item ID to list items after, used for pagination.
@ -863,7 +884,7 @@ class Agents(Protocol):
@webmethod(route="/openai/v1/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True) @webmethod(route="/openai/v1/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1) @webmethod(route="/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1)
async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject: async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
"""Delete an OpenAI response by its ID. """Delete a response.
:param response_id: The ID of the OpenAI response to delete. :param response_id: The ID of the OpenAI response to delete.
:returns: An OpenAIDeleteResponseObject :returns: An OpenAIDeleteResponseObject
@ -104,6 +104,11 @@ class OpenAIFileDeleteResponse(BaseModel):
@runtime_checkable @runtime_checkable
@trace_protocol @trace_protocol
class Files(Protocol): class Files(Protocol):
"""Files
This API is used to upload documents that can be used with other Llama Stack APIs.
"""
# OpenAI Files API Endpoints # OpenAI Files API Endpoints
@webmethod(route="/openai/v1/files", method="POST", level=LLAMA_STACK_API_V1, deprecated=True) @webmethod(route="/openai/v1/files", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/files", method="POST", level=LLAMA_STACK_API_V1) @webmethod(route="/files", method="POST", level=LLAMA_STACK_API_V1)
@ -113,7 +118,8 @@ class Files(Protocol):
purpose: Annotated[OpenAIFilePurpose, Form()], purpose: Annotated[OpenAIFilePurpose, Form()],
expires_after: Annotated[ExpiresAfter | None, Form()] = None, expires_after: Annotated[ExpiresAfter | None, Form()] = None,
) -> OpenAIFileObject: ) -> OpenAIFileObject:
""" """Upload file.
Upload a file that can be used across various endpoints. Upload a file that can be used across various endpoints.
The file upload should be a multipart form request with: The file upload should be a multipart form request with:
@ -137,7 +143,8 @@ class Files(Protocol):
order: Order | None = Order.desc, order: Order | None = Order.desc,
purpose: OpenAIFilePurpose | None = None, purpose: OpenAIFilePurpose | None = None,
) -> ListOpenAIFileResponse: ) -> ListOpenAIFileResponse:
""" """List files.
Returns a list of files that belong to the user's organization. Returns a list of files that belong to the user's organization.
:param after: A cursor for use in pagination. `after` is an object ID that defines your place in the list. For instance, if you make a list request and receive 100 objects, ending with obj_foo, your subsequent call can include after=obj_foo in order to fetch the next page of the list. :param after: A cursor for use in pagination. `after` is an object ID that defines your place in the list. For instance, if you make a list request and receive 100 objects, ending with obj_foo, your subsequent call can include after=obj_foo in order to fetch the next page of the list.
@ -154,7 +161,8 @@ class Files(Protocol):
self, self,
file_id: str, file_id: str,
) -> OpenAIFileObject: ) -> OpenAIFileObject:
""" """Retrieve file.
Returns information about a specific file. Returns information about a specific file.
:param file_id: The ID of the file to use for this request. :param file_id: The ID of the file to use for this request.
@ -168,8 +176,7 @@ class Files(Protocol):
self, self,
file_id: str, file_id: str,
) -> OpenAIFileDeleteResponse: ) -> OpenAIFileDeleteResponse:
""" """Delete file.
Delete a file.
:param file_id: The ID of the file to use for this request. :param file_id: The ID of the file to use for this request.
:returns: An OpenAIFileDeleteResponse indicating successful deletion. :returns: An OpenAIFileDeleteResponse indicating successful deletion.
@ -182,7 +189,8 @@ class Files(Protocol):
self, self,
file_id: str, file_id: str,
) -> Response: ) -> Response:
""" """Retrieve file content.
Returns the contents of the specified file. Returns the contents of the specified file.
:param file_id: The ID of the file to use for this request. :param file_id: The ID of the file to use for this request.
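The Files protocol now carries a class-level description ("upload documents that can be used with other Llama Stack APIs") and short summaries such as "Upload file." and "List files.". Because the routes are OpenAI-compatible, a stock OpenAI client can exercise them; a minimal sketch (not part of this commit), with the base URL, file name, and purpose value as placeholders.

```python
# Illustrative sketch -- not part of this commit. Assumes a Llama Stack server at
# localhost:8321; file name and purpose are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

with open("handbook.pdf", "rb") as f:
    uploaded = client.files.create(file=f, purpose="assistants")
print(uploaded.id, uploaded.filename)

# Listing mirrors openai_list_files above.
for item in client.files.list().data:
    print(item.id, item.purpose)
```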
@ -1053,7 +1053,9 @@ class InferenceProvider(Protocol):
# for fill-in-the-middle type completion # for fill-in-the-middle type completion
suffix: str | None = None, suffix: str | None = None,
) -> OpenAICompletion: ) -> OpenAICompletion:
"""Generate an OpenAI-compatible completion for the given prompt using the specified model. """Create completion.
Generate an OpenAI-compatible completion for the given prompt using the specified model.
:param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint. :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param prompt: The prompt to generate a completion for. :param prompt: The prompt to generate a completion for.
@ -1105,7 +1107,9 @@ class InferenceProvider(Protocol):
top_p: float | None = None, top_p: float | None = None,
user: str | None = None, user: str | None = None,
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]: ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
"""Generate an OpenAI-compatible chat completion for the given messages using the specified model. """Create chat completions.
Generate an OpenAI-compatible chat completion for the given messages using the specified model.
:param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint. :param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param messages: List of messages in the conversation. :param messages: List of messages in the conversation.
@ -1144,7 +1148,9 @@ class InferenceProvider(Protocol):
dimensions: int | None = None, dimensions: int | None = None,
user: str | None = None, user: str | None = None,
) -> OpenAIEmbeddingsResponse: ) -> OpenAIEmbeddingsResponse:
"""Generate OpenAI-compatible embeddings for the given input using the specified model. """Create embeddings.
Generate OpenAI-compatible embeddings for the given input using the specified model.
:param model: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint. :param model: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
:param input: Input text to embed, encoded as a string or array of strings. To embed multiple inputs in a single request, pass an array of strings. :param input: Input text to embed, encoded as a string or array of strings. To embed multiple inputs in a single request, pass an array of strings.
@ -1157,7 +1163,9 @@ class InferenceProvider(Protocol):
class Inference(InferenceProvider): class Inference(InferenceProvider):
"""Llama Stack Inference API for generating completions, chat completions, and embeddings. """Inference
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported: This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions. - LLM models: these models generate "raw" and "chat" (conversational) completions.
@ -1173,7 +1181,7 @@ class Inference(InferenceProvider):
model: str | None = None, model: str | None = None,
order: Order | None = Order.desc, order: Order | None = Order.desc,
) -> ListOpenAIChatCompletionResponse: ) -> ListOpenAIChatCompletionResponse:
"""List all chat completions. """List chat completions.
:param after: The ID of the last chat completion to return. :param after: The ID of the last chat completion to return.
:param limit: The maximum number of chat completions to return. :param limit: The maximum number of chat completions to return.
@ -1188,7 +1196,9 @@ class Inference(InferenceProvider):
) )
@webmethod(route="/chat/completions/{completion_id}", method="GET", level=LLAMA_STACK_API_V1) @webmethod(route="/chat/completions/{completion_id}", method="GET", level=LLAMA_STACK_API_V1)
async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages: async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
"""Describe a chat completion by its ID. """Get chat completion.
Describe a chat completion by its ID.
:param completion_id: ID of the chat completion. :param completion_id: ID of the chat completion.
:returns: A OpenAICompletionWithInputMessages. :returns: A OpenAICompletionWithInputMessages.
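These inference docstrings now separate one-line summaries ("Create completion.", "Create chat completions.", "Create embeddings.", "List chat completions.", "Get chat completion.") from the longer descriptions. The endpoints themselves stay OpenAI-compatible, so a plain OpenAI client works against a running stack; a minimal chat sketch (not part of this commit), with base URL and model id as placeholders.

```python
# Illustrative sketch -- not part of this commit. Base URL and model id are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

completion = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    messages=[{"role": "user", "content": "In one sentence, what does an embedding model do?"}],
)
print(completion.choices[0].message.content)
```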
@ -58,9 +58,16 @@ class ListRoutesResponse(BaseModel):
@runtime_checkable @runtime_checkable
class Inspect(Protocol): class Inspect(Protocol):
"""Inspect
APIs for inspecting the Llama Stack service, including health status, available API routes with methods and implementing providers.
"""
@webmethod(route="/inspect/routes", method="GET", level=LLAMA_STACK_API_V1) @webmethod(route="/inspect/routes", method="GET", level=LLAMA_STACK_API_V1)
async def list_routes(self) -> ListRoutesResponse: async def list_routes(self) -> ListRoutesResponse:
"""List all available API routes with their methods and implementing providers. """List routes.
List all available API routes with their methods and implementing providers.
:returns: Response containing information about all available routes. :returns: Response containing information about all available routes.
""" """
@ -68,7 +75,9 @@ class Inspect(Protocol):
@webmethod(route="/health", method="GET", level=LLAMA_STACK_API_V1) @webmethod(route="/health", method="GET", level=LLAMA_STACK_API_V1)
async def health(self) -> HealthInfo: async def health(self) -> HealthInfo:
"""Get the current health status of the service. """Get health status.
Get the current health status of the service.
:returns: Health information indicating if the service is operational. :returns: Health information indicating if the service is operational.
""" """
@ -76,7 +85,9 @@ class Inspect(Protocol):
@webmethod(route="/version", method="GET", level=LLAMA_STACK_API_V1) @webmethod(route="/version", method="GET", level=LLAMA_STACK_API_V1)
async def version(self) -> VersionInfo: async def version(self) -> VersionInfo:
"""Get the version of the service. """Get version.
Get the version of the service.
:returns: Version information containing the service version number. :returns: Version information containing the service version number.
""" """
@ -124,7 +124,9 @@ class Models(Protocol):
self, self,
model_id: str, model_id: str,
) -> Model: ) -> Model:
"""Get a model by its identifier. """Get model.
Get a model by its identifier.
:param model_id: The identifier of the model to get. :param model_id: The identifier of the model to get.
:returns: A Model. :returns: A Model.
@ -140,7 +142,9 @@ class Models(Protocol):
metadata: dict[str, Any] | None = None, metadata: dict[str, Any] | None = None,
model_type: ModelType | None = None, model_type: ModelType | None = None,
) -> Model: ) -> Model:
"""Register a model. """Register model.
Register a model.
:param model_id: The identifier of the model to register. :param model_id: The identifier of the model to register.
:param provider_model_id: The identifier of the model in the provider. :param provider_model_id: The identifier of the model in the provider.
@ -156,7 +160,9 @@ class Models(Protocol):
self, self,
model_id: str, model_id: str,
) -> None: ) -> None:
"""Unregister a model. """Unregister model.
Unregister a model.
:param model_id: The identifier of the model to unregister. :param model_id: The identifier of the model to unregister.
""" """
@ -94,7 +94,9 @@ class ListPromptsResponse(BaseModel):
@runtime_checkable @runtime_checkable
@trace_protocol @trace_protocol
class Prompts(Protocol): class Prompts(Protocol):
"""Protocol for prompt management operations.""" """Prompts
Protocol for prompt management operations."""
@webmethod(route="/prompts", method="GET", level=LLAMA_STACK_API_V1) @webmethod(route="/prompts", method="GET", level=LLAMA_STACK_API_V1)
async def list_prompts(self) -> ListPromptsResponse: async def list_prompts(self) -> ListPromptsResponse:
@ -109,7 +111,9 @@ class Prompts(Protocol):
self, self,
prompt_id: str, prompt_id: str,
) -> ListPromptsResponse: ) -> ListPromptsResponse:
"""List all versions of a specific prompt. """List prompt versions.
List all versions of a specific prompt.
:param prompt_id: The identifier of the prompt to list versions for. :param prompt_id: The identifier of the prompt to list versions for.
:returns: A ListPromptsResponse containing all versions of the prompt. :returns: A ListPromptsResponse containing all versions of the prompt.
@ -122,7 +126,9 @@ class Prompts(Protocol):
prompt_id: str, prompt_id: str,
version: int | None = None, version: int | None = None,
) -> Prompt: ) -> Prompt:
"""Get a prompt by its identifier and optional version. """Get prompt.
Get a prompt by its identifier and optional version.
:param prompt_id: The identifier of the prompt to get. :param prompt_id: The identifier of the prompt to get.
:param version: The version of the prompt to get (defaults to latest). :param version: The version of the prompt to get (defaults to latest).
@ -136,7 +142,9 @@ class Prompts(Protocol):
prompt: str, prompt: str,
variables: list[str] | None = None, variables: list[str] | None = None,
) -> Prompt: ) -> Prompt:
"""Create a new prompt. """Create prompt.
Create a new prompt.
:param prompt: The prompt text content with variable placeholders. :param prompt: The prompt text content with variable placeholders.
:param variables: List of variable names that can be used in the prompt template. :param variables: List of variable names that can be used in the prompt template.
@ -153,7 +161,9 @@ class Prompts(Protocol):
variables: list[str] | None = None, variables: list[str] | None = None,
set_as_default: bool = True, set_as_default: bool = True,
) -> Prompt: ) -> Prompt:
"""Update an existing prompt (increments version). """Update prompt.
Update an existing prompt (increments version).
:param prompt_id: The identifier of the prompt to update. :param prompt_id: The identifier of the prompt to update.
:param prompt: The updated prompt text content. :param prompt: The updated prompt text content.
@ -169,7 +179,9 @@ class Prompts(Protocol):
self, self,
prompt_id: str, prompt_id: str,
) -> None: ) -> None:
"""Delete a prompt. """Delete prompt.
Delete a prompt.
:param prompt_id: The identifier of the prompt to delete. :param prompt_id: The identifier of the prompt to delete.
""" """
@ -181,7 +193,9 @@ class Prompts(Protocol):
prompt_id: str, prompt_id: str,
version: int, version: int,
) -> Prompt: ) -> Prompt:
"""Set which version of a prompt should be the default in get_prompt (latest). """Set prompt version.
Set which version of a prompt should be the default in get_prompt (latest).
:param prompt_id: The identifier of the prompt. :param prompt_id: The identifier of the prompt.
:param version: The version to set as default. :param version: The version to set as default.
@ -42,13 +42,16 @@ class ListProvidersResponse(BaseModel):
@runtime_checkable @runtime_checkable
class Providers(Protocol): class Providers(Protocol):
""" """Providers
Providers API for inspecting, listing, and modifying providers and their configurations. Providers API for inspecting, listing, and modifying providers and their configurations.
""" """
@webmethod(route="/providers", method="GET", level=LLAMA_STACK_API_V1) @webmethod(route="/providers", method="GET", level=LLAMA_STACK_API_V1)
async def list_providers(self) -> ListProvidersResponse: async def list_providers(self) -> ListProvidersResponse:
"""List all available providers. """List providers.
List all available providers.
:returns: A ListProvidersResponse containing information about all providers. :returns: A ListProvidersResponse containing information about all providers.
""" """
@ -56,7 +59,9 @@ class Providers(Protocol):
@webmethod(route="/providers/{provider_id}", method="GET", level=LLAMA_STACK_API_V1) @webmethod(route="/providers/{provider_id}", method="GET", level=LLAMA_STACK_API_V1)
async def inspect_provider(self, provider_id: str) -> ProviderInfo: async def inspect_provider(self, provider_id: str) -> ProviderInfo:
"""Get detailed information about a specific provider. """Get provider.
Get detailed information about a specific provider.
:param provider_id: The ID of the provider to inspect. :param provider_id: The ID of the provider to inspect.
:returns: A ProviderInfo object containing the provider's details. :returns: A ProviderInfo object containing the provider's details.
@ -96,6 +96,11 @@ class ShieldStore(Protocol):
@runtime_checkable @runtime_checkable
@trace_protocol @trace_protocol
class Safety(Protocol): class Safety(Protocol):
"""Safety
OpenAI-compatible Moderations API.
"""
shield_store: ShieldStore shield_store: ShieldStore
@webmethod(route="/safety/run-shield", method="POST", level=LLAMA_STACK_API_V1) @webmethod(route="/safety/run-shield", method="POST", level=LLAMA_STACK_API_V1)
@ -105,7 +110,9 @@ class Safety(Protocol):
messages: list[Message], messages: list[Message],
params: dict[str, Any], params: dict[str, Any],
) -> RunShieldResponse: ) -> RunShieldResponse:
"""Run a shield. """Run shield.
Run a shield.
:param shield_id: The identifier of the shield to run. :param shield_id: The identifier of the shield to run.
:param messages: The messages to run the shield on. :param messages: The messages to run the shield on.
@ -117,7 +124,9 @@ class Safety(Protocol):
@webmethod(route="/openai/v1/moderations", method="POST", level=LLAMA_STACK_API_V1, deprecated=True) @webmethod(route="/openai/v1/moderations", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/moderations", method="POST", level=LLAMA_STACK_API_V1) @webmethod(route="/moderations", method="POST", level=LLAMA_STACK_API_V1)
async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject: async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
"""Classifies if text and/or image inputs are potentially harmful. """Create moderation.
Classifies if text and/or image inputs are potentially harmful.
:param input: Input (or inputs) to classify. :param input: Input (or inputs) to classify.
Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models. Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models.
:param model: The content moderation model you would like to use. :param model: The content moderation model you would like to use.
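Safety is now documented as the OpenAI-compatible Moderations API, with run_moderation summarized as "Create moderation.". A minimal sketch against /v1/moderations using the OpenAI client (not part of this commit); the moderation model id is a placeholder.

```python
# Illustrative sketch -- not part of this commit. The moderation model id is a
# placeholder for whatever safety model is registered on the stack.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

result = client.moderations.create(
    model="llama-guard",
    input=["How do I bake bread?", "Write something hateful about my neighbor."],
)
for r in result.results:
    print(r.flagged, r.categories)
```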
@ -6,11 +6,18 @@
import argparse import argparse
import os import os
import ssl
import subprocess import subprocess
from pathlib import Path from pathlib import Path
import uvicorn
import yaml
from llama_stack.cli.stack.utils import ImageType from llama_stack.cli.stack.utils import ImageType
from llama_stack.cli.subcommand import Subcommand from llama_stack.cli.subcommand import Subcommand
from llama_stack.core.datatypes import LoggingConfig, StackRunConfig
from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars, validate_env_pair
from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
from llama_stack.log import get_logger from llama_stack.log import get_logger
REPO_ROOT = Path(__file__).parent.parent.parent.parent REPO_ROOT = Path(__file__).parent.parent.parent.parent
@ -146,23 +153,7 @@ class StackRun(Subcommand):
# using the current environment packages. # using the current environment packages.
if not image_type and not image_name: if not image_type and not image_name:
logger.info("No image type or image name provided. Assuming environment packages.") logger.info("No image type or image name provided. Assuming environment packages.")
from llama_stack.core.server.server import main as server_main self._uvicorn_run(config_file, args)
# Build the server args from the current args passed to the CLI
server_args = argparse.Namespace()
for arg in vars(args):
# If this is a function, avoid passing it
# "args" contains:
# func=<bound method StackRun._run_stack_run_cmd of <llama_stack.cli.stack.run.StackRun object at 0x10484b010>>
if callable(getattr(args, arg)):
continue
if arg == "config":
server_args.config = str(config_file)
else:
setattr(server_args, arg, getattr(args, arg))
# Run the server
server_main(server_args)
else: else:
run_args = formulate_run_args(image_type, image_name) run_args = formulate_run_args(image_type, image_name)
@ -184,6 +175,76 @@ class StackRun(Subcommand):
run_command(run_args) run_command(run_args)
def _uvicorn_run(self, config_file: Path | None, args: argparse.Namespace) -> None:
if not config_file:
self.parser.error("Config file is required")
# Set environment variables if provided
if args.env:
for env_pair in args.env:
try:
key, value = validate_env_pair(env_pair)
logger.info(f"Setting environment variable {key} => {value}")
os.environ[key] = value
except ValueError as e:
logger.error(f"Error: {str(e)}")
self.parser.error(f"Invalid environment variable format: {env_pair}")
config_file = resolve_config_or_distro(str(config_file), Mode.RUN)
with open(config_file) as fp:
config_contents = yaml.safe_load(fp)
if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")):
logger_config = LoggingConfig(**cfg)
else:
logger_config = None
config = StackRunConfig(**cast_image_name_to_string(replace_env_vars(config_contents)))
port = args.port or config.server.port
host = config.server.host or ["::", "0.0.0.0"]
# Set the config file in environment so create_app can find it
os.environ["LLAMA_STACK_CONFIG"] = str(config_file)
uvicorn_config = {
"factory": True,
"host": host,
"port": port,
"lifespan": "on",
"log_level": logger.getEffectiveLevel(),
"log_config": logger_config,
}
keyfile = config.server.tls_keyfile
certfile = config.server.tls_certfile
if keyfile and certfile:
uvicorn_config["ssl_keyfile"] = config.server.tls_keyfile
uvicorn_config["ssl_certfile"] = config.server.tls_certfile
if config.server.tls_cafile:
uvicorn_config["ssl_ca_certs"] = config.server.tls_cafile
uvicorn_config["ssl_cert_reqs"] = ssl.CERT_REQUIRED
logger.info(
f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}\n CA: {config.server.tls_cafile}"
)
else:
logger.info(f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}")
logger.info(f"Listening on {host}:{port}")
# We need to catch KeyboardInterrupt because uvicorn's signal handling
# re-raises SIGINT signals using signal.raise_signal(), which Python
# converts to KeyboardInterrupt. Without this catch, we'd get a confusing
# stack trace when using Ctrl+C or kill -2 (SIGINT).
# SIGTERM (kill -15) works fine without this because Python doesn't
# have a default handler for it.
#
# Another approach would be to ignore SIGINT entirely - let uvicorn handle it through its own
# signal handling but this is quite intrusive and not worth the effort.
try:
uvicorn.run("llama_stack.core.server.server:create_app", **uvicorn_config)
except (KeyboardInterrupt, SystemExit):
logger.info("Received interrupt signal, shutting down gracefully...")
def _start_ui_development_server(self, stack_server_port: int): def _start_ui_development_server(self, stack_server_port: int):
logger.info("Attempting to start UI development server...") logger.info("Attempting to start UI development server...")
# Check if npm is available # Check if npm is available
@ -324,14 +324,14 @@ fi
RUN pip uninstall -y uv RUN pip uninstall -y uv
EOF EOF
# If a run config is provided, we use the --config flag # If a run config is provided, we use the llama stack CLI
if [[ -n "$run_config" ]]; then if [[ -n "$run_config" ]]; then
add_to_container << EOF add_to_container << EOF
ENTRYPOINT ["python", "-m", "llama_stack.core.server.server", "$RUN_CONFIG_PATH"] ENTRYPOINT ["llama", "stack", "run", "$RUN_CONFIG_PATH"]
EOF EOF
elif [[ "$distro_or_config" != *.yaml ]]; then elif [[ "$distro_or_config" != *.yaml ]]; then
add_to_container << EOF add_to_container << EOF
ENTRYPOINT ["python", "-m", "llama_stack.core.server.server", "$distro_or_config"] ENTRYPOINT ["llama", "stack", "run", "$distro_or_config"]
EOF EOF
fi fi
@ -243,6 +243,7 @@ def get_external_providers_from_module(
spec = module.get_provider_spec() spec = module.get_provider_spec()
else: else:
# pass in a partially filled out provider spec to satisfy the registry -- knowing we will be overwriting it later upon build and run # pass in a partially filled out provider spec to satisfy the registry -- knowing we will be overwriting it later upon build and run
# in the case we are building we CANNOT import this module of course because it has not been installed.
spec = ProviderSpec( spec = ProviderSpec(
api=Api(provider_api), api=Api(provider_api),
provider_type=provider.provider_type, provider_type=provider.provider_type,
@ -251,9 +252,20 @@ def get_external_providers_from_module(
config_class="", config_class="",
) )
provider_type = provider.provider_type provider_type = provider.provider_type
# in the case we are building we CANNOT import this module of course because it has not been installed. if isinstance(spec, list):
# return a partially filled out spec that the build script will populate. # optionally allow people to pass inline and remote provider specs as a returned list.
registry[Api(provider_api)][provider_type] = spec # with the old method, users could pass in directories of specs using overlapping code
# we want to ensure we preserve that flexibility in this method.
logger.info(
f"Detected a list of external provider specs from {provider.module} adding all to the registry"
)
for provider_spec in spec:
if provider_spec.provider_type != provider.provider_type:
continue
logger.info(f"Adding {provider.provider_type} to registry")
registry[Api(provider_api)][provider.provider_type] = provider_spec
else:
registry[Api(provider_api)][provider_type] = spec
except ModuleNotFoundError as exc: except ModuleNotFoundError as exc:
raise ValueError( raise ValueError(
"get_provider_spec not found. If specifying an external provider via `module` in the Provider spec, the Provider must have the `provider.get_provider_spec` module available" "get_provider_spec not found. If specifying an external provider via `module` in the Provider spec, the Provider must have the `provider.get_provider_spec` module available"
@ -374,6 +374,10 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
body = options.params or {} body = options.params or {}
body |= options.json_data or {} body |= options.json_data or {}
# Merge extra_json parameters (extra_body from SDK is converted to extra_json)
if hasattr(options, "extra_json") and options.extra_json:
body |= options.extra_json
matched_func, path_params, route_path, webmethod = find_matching_route(options.method, path, self.route_impls) matched_func, path_params, route_path, webmethod = find_matching_route(options.method, path, self.route_impls)
body |= path_params body |= path_params
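Folding `options.extra_json` into the body is what lets SDK `extra_body` arguments (such as the `shields` field added above) reach implementations when the stack runs as a library client. A minimal sketch (not part of this commit); the import path, distro name, and shield id are assumptions, and the meta-reference provider currently raises NotImplementedError when shields are supplied, so this only exercises the transport path.

```python
# Illustrative sketch -- not part of this commit. Import path, "starter" distro, and
# the shield id are assumptions; extra_body surfaces as options.extra_json above.
from llama_stack.core.library_client import LlamaStackAsLibraryClient

client = LlamaStackAsLibraryClient("starter")
client.initialize()

response = client.responses.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    input="Hello",
    extra_body={"shields": ["llama-guard"]},  # merged into `body` by the code above
)
```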
@ -4,7 +4,6 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import argparse
import asyncio import asyncio
import concurrent.futures import concurrent.futures
import functools import functools
@ -12,7 +11,6 @@ import inspect
import json import json
import logging # allow-direct-logging import logging # allow-direct-logging
import os import os
import ssl
import sys import sys
import traceback import traceback
import warnings import warnings
@ -35,7 +33,6 @@ from pydantic import BaseModel, ValidationError
from llama_stack.apis.common.errors import ConflictError, ResourceNotFoundError from llama_stack.apis.common.errors import ConflictError, ResourceNotFoundError
from llama_stack.apis.common.responses import PaginatedResponse from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.cli.utils import add_config_distro_args, get_config_from_args
from llama_stack.core.access_control.access_control import AccessDeniedError from llama_stack.core.access_control.access_control import AccessDeniedError
from llama_stack.core.datatypes import ( from llama_stack.core.datatypes import (
AuthenticationRequiredError, AuthenticationRequiredError,
@ -55,7 +52,6 @@ from llama_stack.core.stack import (
Stack, Stack,
cast_image_name_to_string, cast_image_name_to_string,
replace_env_vars, replace_env_vars,
validate_env_pair,
) )
from llama_stack.core.utils.config import redact_sensitive_fields from llama_stack.core.utils.config import redact_sensitive_fields
from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
@ -333,23 +329,18 @@ class ClientVersionMiddleware:
return await self.app(scope, receive, send) return await self.app(scope, receive, send)
def create_app( def create_app() -> StackApp:
config_file: str | None = None,
env_vars: list[str] | None = None,
) -> StackApp:
"""Create and configure the FastAPI application. """Create and configure the FastAPI application.
Args: This factory function reads configuration from environment variables:
config_file: Path to config file. If None, uses LLAMA_STACK_CONFIG env var or default resolution. - LLAMA_STACK_CONFIG: Path to config file (required)
env_vars: List of environment variables in KEY=value format.
disable_version_check: Whether to disable version checking. If None, uses LLAMA_STACK_DISABLE_VERSION_CHECK env var.
Returns: Returns:
Configured StackApp instance. Configured StackApp instance.
""" """
config_file = config_file or os.getenv("LLAMA_STACK_CONFIG") config_file = os.getenv("LLAMA_STACK_CONFIG")
if config_file is None: if config_file is None:
raise ValueError("No config file provided and LLAMA_STACK_CONFIG env var is not set") raise ValueError("LLAMA_STACK_CONFIG environment variable is required")
config_file = resolve_config_or_distro(config_file, Mode.RUN) config_file = resolve_config_or_distro(config_file, Mode.RUN)
@ -361,16 +352,6 @@ def create_app(
logger_config = LoggingConfig(**cfg) logger_config = LoggingConfig(**cfg)
logger = get_logger(name=__name__, category="core::server", config=logger_config) logger = get_logger(name=__name__, category="core::server", config=logger_config)
if env_vars:
for env_pair in env_vars:
try:
key, value = validate_env_pair(env_pair)
logger.info(f"Setting environment variable {key} => {value}")
os.environ[key] = value
except ValueError as e:
logger.error(f"Error: {str(e)}")
raise ValueError(f"Invalid environment variable format: {env_pair}") from e
config = replace_env_vars(config_contents) config = replace_env_vars(config_contents)
config = StackRunConfig(**cast_image_name_to_string(config)) config = StackRunConfig(**cast_image_name_to_string(config))
@ -494,101 +475,6 @@ def create_app(
return app return app
def main(args: argparse.Namespace | None = None):
"""Start the LlamaStack server."""
parser = argparse.ArgumentParser(description="Start the LlamaStack server.")
add_config_distro_args(parser)
parser.add_argument(
"--port",
type=int,
default=int(os.getenv("LLAMA_STACK_PORT", 8321)),
help="Port to listen on",
)
parser.add_argument(
"--env",
action="append",
help="Environment variables in KEY=value format. Can be specified multiple times.",
)
# Determine whether the server args are being passed by the "run" command, if this is the case
# the args will be passed as a Namespace object to the main function, otherwise they will be
# parsed from the command line
if args is None:
args = parser.parse_args()
config_or_distro = get_config_from_args(args)
try:
app = create_app(
config_file=config_or_distro,
env_vars=args.env,
)
except Exception as e:
logger.error(f"Error creating app: {str(e)}")
sys.exit(1)
config_file = resolve_config_or_distro(config_or_distro, Mode.RUN)
with open(config_file) as fp:
config_contents = yaml.safe_load(fp)
if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")):
logger_config = LoggingConfig(**cfg)
else:
logger_config = None
config = StackRunConfig(**cast_image_name_to_string(replace_env_vars(config_contents)))
import uvicorn
# Configure SSL if certificates are provided
port = args.port or config.server.port
ssl_config = None
keyfile = config.server.tls_keyfile
certfile = config.server.tls_certfile
if keyfile and certfile:
ssl_config = {
"ssl_keyfile": keyfile,
"ssl_certfile": certfile,
}
if config.server.tls_cafile:
ssl_config["ssl_ca_certs"] = config.server.tls_cafile
ssl_config["ssl_cert_reqs"] = ssl.CERT_REQUIRED
logger.info(
f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}\n CA: {config.server.tls_cafile}"
)
else:
logger.info(f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}")
listen_host = config.server.host or ["::", "0.0.0.0"]
logger.info(f"Listening on {listen_host}:{port}")
uvicorn_config = {
"app": app,
"host": listen_host,
"port": port,
"lifespan": "on",
"log_level": logger.getEffectiveLevel(),
"log_config": logger_config,
}
if ssl_config:
uvicorn_config.update(ssl_config)
# We need to catch KeyboardInterrupt because uvicorn's signal handling
# re-raises SIGINT signals using signal.raise_signal(), which Python
# converts to KeyboardInterrupt. Without this catch, we'd get a confusing
# stack trace when using Ctrl+C or kill -2 (SIGINT).
# SIGTERM (kill -15) works fine without this because Python doesn't
# have a default handler for it.
#
# Another approach would be to ignore SIGINT entirely - let uvicorn handle it through its own
# signal handling but this is quite intrusive and not worth the effort.
try:
asyncio.run(uvicorn.Server(uvicorn.Config(**uvicorn_config)).serve())
except (KeyboardInterrupt, SystemExit):
logger.info("Received interrupt signal, shutting down gracefully...")
def _log_run_config(run_config: StackRunConfig): def _log_run_config(run_config: StackRunConfig):
"""Logs the run config with redacted fields and disabled providers removed.""" """Logs the run config with redacted fields and disabled providers removed."""
logger.info("Run configuration:") logger.info("Run configuration:")
@ -615,7 +501,3 @@ def remove_disabled_providers(obj):
return [item for item in (remove_disabled_providers(i) for i in obj) if item is not None] return [item for item in (remove_disabled_providers(i) for i in obj) if item is not None]
else: else:
return obj return obj
if __name__ == "__main__":
main()
@ -116,7 +116,7 @@ if [[ "$env_type" == "venv" ]]; then
yaml_config_arg="" yaml_config_arg=""
fi fi
$PYTHON_BINARY -m llama_stack.core.server.server \ llama stack run \
$yaml_config_arg \ $yaml_config_arg \
--port "$port" \ --port "$port" \
$env_vars \ $env_vars \
@ -9,7 +9,7 @@ from pathlib import Path
from llama_stack.log import get_logger from llama_stack.log import get_logger
logger = get_logger(__name__, "tokenizer_utils") logger = get_logger(__name__, "models")
def load_bpe_file(model_path: Path) -> dict[bytes, int]: def load_bpe_file(model_path: Path) -> dict[bytes, int]:
@ -329,6 +329,7 @@ class MetaReferenceAgentsImpl(Agents):
tools: list[OpenAIResponseInputTool] | None = None, tools: list[OpenAIResponseInputTool] | None = None,
include: list[str] | None = None, include: list[str] | None = None,
max_infer_iters: int | None = 10, max_infer_iters: int | None = 10,
shields: list | None = None,
) -> OpenAIResponseObject: ) -> OpenAIResponseObject:
return await self.openai_responses_impl.create_openai_response( return await self.openai_responses_impl.create_openai_response(
input, input,
@ -342,6 +343,7 @@ class MetaReferenceAgentsImpl(Agents):
tools, tools,
include, include,
max_infer_iters, max_infer_iters,
shields,
) )
async def list_openai_responses( async def list_openai_responses(
@ -208,10 +208,15 @@ class OpenAIResponsesImpl:
tools: list[OpenAIResponseInputTool] | None = None, tools: list[OpenAIResponseInputTool] | None = None,
include: list[str] | None = None, include: list[str] | None = None,
max_infer_iters: int | None = 10, max_infer_iters: int | None = 10,
shields: list | None = None,
): ):
stream = bool(stream) stream = bool(stream)
text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text
# Shields parameter received via extra_body - not yet implemented
if shields is not None:
raise NotImplementedError("Shields parameter is not yet implemented in the meta-reference provider")
stream_gen = self._create_streaming_response( stream_gen = self._create_streaming_response(
input=input, input=input,
model=model, model=model,
@ -52,9 +52,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter_type="cerebras", adapter_type="cerebras",
provider_type="remote::cerebras", provider_type="remote::cerebras",
pip_packages=[ pip_packages=[],
"cerebras_cloud_sdk",
],
module="llama_stack.providers.remote.inference.cerebras", module="llama_stack.providers.remote.inference.cerebras",
config_class="llama_stack.providers.remote.inference.cerebras.CerebrasImplConfig", config_class="llama_stack.providers.remote.inference.cerebras.CerebrasImplConfig",
description="Cerebras inference provider for running models on Cerebras Cloud platform.", description="Cerebras inference provider for running models on Cerebras Cloud platform.",
@ -169,7 +167,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter_type="openai", adapter_type="openai",
provider_type="remote::openai", provider_type="remote::openai",
pip_packages=["litellm"], pip_packages=[],
module="llama_stack.providers.remote.inference.openai", module="llama_stack.providers.remote.inference.openai",
config_class="llama_stack.providers.remote.inference.openai.OpenAIConfig", config_class="llama_stack.providers.remote.inference.openai.OpenAIConfig",
provider_data_validator="llama_stack.providers.remote.inference.openai.config.OpenAIProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.openai.config.OpenAIProviderDataValidator",
@ -179,7 +177,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter_type="anthropic", adapter_type="anthropic",
provider_type="remote::anthropic", provider_type="remote::anthropic",
pip_packages=["litellm"], pip_packages=["anthropic"],
module="llama_stack.providers.remote.inference.anthropic", module="llama_stack.providers.remote.inference.anthropic",
config_class="llama_stack.providers.remote.inference.anthropic.AnthropicConfig", config_class="llama_stack.providers.remote.inference.anthropic.AnthropicConfig",
provider_data_validator="llama_stack.providers.remote.inference.anthropic.config.AnthropicProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.anthropic.config.AnthropicProviderDataValidator",
@ -189,9 +187,7 @@ def available_providers() -> list[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter_type="gemini", adapter_type="gemini",
provider_type="remote::gemini", provider_type="remote::gemini",
pip_packages=[ pip_packages=[],
"litellm",
],
module="llama_stack.providers.remote.inference.gemini", module="llama_stack.providers.remote.inference.gemini",
config_class="llama_stack.providers.remote.inference.gemini.GeminiConfig", config_class="llama_stack.providers.remote.inference.gemini.GeminiConfig",
provider_data_validator="llama_stack.providers.remote.inference.gemini.config.GeminiProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.gemini.config.GeminiProviderDataValidator",
@ -202,7 +198,6 @@ def available_providers() -> list[ProviderSpec]:
adapter_type="vertexai", adapter_type="vertexai",
provider_type="remote::vertexai", provider_type="remote::vertexai",
pip_packages=[ pip_packages=[
"litellm",
"google-cloud-aiplatform", "google-cloud-aiplatform",
], ],
module="llama_stack.providers.remote.inference.vertexai", module="llama_stack.providers.remote.inference.vertexai",
@ -233,9 +228,7 @@ Available Models:
api=Api.inference, api=Api.inference,
adapter_type="groq", adapter_type="groq",
provider_type="remote::groq", provider_type="remote::groq",
pip_packages=[ pip_packages=[],
"litellm",
],
module="llama_stack.providers.remote.inference.groq", module="llama_stack.providers.remote.inference.groq",
config_class="llama_stack.providers.remote.inference.groq.GroqConfig", config_class="llama_stack.providers.remote.inference.groq.GroqConfig",
provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator",
@ -245,7 +238,7 @@ Available Models:
api=Api.inference, api=Api.inference,
adapter_type="llama-openai-compat", adapter_type="llama-openai-compat",
provider_type="remote::llama-openai-compat", provider_type="remote::llama-openai-compat",
pip_packages=["litellm"], pip_packages=[],
module="llama_stack.providers.remote.inference.llama_openai_compat", module="llama_stack.providers.remote.inference.llama_openai_compat",
config_class="llama_stack.providers.remote.inference.llama_openai_compat.config.LlamaCompatConfig", config_class="llama_stack.providers.remote.inference.llama_openai_compat.config.LlamaCompatConfig",
provider_data_validator="llama_stack.providers.remote.inference.llama_openai_compat.config.LlamaProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.llama_openai_compat.config.LlamaProviderDataValidator",
@ -255,9 +248,7 @@ Available Models:
api=Api.inference, api=Api.inference,
adapter_type="sambanova", adapter_type="sambanova",
provider_type="remote::sambanova", provider_type="remote::sambanova",
pip_packages=[ pip_packages=[],
"litellm",
],
module="llama_stack.providers.remote.inference.sambanova", module="llama_stack.providers.remote.inference.sambanova",
config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig", config_class="llama_stack.providers.remote.inference.sambanova.SambaNovaImplConfig",
provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.sambanova.config.SambaNovaProviderDataValidator",
@ -287,7 +278,7 @@ Available Models:
api=Api.inference, api=Api.inference,
provider_type="remote::azure", provider_type="remote::azure",
adapter_type="azure", adapter_type="azure",
pip_packages=["litellm"], pip_packages=[],
module="llama_stack.providers.remote.inference.azure", module="llama_stack.providers.remote.inference.azure",
config_class="llama_stack.providers.remote.inference.azure.AzureConfig", config_class="llama_stack.providers.remote.inference.azure.AzureConfig",
provider_data_validator="llama_stack.providers.remote.inference.azure.config.AzureProviderDataValidator", provider_data_validator="llama_stack.providers.remote.inference.azure.config.AzureProviderDataValidator",
@ -10,6 +10,6 @@ from .config import AnthropicConfig
async def get_adapter_impl(config: AnthropicConfig, _deps): async def get_adapter_impl(config: AnthropicConfig, _deps):
from .anthropic import AnthropicInferenceAdapter from .anthropic import AnthropicInferenceAdapter
impl = AnthropicInferenceAdapter(config) impl = AnthropicInferenceAdapter(config=config)
await impl.initialize() await impl.initialize()
return impl return impl
@ -4,13 +4,19 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin from collections.abc import Iterable
from anthropic import AsyncAnthropic
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import AnthropicConfig from .config import AnthropicConfig
class AnthropicInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin): class AnthropicInferenceAdapter(OpenAIMixin):
config: AnthropicConfig
provider_data_api_key_field: str = "anthropic_api_key"
# source: https://docs.claude.com/en/docs/build-with-claude/embeddings # source: https://docs.claude.com/en/docs/build-with-claude/embeddings
# TODO: add support for voyageai, which is where these models are hosted # TODO: add support for voyageai, which is where these models are hosted
# embedding_model_metadata = { # embedding_model_metadata = {
@ -23,22 +29,11 @@ class AnthropicInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
# "voyage-multimodal-3": {"embedding_dimension": 1024, "context_length": 32000}, # "voyage-multimodal-3": {"embedding_dimension": 1024, "context_length": 32000},
# } # }
def __init__(self, config: AnthropicConfig) -> None: def get_api_key(self) -> str:
LiteLLMOpenAIMixin.__init__( return self.config.api_key or ""
self,
litellm_provider_name="anthropic",
api_key_from_config=config.api_key,
provider_data_api_key_field="anthropic_api_key",
)
self.config = config
async def initialize(self) -> None:
await super().initialize()
async def shutdown(self) -> None:
await super().shutdown()
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self): def get_base_url(self):
return "https://api.anthropic.com/v1" return "https://api.anthropic.com/v1"
async def list_provider_model_ids(self) -> Iterable[str]:
return [m.id async for m in AsyncAnthropic(api_key=self.get_api_key()).models.list()]
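The Anthropic adapter drops `LiteLLMOpenAIMixin` in favor of `OpenAIMixin` with declarative class attributes, and the Azure, Cerebras, and Databricks adapters below follow the same shape: a `config` attribute, an optional `provider_data_api_key_field`, `get_api_key`/`get_base_url`, and an optional model-listing override. A condensed sketch of that pattern for a hypothetical provider (not part of this commit; every "Acme" name is invented).

```python
# Illustrative sketch -- not part of this commit. A hypothetical "Acme" adapter
# showing the OpenAIMixin pattern used by the adapters in this diff.
from collections.abc import Iterable

from pydantic import BaseModel, SecretStr

from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin


class AcmeConfig(BaseModel):
    api_key: SecretStr = SecretStr("")
    base_url: str = "https://api.acme.example"


class AcmeInferenceAdapter(OpenAIMixin):
    config: AcmeConfig
    provider_data_api_key_field: str = "acme_api_key"  # per-request key lookup

    def get_api_key(self) -> str:
        # Static key from config; per-request keys arrive via provider data.
        return self.config.api_key.get_secret_value()

    def get_base_url(self) -> str:
        return f"{self.config.base_url}/v1"

    async def list_provider_model_ids(self) -> Iterable[str]:
        # Optional override, mirroring the Anthropic adapter above.
        return ["acme-small", "acme-large"]
```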
@ -10,6 +10,6 @@ from .config import AzureConfig
async def get_adapter_impl(config: AzureConfig, _deps): async def get_adapter_impl(config: AzureConfig, _deps):
from .azure import AzureInferenceAdapter from .azure import AzureInferenceAdapter
impl = AzureInferenceAdapter(config) impl = AzureInferenceAdapter(config=config)
await impl.initialize() await impl.initialize()
return impl return impl
@ -4,31 +4,20 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from typing import Any
from urllib.parse import urljoin from urllib.parse import urljoin
from llama_stack.apis.inference import ChatCompletionRequest
from llama_stack.providers.utils.inference.litellm_openai_mixin import (
LiteLLMOpenAIMixin,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import AzureConfig from .config import AzureConfig
class AzureInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin): class AzureInferenceAdapter(OpenAIMixin):
def __init__(self, config: AzureConfig) -> None: config: AzureConfig
LiteLLMOpenAIMixin.__init__(
self,
litellm_provider_name="azure",
api_key_from_config=config.api_key.get_secret_value(),
provider_data_api_key_field="azure_api_key",
openai_compat_api_base=str(config.api_base),
)
self.config = config
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin provider_data_api_key_field: str = "azure_api_key"
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_api_key(self) -> str:
return self.config.api_key.get_secret_value()
def get_base_url(self) -> str: def get_base_url(self) -> str:
""" """
@ -37,26 +26,3 @@ class AzureInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
Returns the Azure API base URL from the configuration. Returns the Azure API base URL from the configuration.
""" """
return urljoin(str(self.config.api_base), "/openai/v1") return urljoin(str(self.config.api_base), "/openai/v1")
async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]:
# Get base parameters from parent
params = await super()._get_params(request)
# Add Azure specific parameters
provider_data = self.get_request_provider_data()
if provider_data:
if getattr(provider_data, "azure_api_key", None):
params["api_key"] = provider_data.azure_api_key
if getattr(provider_data, "azure_api_base", None):
params["api_base"] = provider_data.azure_api_base
if getattr(provider_data, "azure_api_version", None):
params["api_version"] = provider_data.azure_api_version
if getattr(provider_data, "azure_api_type", None):
params["api_type"] = provider_data.azure_api_type
else:
params["api_key"] = self.config.api_key.get_secret_value()
params["api_base"] = str(self.config.api_base)
params["api_version"] = self.config.api_version
params["api_type"] = self.config.api_type
return params
@ -12,7 +12,7 @@ async def get_adapter_impl(config: CerebrasImplConfig, _deps):
assert isinstance(config, CerebrasImplConfig), f"Unexpected config type: {type(config)}" assert isinstance(config, CerebrasImplConfig), f"Unexpected config type: {type(config)}"
impl = CerebrasInferenceAdapter(config) impl = CerebrasInferenceAdapter(config=config)
await impl.initialize() await impl.initialize()
@ -6,39 +6,14 @@
from urllib.parse import urljoin from urllib.parse import urljoin
from cerebras.cloud.sdk import AsyncCerebras from llama_stack.apis.inference import OpenAIEmbeddingsResponse
from llama_stack.apis.inference import (
ChatCompletionRequest,
CompletionRequest,
Inference,
OpenAIEmbeddingsResponse,
TopKSamplingStrategy,
)
from llama_stack.providers.utils.inference.openai_compat import (
get_sampling_options,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
completion_request_to_prompt,
)
from .config import CerebrasImplConfig from .config import CerebrasImplConfig
class CerebrasInferenceAdapter( class CerebrasInferenceAdapter(OpenAIMixin):
OpenAIMixin, config: CerebrasImplConfig
Inference,
):
def __init__(self, config: CerebrasImplConfig) -> None:
self.config = config
# TODO: make this use provider data, etc. like other providers
self._cerebras_client = AsyncCerebras(
base_url=self.config.base_url,
api_key=self.config.api_key.get_secret_value(),
)
def get_api_key(self) -> str: def get_api_key(self) -> str:
return self.config.api_key.get_secret_value() return self.config.api_key.get_secret_value()
@ -46,31 +21,6 @@ class CerebrasInferenceAdapter(
def get_base_url(self) -> str: def get_base_url(self) -> str:
return urljoin(self.config.base_url, "v1") return urljoin(self.config.base_url, "v1")
async def initialize(self) -> None:
return
async def shutdown(self) -> None:
pass
async def _get_params(self, request: ChatCompletionRequest | CompletionRequest) -> dict:
if request.sampling_params and isinstance(request.sampling_params.strategy, TopKSamplingStrategy):
raise ValueError("`top_k` not supported by Cerebras")
prompt = ""
if isinstance(request, ChatCompletionRequest):
prompt = await chat_completion_request_to_prompt(request, self.get_llama_model(request.model))
elif isinstance(request, CompletionRequest):
prompt = await completion_request_to_prompt(request)
else:
raise ValueError(f"Unknown request type {type(request)}")
return {
"model": request.model,
"prompt": prompt,
"stream": request.stream,
**get_sampling_options(request.sampling_params),
}
async def openai_embeddings( async def openai_embeddings(
self, self,
model: str, model: str,


@@ -22,7 +22,7 @@ class CerebrasImplConfig(RemoteInferenceProviderConfig):
         description="Base URL for the Cerebras API",
     )
     api_key: SecretStr = Field(
-        default=SecretStr(os.environ.get("CEREBRAS_API_KEY")),
+        default=SecretStr(os.environ.get("CEREBRAS_API_KEY")),  # type: ignore[arg-type]
         description="Cerebras API Key",
     )
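
The "# type: ignore[arg-type]" added to this default is there because os.environ.get() returns str | None while SecretStr() is annotated to take str. A small sketch of the same field with an empty-string fallback, shown only to illustrate what the ignore silences (the commit itself keeps the original behaviour):

    # Illustrative sketch, not from this commit: an empty-string fallback keeps
    # the SecretStr(...) argument a plain str and avoids the type: ignore.
    import os

    from pydantic import BaseModel, Field, SecretStr


    class ExampleConfig(BaseModel):
        api_key: SecretStr = Field(
            default=SecretStr(os.environ.get("CEREBRAS_API_KEY", "")),
            description="Cerebras API Key",
        )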


@@ -11,6 +11,6 @@ async def get_adapter_impl(config: DatabricksImplConfig, _deps):
     from .databricks import DatabricksInferenceAdapter
     assert isinstance(config, DatabricksImplConfig), f"Unexpected config type: {type(config)}"
-    impl = DatabricksInferenceAdapter(config)
+    impl = DatabricksInferenceAdapter(config=config)
     await impl.initialize()
     return impl


@@ -14,12 +14,12 @@ from llama_stack.schema_utils import json_schema_type
 @json_schema_type
 class DatabricksImplConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
+    url: str | None = Field(
         default=None,
         description="The URL for the Databricks model serving endpoint",
     )
     api_token: SecretStr = Field(
-        default=SecretStr(None),
+        default=SecretStr(None),  # type: ignore[arg-type]
         description="The Databricks API token",
     )


@ -4,16 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from collections.abc import Iterable
from typing import Any from typing import Any
from databricks.sdk import WorkspaceClient from databricks.sdk import WorkspaceClient
from llama_stack.apis.inference import ( from llama_stack.apis.inference import OpenAICompletion
Inference,
Model,
OpenAICompletion,
)
from llama_stack.apis.models import ModelType
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
@ -22,30 +18,31 @@ from .config import DatabricksImplConfig
logger = get_logger(name=__name__, category="inference::databricks") logger = get_logger(name=__name__, category="inference::databricks")
class DatabricksInferenceAdapter( class DatabricksInferenceAdapter(OpenAIMixin):
OpenAIMixin, config: DatabricksImplConfig
Inference,
):
# source: https://docs.databricks.com/aws/en/machine-learning/foundation-model-apis/supported-models # source: https://docs.databricks.com/aws/en/machine-learning/foundation-model-apis/supported-models
embedding_model_metadata = { embedding_model_metadata: dict[str, dict[str, int]] = {
"databricks-gte-large-en": {"embedding_dimension": 1024, "context_length": 8192}, "databricks-gte-large-en": {"embedding_dimension": 1024, "context_length": 8192},
"databricks-bge-large-en": {"embedding_dimension": 1024, "context_length": 512}, "databricks-bge-large-en": {"embedding_dimension": 1024, "context_length": 512},
} }
def __init__(self, config: DatabricksImplConfig) -> None:
self.config = config
def get_api_key(self) -> str: def get_api_key(self) -> str:
return self.config.api_token.get_secret_value() return self.config.api_token.get_secret_value()
def get_base_url(self) -> str: def get_base_url(self) -> str:
return f"{self.config.url}/serving-endpoints" return f"{self.config.url}/serving-endpoints"
async def initialize(self) -> None: async def list_provider_model_ids(self) -> Iterable[str]:
return return [
endpoint.name
for endpoint in WorkspaceClient(
host=self.config.url, token=self.get_api_key()
).serving_endpoints.list() # TODO: this is not async
]
async def shutdown(self) -> None: async def should_refresh_models(self) -> bool:
pass return False
async def openai_completion( async def openai_completion(
self, self,
@ -71,32 +68,3 @@ class DatabricksInferenceAdapter(
suffix: str | None = None, suffix: str | None = None,
) -> OpenAICompletion: ) -> OpenAICompletion:
raise NotImplementedError() raise NotImplementedError()
async def list_models(self) -> list[Model] | None:
self._model_cache = {} # from OpenAIMixin
ws_client = WorkspaceClient(host=self.config.url, token=self.get_api_key()) # TODO: this is not async
endpoints = ws_client.serving_endpoints.list()
for endpoint in endpoints:
model = Model(
provider_id=self.__provider_id__,
provider_resource_id=endpoint.name,
identifier=endpoint.name,
)
if endpoint.task == "llm/v1/chat":
model.model_type = ModelType.llm # this is redundant, but informative
elif endpoint.task == "llm/v1/embeddings":
if endpoint.name not in self.embedding_model_metadata:
logger.warning(f"No metadata information available for embedding model {endpoint.name}, skipping.")
continue
model.model_type = ModelType.embedding
model.metadata = self.embedding_model_metadata[endpoint.name]
else:
logger.warning(f"Unknown model type, skipping: {endpoint}")
continue
self._model_cache[endpoint.name] = model
return list(self._model_cache.values())
async def should_refresh_models(self) -> bool:
return False
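
After this change the Databricks adapter only reports raw endpoint names through list_provider_model_ids() and declares embedding_model_metadata; assembling Model objects is left to the mixin. The mixin internals are not shown in this diff, so the sketch below is an assumption about how such a base class could combine the two pieces.

    # Illustrative sketch, not from this commit: turning list_provider_model_ids()
    # plus embedding_model_metadata into a simple model catalogue.
    from collections.abc import Iterable


    class SketchMixin:
        # Ids present here are treated as embedding models; anything else is an LLM.
        embedding_model_metadata: dict[str, dict[str, int]] = {}

        async def list_provider_model_ids(self) -> Iterable[str]:
            raise NotImplementedError

        async def build_model_catalogue(self) -> dict[str, dict]:
            catalogue: dict[str, dict] = {}
            for model_id in await self.list_provider_model_ids():
                metadata = self.embedding_model_metadata.get(model_id)
                catalogue[model_id] = {
                    "model_type": "embedding" if metadata else "llm",
                    "metadata": metadata or {},
                }
            return catalogue


    class FakeDatabricksAdapter(SketchMixin):
        embedding_model_metadata = {
            "databricks-gte-large-en": {"embedding_dimension": 1024, "context_length": 8192},
        }

        async def list_provider_model_ids(self) -> Iterable[str]:
            return ["databricks-gte-large-en", "databricks-some-llm-endpoint"]

Running asyncio.run(FakeDatabricksAdapter().build_model_catalogue()) would yield one embedding entry with metadata and one plain llm entry.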


@@ -17,6 +17,6 @@ async def get_adapter_impl(config: FireworksImplConfig, _deps):
     from .fireworks import FireworksInferenceAdapter
     assert isinstance(config, FireworksImplConfig), f"Unexpected config type: {type(config)}"
-    impl = FireworksInferenceAdapter(config)
+    impl = FireworksInferenceAdapter(config=config)
     await impl.initialize()
     return impl


@ -5,124 +5,26 @@
# the root directory of this source tree. # the root directory of this source tree.
from fireworks.client import Fireworks
from llama_stack.apis.inference import (
ChatCompletionRequest,
Inference,
LogProbConfig,
ResponseFormat,
ResponseFormatType,
SamplingParams,
)
from llama_stack.core.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
)
from llama_stack.providers.utils.inference.openai_compat import (
convert_message_to_openai_dict,
get_sampling_options,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
request_has_media,
)
from .config import FireworksImplConfig from .config import FireworksImplConfig
logger = get_logger(name=__name__, category="inference::fireworks") logger = get_logger(name=__name__, category="inference::fireworks")
class FireworksInferenceAdapter(OpenAIMixin, Inference, NeedsRequestProviderData): class FireworksInferenceAdapter(OpenAIMixin):
embedding_model_metadata = { config: FireworksImplConfig
embedding_model_metadata: dict[str, dict[str, int]] = {
"nomic-ai/nomic-embed-text-v1.5": {"embedding_dimension": 768, "context_length": 8192}, "nomic-ai/nomic-embed-text-v1.5": {"embedding_dimension": 768, "context_length": 8192},
"accounts/fireworks/models/qwen3-embedding-8b": {"embedding_dimension": 4096, "context_length": 40960}, "accounts/fireworks/models/qwen3-embedding-8b": {"embedding_dimension": 4096, "context_length": 40960},
} }
def __init__(self, config: FireworksImplConfig) -> None: provider_data_api_key_field: str = "fireworks_api_key"
ModelRegistryHelper.__init__(self)
self.config = config
self.allowed_models = config.allowed_models
async def initialize(self) -> None:
pass
async def shutdown(self) -> None:
pass
def get_api_key(self) -> str: def get_api_key(self) -> str:
config_api_key = self.config.api_key.get_secret_value() if self.config.api_key else None return self.config.api_key.get_secret_value() if self.config.api_key else None # type: ignore[return-value]
if config_api_key:
return config_api_key
else:
provider_data = self.get_request_provider_data()
if provider_data is None or not provider_data.fireworks_api_key:
raise ValueError(
'Pass Fireworks API Key in the header X-LlamaStack-Provider-Data as { "fireworks_api_key": <your api key>}'
)
return provider_data.fireworks_api_key
def get_base_url(self) -> str: def get_base_url(self) -> str:
return "https://api.fireworks.ai/inference/v1" return "https://api.fireworks.ai/inference/v1"
def _get_client(self) -> Fireworks:
fireworks_api_key = self.get_api_key()
return Fireworks(api_key=fireworks_api_key)
def _build_options(
self,
sampling_params: SamplingParams | None,
fmt: ResponseFormat | None,
logprobs: LogProbConfig | None,
) -> dict:
options = get_sampling_options(sampling_params)
options.setdefault("max_tokens", 512)
if fmt:
if fmt.type == ResponseFormatType.json_schema.value:
options["response_format"] = {
"type": "json_object",
"schema": fmt.json_schema,
}
elif fmt.type == ResponseFormatType.grammar.value:
options["response_format"] = {
"type": "grammar",
"grammar": fmt.bnf,
}
else:
raise ValueError(f"Unknown response format {fmt.type}")
if logprobs and logprobs.top_k:
options["logprobs"] = logprobs.top_k
if options["logprobs"] <= 0 or options["logprobs"] >= 5:
raise ValueError("Required range: 0 < top_k < 5")
return options
async def _get_params(self, request: ChatCompletionRequest) -> dict:
input_dict = {}
media_present = request_has_media(request)
llama_model = self.get_llama_model(request.model)
# TODO: tools are never added to the request, so we need to add them here
if media_present or not llama_model:
input_dict["messages"] = [await convert_message_to_openai_dict(m, download=True) for m in request.messages]
else:
input_dict["prompt"] = await chat_completion_request_to_prompt(request, llama_model)
# Fireworks always prepends with BOS
if "prompt" in input_dict:
if input_dict["prompt"].startswith("<|begin_of_text|>"):
input_dict["prompt"] = input_dict["prompt"][len("<|begin_of_text|>") :]
params = {
"model": request.model,
**input_dict,
"stream": bool(request.stream),
**self._build_options(request.sampling_params, request.response_format, request.logprobs),
}
logger.debug(f"params to fireworks: {params}")
return params
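
The removed error message documents the per-request key path that provider_data_api_key_field now declares: a caller can send the key in the X-LlamaStack-Provider-Data header instead of configuring it on the server. A hedged example of such a request with httpx; the server URL and route are illustrative assumptions, while the header name and the fireworks_api_key field come from the message above.

    # Illustrative sketch, not from this commit: supplying the Fireworks key per
    # request via the X-LlamaStack-Provider-Data header.
    import json

    import httpx

    response = httpx.post(
        "http://localhost:8321/v1/openai/v1/chat/completions",  # assumed local Llama Stack server
        headers={"X-LlamaStack-Provider-Data": json.dumps({"fireworks_api_key": "fw-..."})},
        json={
            "model": "accounts/fireworks/models/llama-v3p1-8b-instruct",
            "messages": [{"role": "user", "content": "Hello"}],
        },
    )
    print(response.status_code)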


@@ -10,6 +10,6 @@ from .config import GeminiConfig
 async def get_adapter_impl(config: GeminiConfig, _deps):
     from .gemini import GeminiInferenceAdapter
-    impl = GeminiInferenceAdapter(config)
+    impl = GeminiInferenceAdapter(config=config)
     await impl.initialize()
     return impl


@@ -4,33 +4,21 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 from .config import GeminiConfig
-class GeminiInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
-    embedding_model_metadata = {
+class GeminiInferenceAdapter(OpenAIMixin):
+    config: GeminiConfig
+    provider_data_api_key_field: str = "gemini_api_key"
+    embedding_model_metadata: dict[str, dict[str, int]] = {
         "text-embedding-004": {"embedding_dimension": 768, "context_length": 2048},
     }
-    def __init__(self, config: GeminiConfig) -> None:
-        LiteLLMOpenAIMixin.__init__(
-            self,
-            litellm_provider_name="gemini",
-            api_key_from_config=config.api_key,
-            provider_data_api_key_field="gemini_api_key",
-        )
-        self.config = config
-    get_api_key = LiteLLMOpenAIMixin.get_api_key
+    def get_api_key(self) -> str:
+        return self.config.api_key or ""
     def get_base_url(self):
         return "https://generativelanguage.googleapis.com/v1beta/openai/"
-    async def initialize(self) -> None:
-        await super().initialize()
-    async def shutdown(self) -> None:
-        await super().shutdown()


@@ -11,5 +11,5 @@ async def get_adapter_impl(config: GroqConfig, _deps):
     # import dynamically so the import is used only when it is needed
     from .groq import GroqInferenceAdapter
-    adapter = GroqInferenceAdapter(config)
+    adapter = GroqInferenceAdapter(config=config)
     return adapter


@@ -6,30 +6,16 @@
 from llama_stack.providers.remote.inference.groq.config import GroqConfig
-from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
-class GroqInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
-    _config: GroqConfig
-    def __init__(self, config: GroqConfig):
-        LiteLLMOpenAIMixin.__init__(
-            self,
-            litellm_provider_name="groq",
-            api_key_from_config=config.api_key,
-            provider_data_api_key_field="groq_api_key",
-        )
-        self.config = config
-    # Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
-    get_api_key = LiteLLMOpenAIMixin.get_api_key
+class GroqInferenceAdapter(OpenAIMixin):
+    config: GroqConfig
+    provider_data_api_key_field: str = "groq_api_key"
+    def get_api_key(self) -> str:
+        return self.config.api_key or ""
     def get_base_url(self) -> str:
         return f"{self.config.url}/openai/v1"
-    async def initialize(self):
-        await super().initialize()
-    async def shutdown(self):
-        await super().shutdown()


@@ -4,14 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
-from llama_stack.apis.inference import InferenceProvider
 from .config import LlamaCompatConfig
-async def get_adapter_impl(config: LlamaCompatConfig, _deps) -> InferenceProvider:
+async def get_adapter_impl(config: LlamaCompatConfig, _deps):
     # import dynamically so the import is used only when it is needed
     from .llama import LlamaCompatInferenceAdapter
-    adapter = LlamaCompatInferenceAdapter(config)
+    adapter = LlamaCompatInferenceAdapter(config=config)
     return adapter


@ -3,40 +3,26 @@
# #
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from typing import Any
from llama_stack.apis.inference.inference import OpenAICompletion, OpenAIEmbeddingsResponse
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.providers.remote.inference.llama_openai_compat.config import LlamaCompatConfig from llama_stack.providers.remote.inference.llama_openai_compat.config import LlamaCompatConfig
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
logger = get_logger(name=__name__, category="inference::llama_openai_compat") logger = get_logger(name=__name__, category="inference::llama_openai_compat")
class LlamaCompatInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin): class LlamaCompatInferenceAdapter(OpenAIMixin):
config: LlamaCompatConfig
provider_data_api_key_field: str = "llama_api_key"
""" """
Llama API Inference Adapter for Llama Stack. Llama API Inference Adapter for Llama Stack.
Note: The inheritance order is important here. OpenAIMixin must come before
LiteLLMOpenAIMixin to ensure that OpenAIMixin.check_model_availability()
is used instead of ModelRegistryHelper.check_model_availability().
- OpenAIMixin.check_model_availability() queries the Llama API to check if a model exists
- ModelRegistryHelper.check_model_availability() (inherited by LiteLLMOpenAIMixin) just returns False and shows a warning
""" """
_config: LlamaCompatConfig def get_api_key(self) -> str:
return self.config.api_key or ""
def __init__(self, config: LlamaCompatConfig):
LiteLLMOpenAIMixin.__init__(
self,
litellm_provider_name="meta_llama",
api_key_from_config=config.api_key,
provider_data_api_key_field="llama_api_key",
openai_compat_api_base=config.openai_compat_api_base,
)
self.config = config
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self) -> str: def get_base_url(self) -> str:
""" """
@ -46,8 +32,37 @@ class LlamaCompatInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
""" """
return self.config.openai_compat_api_base return self.config.openai_compat_api_base
async def initialize(self): async def openai_completion(
await super().initialize() self,
model: str,
prompt: str | list[str] | list[int] | list[list[int]],
best_of: int | None = None,
echo: bool | None = None,
frequency_penalty: float | None = None,
logit_bias: dict[str, float] | None = None,
logprobs: bool | None = None,
max_tokens: int | None = None,
n: int | None = None,
presence_penalty: float | None = None,
seed: int | None = None,
stop: str | list[str] | None = None,
stream: bool | None = None,
stream_options: dict[str, Any] | None = None,
temperature: float | None = None,
top_p: float | None = None,
user: str | None = None,
guided_choice: list[str] | None = None,
prompt_logprobs: int | None = None,
suffix: str | None = None,
) -> OpenAICompletion:
raise NotImplementedError()
async def shutdown(self): async def openai_embeddings(
await super().shutdown() self,
model: str,
input: str | list[str],
encoding_format: str | None = "float",
dimensions: int | None = None,
user: str | None = None,
) -> OpenAIEmbeddingsResponse:
raise NotImplementedError()


@@ -15,7 +15,8 @@ async def get_adapter_impl(config: NVIDIAConfig, _deps) -> Inference:
     if not isinstance(config, NVIDIAConfig):
         raise RuntimeError(f"Unexpected config type: {type(config)}")
-    adapter = NVIDIAInferenceAdapter(config)
+    adapter = NVIDIAInferenceAdapter(config=config)
+    await adapter.initialize()
     return adapter


@ -8,7 +8,6 @@
from openai import NOT_GIVEN from openai import NOT_GIVEN
from llama_stack.apis.inference import ( from llama_stack.apis.inference import (
Inference,
OpenAIEmbeddingData, OpenAIEmbeddingData,
OpenAIEmbeddingsResponse, OpenAIEmbeddingsResponse,
OpenAIEmbeddingUsage, OpenAIEmbeddingUsage,
@ -22,7 +21,9 @@ from .utils import _is_nvidia_hosted
logger = get_logger(name=__name__, category="inference::nvidia") logger = get_logger(name=__name__, category="inference::nvidia")
class NVIDIAInferenceAdapter(OpenAIMixin, Inference): class NVIDIAInferenceAdapter(OpenAIMixin):
config: NVIDIAConfig
""" """
NVIDIA Inference Adapter for Llama Stack. NVIDIA Inference Adapter for Llama Stack.
@ -37,32 +38,21 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference):
""" """
# source: https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html # source: https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html
embedding_model_metadata = { embedding_model_metadata: dict[str, dict[str, int]] = {
"nvidia/llama-3.2-nv-embedqa-1b-v2": {"embedding_dimension": 2048, "context_length": 8192}, "nvidia/llama-3.2-nv-embedqa-1b-v2": {"embedding_dimension": 2048, "context_length": 8192},
"nvidia/nv-embedqa-e5-v5": {"embedding_dimension": 512, "context_length": 1024}, "nvidia/nv-embedqa-e5-v5": {"embedding_dimension": 512, "context_length": 1024},
"nvidia/nv-embedqa-mistral-7b-v2": {"embedding_dimension": 512, "context_length": 4096}, "nvidia/nv-embedqa-mistral-7b-v2": {"embedding_dimension": 512, "context_length": 4096},
"snowflake/arctic-embed-l": {"embedding_dimension": 512, "context_length": 1024}, "snowflake/arctic-embed-l": {"embedding_dimension": 512, "context_length": 1024},
} }
def __init__(self, config: NVIDIAConfig) -> None: async def initialize(self) -> None:
logger.info(f"Initializing NVIDIAInferenceAdapter({config.url})...") logger.info(f"Initializing NVIDIAInferenceAdapter({self.config.url})...")
if _is_nvidia_hosted(config): if _is_nvidia_hosted(self.config):
if not config.api_key: if not self.config.api_key:
raise RuntimeError( raise RuntimeError(
"API key is required for hosted NVIDIA NIM. Either provide an API key or use a self-hosted NIM." "API key is required for hosted NVIDIA NIM. Either provide an API key or use a self-hosted NIM."
) )
# elif self._config.api_key:
#
# we don't raise this warning because a user may have deployed their
# self-hosted NIM with an API key requirement.
#
# warnings.warn(
# "API key is not required for self-hosted NVIDIA NIM. "
# "Consider removing the api_key from the configuration."
# )
self._config = config
def get_api_key(self) -> str: def get_api_key(self) -> str:
""" """
@ -70,7 +60,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference):
:return: The NVIDIA API key :return: The NVIDIA API key
""" """
return self._config.api_key.get_secret_value() if self._config.api_key else "NO KEY" return self.config.api_key.get_secret_value() if self.config.api_key else "NO KEY"
def get_base_url(self) -> str: def get_base_url(self) -> str:
""" """
@ -78,7 +68,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference):
:return: The NVIDIA API base URL :return: The NVIDIA API base URL
""" """
return f"{self._config.url}/v1" if self._config.append_api_version else self._config.url return f"{self.config.url}/v1" if self.config.append_api_version else self.config.url
async def openai_embeddings( async def openai_embeddings(
self, self,
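
The hosted-NIM key check moves out of __init__ and into an async initialize(), which is also why the factory hunk above now calls await adapter.initialize(). A small self-contained sketch of that validation; NimConfigSketch and the hosted heuristic are stand-ins, with only url, api_key and the error text taken from the diff.

    # Illustrative sketch, not from this commit: the hosted-vs-self-hosted check
    # that now runs during initialize().
    from pydantic import BaseModel, SecretStr


    class NimConfigSketch(BaseModel):
        url: str
        api_key: SecretStr | None = None
        append_api_version: bool = True


    def looks_hosted(config: NimConfigSketch) -> bool:
        # Stand-in for _is_nvidia_hosted(): treat NVIDIA-hosted URLs as "hosted".
        return "api.nvidia.com" in config.url


    async def validate_on_initialize(config: NimConfigSketch) -> None:
        if looks_hosted(config) and not config.api_key:
            raise RuntimeError(
                "API key is required for hosted NVIDIA NIM. Either provide an API key or use a self-hosted NIM."
            )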


@@ -10,6 +10,6 @@ from .config import OllamaImplConfig
 async def get_adapter_impl(config: OllamaImplConfig, _deps):
     from .ollama import OllamaInferenceAdapter
-    impl = OllamaInferenceAdapter(config)
+    impl = OllamaInferenceAdapter(config=config)
     await impl.initialize()
     return impl


@ -6,58 +6,29 @@
import asyncio import asyncio
from typing import Any
from ollama import AsyncClient as AsyncOllamaClient from ollama import AsyncClient as AsyncOllamaClient
from llama_stack.apis.common.content_types import (
ImageContentItem,
TextContentItem,
)
from llama_stack.apis.common.errors import UnsupportedModelError from llama_stack.apis.common.errors import UnsupportedModelError
from llama_stack.apis.inference import (
ChatCompletionRequest,
GrammarResponseFormat,
InferenceProvider,
JsonSchemaResponseFormat,
Message,
)
from llama_stack.apis.models import Model from llama_stack.apis.models import Model
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.models.llama.sku_types import CoreModelId
from llama_stack.providers.datatypes import ( from llama_stack.providers.datatypes import (
HealthResponse, HealthResponse,
HealthStatus, HealthStatus,
ModelsProtocolPrivate,
) )
from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig
from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
get_sampling_options,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
convert_image_content_to_url,
request_has_media,
)
logger = get_logger(name=__name__, category="inference::ollama") logger = get_logger(name=__name__, category="inference::ollama")
class OllamaInferenceAdapter( class OllamaInferenceAdapter(OpenAIMixin):
OpenAIMixin, config: OllamaImplConfig
ModelRegistryHelper,
InferenceProvider,
ModelsProtocolPrivate,
):
# automatically set by the resolver when instantiating the provider # automatically set by the resolver when instantiating the provider
__provider_id__: str __provider_id__: str
embedding_model_metadata = { embedding_model_metadata: dict[str, dict[str, int]] = {
"all-minilm:l6-v2": { "all-minilm:l6-v2": {
"embedding_dimension": 384, "embedding_dimension": 384,
"context_length": 512, "context_length": 512,
@ -76,29 +47,8 @@ class OllamaInferenceAdapter(
}, },
} }
def __init__(self, config: OllamaImplConfig) -> None: download_images: bool = True
# TODO: remove ModelRegistryHelper.__init__ when completion and _clients: dict[asyncio.AbstractEventLoop, AsyncOllamaClient] = {}
# chat_completion are. this exists to satisfy the input /
# output processing for llama models. specifically,
# tool_calling is handled by raw template processing,
# instead of using the /api/chat endpoint w/ tools=...
ModelRegistryHelper.__init__(
self,
model_entries=[
build_hf_repo_model_entry(
"llama3.2:3b-instruct-fp16",
CoreModelId.llama3_2_3b_instruct.value,
),
build_hf_repo_model_entry(
"llama-guard3:1b",
CoreModelId.llama_guard_3_1b.value,
),
],
)
self.config = config
# Ollama does not support image urls, so we need to download the image and convert it to base64
self.download_images = True
self._clients: dict[asyncio.AbstractEventLoop, AsyncOllamaClient] = {}
@property @property
def ollama_client(self) -> AsyncOllamaClient: def ollama_client(self) -> AsyncOllamaClient:
@ -142,50 +92,6 @@ class OllamaInferenceAdapter(
async def shutdown(self) -> None: async def shutdown(self) -> None:
self._clients.clear() self._clients.clear()
async def _get_model(self, model_id: str) -> Model:
if not self.model_store:
raise ValueError("Model store not set")
return await self.model_store.get_model(model_id)
async def _get_params(self, request: ChatCompletionRequest) -> dict:
sampling_options = get_sampling_options(request.sampling_params)
# This is needed since the Ollama API expects num_predict to be set
# for early truncation instead of max_tokens.
if sampling_options.get("max_tokens") is not None:
sampling_options["num_predict"] = sampling_options["max_tokens"]
input_dict: dict[str, Any] = {}
media_present = request_has_media(request)
llama_model = self.get_llama_model(request.model)
if media_present or not llama_model:
contents = [await convert_message_to_openai_dict_for_ollama(m) for m in request.messages]
# flatten the list of lists
input_dict["messages"] = [item for sublist in contents for item in sublist]
else:
input_dict["raw"] = True
input_dict["prompt"] = await chat_completion_request_to_prompt(
request,
llama_model,
)
if fmt := request.response_format:
if isinstance(fmt, JsonSchemaResponseFormat):
input_dict["format"] = fmt.json_schema
elif isinstance(fmt, GrammarResponseFormat):
raise NotImplementedError("Grammar response format is not supported")
else:
raise ValueError(f"Unknown response format type: {fmt.type}")
params = {
"model": request.model,
**input_dict,
"options": sampling_options,
"stream": request.stream,
}
logger.debug(f"params to ollama: {params}")
return params
async def register_model(self, model: Model) -> Model: async def register_model(self, model: Model) -> Model:
if await self.check_model_availability(model.provider_model_id): if await self.check_model_availability(model.provider_model_id):
return model return model
@ -197,24 +103,3 @@ class OllamaInferenceAdapter(
return model return model
raise UnsupportedModelError(model.provider_model_id, list(self._model_cache.keys())) raise UnsupportedModelError(model.provider_model_id, list(self._model_cache.keys()))
async def convert_message_to_openai_dict_for_ollama(message: Message) -> list[dict]:
async def _convert_content(content) -> dict:
if isinstance(content, ImageContentItem):
return {
"role": message.role,
"images": [await convert_image_content_to_url(content, download=True, include_format=False)],
}
else:
text = content.text if isinstance(content, TextContentItem) else content
assert isinstance(text, str)
return {
"role": message.role,
"content": text,
}
if isinstance(message.content, list):
return [await _convert_content(c) for c in message.content]
else:
return [await _convert_content(message.content)]
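
The rewritten Ollama adapter keeps _clients, a dict keyed by the running event loop, behind the ollama_client property; the property body sits outside this hunk, so the sketch below is an assumption about how that per-loop caching behaves rather than a copy of it.

    # Illustrative sketch, not from this commit: one client per running event loop,
    # created lazily the first time the property is touched on that loop.
    import asyncio


    class AsyncOllamaClientStub:
        def __init__(self, host: str) -> None:
            self.host = host


    class OllamaAdapterSketch:
        def __init__(self, url: str) -> None:
            self.url = url
            self._clients: dict[asyncio.AbstractEventLoop, AsyncOllamaClientStub] = {}

        @property
        def ollama_client(self) -> AsyncOllamaClientStub:
            loop = asyncio.get_running_loop()
            if loop not in self._clients:
                self._clients[loop] = AsyncOllamaClientStub(host=self.url)
            return self._clients[loop]

Two separate asyncio.run() calls would then each get their own client, which is why shutdown() above only has to clear the dict.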


@@ -10,6 +10,6 @@ from .config import OpenAIConfig
 async def get_adapter_impl(config: OpenAIConfig, _deps):
     from .openai import OpenAIInferenceAdapter
-    impl = OpenAIInferenceAdapter(config)
+    impl = OpenAIInferenceAdapter(config=config)
     await impl.initialize()
     return impl


@ -5,7 +5,6 @@
# the root directory of this source tree. # the root directory of this source tree.
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import OpenAIConfig from .config import OpenAIConfig
@ -14,52 +13,24 @@ logger = get_logger(name=__name__, category="inference::openai")
# #
# This OpenAI adapter implements Inference methods using two mixins - # This OpenAI adapter implements Inference methods using OpenAIMixin
# #
# | Inference Method | Implementation Source | class OpenAIInferenceAdapter(OpenAIMixin):
# |----------------------------|--------------------------|
# | completion | LiteLLMOpenAIMixin |
# | chat_completion | LiteLLMOpenAIMixin |
# | embedding | LiteLLMOpenAIMixin |
# | openai_completion | OpenAIMixin |
# | openai_chat_completion | OpenAIMixin |
# | openai_embeddings | OpenAIMixin |
#
class OpenAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
""" """
OpenAI Inference Adapter for Llama Stack. OpenAI Inference Adapter for Llama Stack.
Note: The inheritance order is important here. OpenAIMixin must come before
LiteLLMOpenAIMixin to ensure that OpenAIMixin.check_model_availability()
is used instead of ModelRegistryHelper.check_model_availability().
- OpenAIMixin.check_model_availability() queries the OpenAI API to check if a model exists
- ModelRegistryHelper.check_model_availability() (inherited by LiteLLMOpenAIMixin) just returns False and shows a warning
""" """
embedding_model_metadata = { config: OpenAIConfig
provider_data_api_key_field: str = "openai_api_key"
embedding_model_metadata: dict[str, dict[str, int]] = {
"text-embedding-3-small": {"embedding_dimension": 1536, "context_length": 8192}, "text-embedding-3-small": {"embedding_dimension": 1536, "context_length": 8192},
"text-embedding-3-large": {"embedding_dimension": 3072, "context_length": 8192}, "text-embedding-3-large": {"embedding_dimension": 3072, "context_length": 8192},
} }
def __init__(self, config: OpenAIConfig) -> None: def get_api_key(self) -> str:
LiteLLMOpenAIMixin.__init__( return self.config.api_key or ""
self,
litellm_provider_name="openai",
api_key_from_config=config.api_key,
provider_data_api_key_field="openai_api_key",
)
self.config = config
# we set is_openai_compat so users can use the canonical
# openai model names like "gpt-4" or "gpt-3.5-turbo"
# and the model name will be translated to litellm's
# "openai/gpt-4" or "openai/gpt-3.5-turbo" transparently.
# if we do not set this, users will be exposed to the
# litellm specific model names, an abstraction leak.
self.is_openai_compat = True
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self) -> str: def get_base_url(self) -> str:
""" """
@ -68,9 +39,3 @@ class OpenAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
Returns the OpenAI API base URL from the configuration. Returns the OpenAI API base URL from the configuration.
""" """
return self.config.base_url return self.config.base_url
async def initialize(self) -> None:
await super().initialize()
async def shutdown(self) -> None:
await super().shutdown()


@@ -31,12 +31,6 @@ class PassthroughInferenceAdapter(Inference):
         ModelRegistryHelper.__init__(self)
         self.config = config
-    async def initialize(self) -> None:
-        pass
-    async def shutdown(self) -> None:
-        pass
     async def unregister_model(self, model_id: str) -> None:
         pass


@@ -53,12 +53,6 @@ class RunpodInferenceAdapter(
         ModelRegistryHelper.__init__(self, stack_to_provider_models_map=RUNPOD_SUPPORTED_MODELS)
         self.config = config
-    async def initialize(self) -> None:
-        return
-    async def shutdown(self) -> None:
-        pass
     def _get_params(self, request: ChatCompletionRequest) -> dict:
         return {
             "model": self.map_to_provider_model(request.model),


@@ -11,6 +11,6 @@ async def get_adapter_impl(config: SambaNovaImplConfig, _deps):
     from .sambanova import SambaNovaInferenceAdapter
     assert isinstance(config, SambaNovaImplConfig), f"Unexpected config type: {type(config)}"
-    impl = SambaNovaInferenceAdapter(config)
+    impl = SambaNovaInferenceAdapter(config=config)
     await impl.initialize()
     return impl


@ -5,39 +5,22 @@
# the root directory of this source tree. # the root directory of this source tree.
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import SambaNovaImplConfig from .config import SambaNovaImplConfig
class SambaNovaInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin): class SambaNovaInferenceAdapter(OpenAIMixin):
config: SambaNovaImplConfig
provider_data_api_key_field: str = "sambanova_api_key"
download_images: bool = True  # SambaNova does not support image downloads server-side, so perform them on the client
""" """
SambaNova Inference Adapter for Llama Stack. SambaNova Inference Adapter for Llama Stack.
Note: The inheritance order is important here. OpenAIMixin must come before
LiteLLMOpenAIMixin to ensure that OpenAIMixin.check_model_availability()
is used instead of LiteLLMOpenAIMixin.check_model_availability().
- OpenAIMixin.check_model_availability() queries the /v1/models to check if a model exists
- LiteLLMOpenAIMixin.check_model_availability() checks the static registry within LiteLLM
""" """
def __init__(self, config: SambaNovaImplConfig): def get_api_key(self) -> str:
self.config = config return self.config.api_key.get_secret_value() if self.config.api_key else ""
self.environment_available_models: list[str] = []
LiteLLMOpenAIMixin.__init__(
self,
litellm_provider_name="sambanova",
api_key_from_config=self.config.api_key.get_secret_value() if self.config.api_key else None,
provider_data_api_key_field="sambanova_api_key",
openai_compat_api_base=self.config.url,
download_images=True, # SambaNova requires base64 image encoding
json_schema_strict=False, # SambaNova doesn't support strict=True yet
)
# Delegate the client data handling get_api_key method to LiteLLMOpenAIMixin
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self) -> str: def get_base_url(self) -> str:
""" """


@ -5,53 +5,21 @@
# the root directory of this source tree. # the root directory of this source tree.
from collections.abc import Iterable
from huggingface_hub import AsyncInferenceClient, HfApi from huggingface_hub import AsyncInferenceClient, HfApi
from pydantic import SecretStr from pydantic import SecretStr
from llama_stack.apis.inference import ( from llama_stack.apis.inference import OpenAIEmbeddingsResponse
ChatCompletionRequest,
Inference,
OpenAIEmbeddingsResponse,
ResponseFormat,
ResponseFormatType,
SamplingParams,
)
from llama_stack.apis.models import Model
from llama_stack.apis.models.models import ModelType
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.models.llama.sku_list import all_registered_models
from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
get_sampling_options,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_model_input_info,
)
from .config import InferenceAPIImplConfig, InferenceEndpointImplConfig, TGIImplConfig from .config import InferenceAPIImplConfig, InferenceEndpointImplConfig, TGIImplConfig
log = get_logger(name=__name__, category="inference::tgi") log = get_logger(name=__name__, category="inference::tgi")
def build_hf_repo_model_entries(): class _HfAdapter(OpenAIMixin):
return [
build_hf_repo_model_entry(
model.huggingface_repo,
model.descriptor(),
)
for model in all_registered_models()
if model.huggingface_repo
]
class _HfAdapter(
OpenAIMixin,
Inference,
):
url: str url: str
api_key: SecretStr api_key: SecretStr
@ -61,90 +29,14 @@ class _HfAdapter(
overwrite_completion_id = True # TGI always returns id="" overwrite_completion_id = True # TGI always returns id=""
def __init__(self) -> None:
self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
self.huggingface_repo_to_llama_model_id = {
model.huggingface_repo: model.descriptor() for model in all_registered_models() if model.huggingface_repo
}
def get_api_key(self): def get_api_key(self):
return self.api_key.get_secret_value() return self.api_key.get_secret_value()
def get_base_url(self): def get_base_url(self):
return self.url return self.url
async def shutdown(self) -> None: async def list_provider_model_ids(self) -> Iterable[str]:
pass return [self.model_id]
async def list_models(self) -> list[Model] | None:
models = []
async for model in self.client.models.list():
models.append(
Model(
identifier=model.id,
provider_resource_id=model.id,
provider_id=self.__provider_id__,
metadata={},
model_type=ModelType.llm,
)
)
return models
async def register_model(self, model: Model) -> Model:
if model.provider_resource_id != self.model_id:
raise ValueError(
f"Model {model.provider_resource_id} does not match the model {self.model_id} served by TGI."
)
return model
async def unregister_model(self, model_id: str) -> None:
pass
def _get_max_new_tokens(self, sampling_params, input_tokens):
return min(
sampling_params.max_tokens or (self.max_tokens - input_tokens),
self.max_tokens - input_tokens - 1,
)
def _build_options(
self,
sampling_params: SamplingParams | None = None,
fmt: ResponseFormat = None,
):
options = get_sampling_options(sampling_params)
# TGI does not support temperature=0 when using greedy sampling
# We set it to 1e-3 instead, anything lower outputs garbage from TGI
# We can use top_p sampling strategy to specify lower temperature
if abs(options["temperature"]) < 1e-10:
options["temperature"] = 1e-3
# delete key "max_tokens" from options since its not supported by the API
options.pop("max_tokens", None)
if fmt:
if fmt.type == ResponseFormatType.json_schema.value:
options["grammar"] = {
"type": "json",
"value": fmt.json_schema,
}
elif fmt.type == ResponseFormatType.grammar.value:
raise ValueError("Grammar response format not supported yet")
else:
raise ValueError(f"Unexpected response format: {fmt.type}")
return options
async def _get_params(self, request: ChatCompletionRequest) -> dict:
prompt, input_tokens = await chat_completion_request_to_model_input_info(
request, self.register_helper.get_llama_model(request.model)
)
return dict(
prompt=prompt,
stream=request.stream,
details=True,
max_new_tokens=self._get_max_new_tokens(request.sampling_params, input_tokens),
stop_sequences=["<|eom_id|>", "<|eot_id|>"],
**self._build_options(request.sampling_params, request.response_format),
)
async def openai_embeddings( async def openai_embeddings(
self, self,
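
overwrite_completion_id = True is kept because TGI always returns id="" on completions, so the mixin is expected to substitute a unique id before handing the response back. The sketch below is an assumption about what that flag triggers, not code from this commit.

    # Illustrative sketch, not from this commit: replacing an empty completion id
    # with a locally generated one when overwrite_completion_id is set.
    import uuid
    from typing import Any


    def maybe_overwrite_id(completion: dict[str, Any], overwrite_completion_id: bool) -> dict[str, Any]:
        if overwrite_completion_id or not completion.get("id"):
            return {**completion, "id": f"chatcmpl-{uuid.uuid4().hex}"}
        return completion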


@@ -17,6 +17,6 @@ async def get_adapter_impl(config: TogetherImplConfig, _deps):
     from .together import TogetherInferenceAdapter
     assert isinstance(config, TogetherImplConfig), f"Unexpected config type: {type(config)}"
-    impl = TogetherInferenceAdapter(config)
+    impl = TogetherInferenceAdapter(config=config)
     await impl.initialize()
     return impl


@ -5,41 +5,29 @@
# the root directory of this source tree. # the root directory of this source tree.
from openai import AsyncOpenAI from collections.abc import Iterable
from together import AsyncTogether from together import AsyncTogether
from together.constants import BASE_URL from together.constants import BASE_URL
from llama_stack.apis.inference import ( from llama_stack.apis.inference import (
ChatCompletionRequest,
Inference,
LogProbConfig,
OpenAIEmbeddingsResponse, OpenAIEmbeddingsResponse,
ResponseFormat,
ResponseFormatType,
SamplingParams,
) )
from llama_stack.apis.inference.inference import OpenAIEmbeddingUsage from llama_stack.apis.inference.inference import OpenAIEmbeddingUsage
from llama_stack.apis.models import Model, ModelType from llama_stack.apis.models import Model
from llama_stack.core.request_headers import NeedsRequestProviderData from llama_stack.core.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
from llama_stack.providers.utils.inference.openai_compat import (
convert_message_to_openai_dict,
get_sampling_options,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
request_has_media,
)
from .config import TogetherImplConfig from .config import TogetherImplConfig
logger = get_logger(name=__name__, category="inference::together") logger = get_logger(name=__name__, category="inference::together")
class TogetherInferenceAdapter(OpenAIMixin, Inference, NeedsRequestProviderData): class TogetherInferenceAdapter(OpenAIMixin, NeedsRequestProviderData):
embedding_model_metadata = { config: TogetherImplConfig
embedding_model_metadata: dict[str, dict[str, int]] = {
"togethercomputer/m2-bert-80M-32k-retrieval": {"embedding_dimension": 768, "context_length": 32768}, "togethercomputer/m2-bert-80M-32k-retrieval": {"embedding_dimension": 768, "context_length": 32768},
"BAAI/bge-large-en-v1.5": {"embedding_dimension": 1024, "context_length": 512}, "BAAI/bge-large-en-v1.5": {"embedding_dimension": 1024, "context_length": 512},
"BAAI/bge-base-en-v1.5": {"embedding_dimension": 768, "context_length": 512}, "BAAI/bge-base-en-v1.5": {"embedding_dimension": 768, "context_length": 512},
@ -47,24 +35,16 @@ class TogetherInferenceAdapter(OpenAIMixin, Inference, NeedsRequestProviderData)
"intfloat/multilingual-e5-large-instruct": {"embedding_dimension": 1024, "context_length": 512}, "intfloat/multilingual-e5-large-instruct": {"embedding_dimension": 1024, "context_length": 512},
} }
def __init__(self, config: TogetherImplConfig) -> None: _model_cache: dict[str, Model] = {}
ModelRegistryHelper.__init__(self)
self.config = config provider_data_api_key_field: str = "together_api_key"
self.allowed_models = config.allowed_models
self._model_cache: dict[str, Model] = {}
def get_api_key(self): def get_api_key(self):
return self.config.api_key.get_secret_value() return self.config.api_key.get_secret_value() if self.config.api_key else None
def get_base_url(self): def get_base_url(self):
return BASE_URL return BASE_URL
async def initialize(self) -> None:
pass
async def shutdown(self) -> None:
pass
def _get_client(self) -> AsyncTogether: def _get_client(self) -> AsyncTogether:
together_api_key = None together_api_key = None
config_api_key = self.config.api_key.get_secret_value() if self.config.api_key else None config_api_key = self.config.api_key.get_secret_value() if self.config.api_key else None
@ -79,90 +59,13 @@ class TogetherInferenceAdapter(OpenAIMixin, Inference, NeedsRequestProviderData)
together_api_key = provider_data.together_api_key together_api_key = provider_data.together_api_key
return AsyncTogether(api_key=together_api_key) return AsyncTogether(api_key=together_api_key)
def _get_openai_client(self) -> AsyncOpenAI: async def list_provider_model_ids(self) -> Iterable[str]:
together_client = self._get_client().client
return AsyncOpenAI(
base_url=together_client.base_url,
api_key=together_client.api_key,
)
def _build_options(
self,
sampling_params: SamplingParams | None,
logprobs: LogProbConfig | None,
fmt: ResponseFormat,
) -> dict:
options = get_sampling_options(sampling_params)
if fmt:
if fmt.type == ResponseFormatType.json_schema.value:
options["response_format"] = {
"type": "json_object",
"schema": fmt.json_schema,
}
elif fmt.type == ResponseFormatType.grammar.value:
raise NotImplementedError("Grammar response format not supported yet")
else:
raise ValueError(f"Unknown response format {fmt.type}")
if logprobs and logprobs.top_k:
if logprobs.top_k != 1:
raise ValueError(
f"Unsupported value: Together only supports logprobs top_k=1. {logprobs.top_k} was provided",
)
options["logprobs"] = 1
return options
async def _get_params(self, request: ChatCompletionRequest) -> dict:
input_dict = {}
media_present = request_has_media(request)
llama_model = self.get_llama_model(request.model)
if media_present or not llama_model:
input_dict["messages"] = [await convert_message_to_openai_dict(m) for m in request.messages]
else:
input_dict["prompt"] = await chat_completion_request_to_prompt(request, llama_model)
params = {
"model": request.model,
**input_dict,
"stream": request.stream,
**self._build_options(request.sampling_params, request.logprobs, request.response_format),
}
logger.debug(f"params to together: {params}")
return params
async def list_models(self) -> list[Model] | None:
self._model_cache = {}
# Together's /v1/models is not compatible with OpenAI's /v1/models. Together support ticket #13355 -> will not fix, use Together's own client # Together's /v1/models is not compatible with OpenAI's /v1/models. Together support ticket #13355 -> will not fix, use Together's own client
for m in await self._get_client().models.list(): return [m.id for m in await self._get_client().models.list()]
if m.type == "embedding":
if m.id not in self.embedding_model_metadata:
logger.warning(f"Unknown embedding dimension for model {m.id}, skipping.")
continue
metadata = self.embedding_model_metadata[m.id]
self._model_cache[m.id] = Model(
provider_id=self.__provider_id__,
provider_resource_id=m.id,
identifier=m.id,
model_type=ModelType.embedding,
metadata=metadata,
)
else:
self._model_cache[m.id] = Model(
provider_id=self.__provider_id__,
provider_resource_id=m.id,
identifier=m.id,
model_type=ModelType.llm,
)
return self._model_cache.values()
async def should_refresh_models(self) -> bool: async def should_refresh_models(self) -> bool:
return True return True
async def check_model_availability(self, model):
return model in self._model_cache
async def openai_embeddings( async def openai_embeddings(
self, self,
model: str, model: str,
@ -203,4 +106,4 @@ class TogetherInferenceAdapter(OpenAIMixin, Inference, NeedsRequestProviderData)
) )
response.usage = OpenAIEmbeddingUsage(prompt_tokens=-1, total_tokens=-1) response.usage = OpenAIEmbeddingUsage(prompt_tokens=-1, total_tokens=-1)
return response return response # type: ignore[no-any-return]


@@ -10,6 +10,6 @@ from .config import VertexAIConfig
 async def get_adapter_impl(config: VertexAIConfig, _deps):
     from .vertexai import VertexAIInferenceAdapter
-    impl = VertexAIInferenceAdapter(config)
+    impl = VertexAIInferenceAdapter(config=config)
     await impl.initialize()
     return impl


@ -4,29 +4,19 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from typing import Any
import google.auth.transport.requests import google.auth.transport.requests
from google.auth import default from google.auth import default
from llama_stack.apis.inference import ChatCompletionRequest
from llama_stack.providers.utils.inference.litellm_openai_mixin import (
LiteLLMOpenAIMixin,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from .config import VertexAIConfig from .config import VertexAIConfig
class VertexAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin): class VertexAIInferenceAdapter(OpenAIMixin):
def __init__(self, config: VertexAIConfig) -> None: config: VertexAIConfig
LiteLLMOpenAIMixin.__init__(
self, provider_data_api_key_field: str = "vertex_project"
litellm_provider_name="vertex_ai",
api_key_from_config=None, # Vertex AI uses ADC, not API keys
provider_data_api_key_field="vertex_project", # Use project for validation
)
self.config = config
def get_api_key(self) -> str: def get_api_key(self) -> str:
""" """
@ -41,8 +31,7 @@ class VertexAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
credentials.refresh(google.auth.transport.requests.Request()) credentials.refresh(google.auth.transport.requests.Request())
return str(credentials.token) return str(credentials.token)
except Exception: except Exception:
# If we can't get credentials, return empty string to let LiteLLM handle it # If we can't get credentials, return empty string to let the env work with ADC directly
# This allows the LiteLLM mixin to work with ADC directly
return "" return ""
def get_base_url(self) -> str: def get_base_url(self) -> str:
@ -53,23 +42,3 @@ class VertexAIInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin):
Source: https://cloud.google.com/vertex-ai/generative-ai/docs/start/openai Source: https://cloud.google.com/vertex-ai/generative-ai/docs/start/openai
""" """
return f"https://{self.config.location}-aiplatform.googleapis.com/v1/projects/{self.config.project}/locations/{self.config.location}/endpoints/openapi" return f"https://{self.config.location}-aiplatform.googleapis.com/v1/projects/{self.config.project}/locations/{self.config.location}/endpoints/openapi"
async def _get_params(self, request: ChatCompletionRequest) -> dict[str, Any]:
# Get base parameters from parent
params = await super()._get_params(request)
# Add Vertex AI specific parameters
provider_data = self.get_request_provider_data()
if provider_data:
if getattr(provider_data, "vertex_project", None):
params["vertex_project"] = provider_data.vertex_project
if getattr(provider_data, "vertex_location", None):
params["vertex_location"] = provider_data.vertex_location
else:
params["vertex_project"] = self.config.project
params["vertex_location"] = self.config.location
# Remove api_key since Vertex AI uses ADC
params.pop("api_key", None)
return params


@@ -17,6 +17,6 @@ async def get_adapter_impl(config: VLLMInferenceAdapterConfig, _deps):
     from .vllm import VLLMInferenceAdapter
     assert isinstance(config, VLLMInferenceAdapterConfig), f"Unexpected config type: {type(config)}"
-    impl = VLLMInferenceAdapter(config)
+    impl = VLLMInferenceAdapter(config=config)
     await impl.initialize()
     return impl


@ -3,56 +3,26 @@
# #
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import json from collections.abc import AsyncIterator
from collections.abc import AsyncGenerator, AsyncIterator
from typing import Any from typing import Any
from urllib.parse import urljoin from urllib.parse import urljoin
import httpx import httpx
from openai import APIConnectionError
from openai.types.chat.chat_completion_chunk import ( from openai.types.chat.chat_completion_chunk import (
ChatCompletionChunk as OpenAIChatCompletionChunk, ChatCompletionChunk as OpenAIChatCompletionChunk,
) )
from pydantic import ConfigDict
from llama_stack.apis.common.content_types import (
TextDelta,
ToolCallDelta,
ToolCallParseStatus,
)
from llama_stack.apis.inference import ( from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponseEvent,
ChatCompletionResponseEventType,
ChatCompletionResponseStreamChunk,
GrammarResponseFormat,
Inference,
JsonSchemaResponseFormat,
ModelStore,
OpenAIChatCompletion, OpenAIChatCompletion,
OpenAIMessageParam, OpenAIMessageParam,
OpenAIResponseFormatParam, OpenAIResponseFormatParam,
ToolChoice, ToolChoice,
ToolDefinition,
) )
from llama_stack.apis.models import Model, ModelType
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import BuiltinTool, StopReason, ToolCall
from llama_stack.models.llama.sku_list import all_registered_models
from llama_stack.providers.datatypes import ( from llama_stack.providers.datatypes import (
HealthResponse, HealthResponse,
HealthStatus, HealthStatus,
ModelsProtocolPrivate,
)
from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
from llama_stack.providers.utils.inference.model_registry import (
ModelRegistryHelper,
build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
UnparseableToolCall,
convert_message_to_openai_dict,
convert_tool_call,
get_sampling_options,
) )
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
@ -61,210 +31,15 @@ from .config import VLLMInferenceAdapterConfig
log = get_logger(name=__name__, category="inference::vllm") log = get_logger(name=__name__, category="inference::vllm")
def build_hf_repo_model_entries(): class VLLMInferenceAdapter(OpenAIMixin):
return [ config: VLLMInferenceAdapterConfig
build_hf_repo_model_entry(
model.huggingface_repo,
model.descriptor(),
)
for model in all_registered_models()
if model.huggingface_repo
]
model_config = ConfigDict(arbitrary_types_allowed=True)
def _convert_to_vllm_tool_calls_in_response( provider_data_api_key_field: str = "vllm_api_token"
tool_calls,
) -> list[ToolCall]:
if not tool_calls:
return []
return [ def get_api_key(self) -> str:
ToolCall( return self.config.api_token or ""
call_id=call.id,
tool_name=call.function.name,
arguments=call.function.arguments,
)
for call in tool_calls
]
def _convert_to_vllm_tools_in_request(tools: list[ToolDefinition]) -> list[dict]:
compat_tools = []
for tool in tools:
# The tool.tool_name can be a str or a BuiltinTool enum. If
# it's the latter, convert to a string.
tool_name = tool.tool_name
if isinstance(tool_name, BuiltinTool):
tool_name = tool_name.value
compat_tool = {
"type": "function",
"function": {
"name": tool_name,
"description": tool.description,
"parameters": tool.input_schema
or {
"type": "object",
"properties": {},
"required": [],
},
},
}
compat_tools.append(compat_tool)
return compat_tools
def _convert_to_vllm_finish_reason(finish_reason: str) -> StopReason:
return {
"stop": StopReason.end_of_turn,
"length": StopReason.out_of_tokens,
"tool_calls": StopReason.end_of_message,
}.get(finish_reason, StopReason.end_of_turn)
def _process_vllm_chat_completion_end_of_stream(
finish_reason: str | None,
last_chunk_content: str | None,
current_event_type: ChatCompletionResponseEventType,
tool_call_bufs: dict[str, UnparseableToolCall] | None = None,
) -> list[OpenAIChatCompletionChunk]:
chunks = []
if finish_reason is not None:
stop_reason = _convert_to_vllm_finish_reason(finish_reason)
else:
stop_reason = StopReason.end_of_message
tool_call_bufs = tool_call_bufs or {}
for _index, tool_call_buf in sorted(tool_call_bufs.items()):
args_str = tool_call_buf.arguments or "{}"
try:
chunks.append(
ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=current_event_type,
delta=ToolCallDelta(
tool_call=ToolCall(
call_id=tool_call_buf.call_id,
tool_name=tool_call_buf.tool_name,
arguments=args_str,
),
parse_status=ToolCallParseStatus.succeeded,
),
)
)
)
except Exception as e:
log.warning(f"Failed to parse tool call buffer arguments: {args_str} \nError: {e}")
chunks.append(
ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.progress,
delta=ToolCallDelta(
tool_call=str(tool_call_buf),
parse_status=ToolCallParseStatus.failed,
),
)
)
)
chunks.append(
ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.complete,
delta=TextDelta(text=last_chunk_content or ""),
logprobs=None,
stop_reason=stop_reason,
)
)
)
return chunks
async def _process_vllm_chat_completion_stream_response(
stream: AsyncGenerator[OpenAIChatCompletionChunk, None],
) -> AsyncGenerator:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.start,
delta=TextDelta(text=""),
)
)
event_type = ChatCompletionResponseEventType.progress
tool_call_bufs: dict[str, UnparseableToolCall] = {}
end_of_stream_processed = False
async for chunk in stream:
if not chunk.choices:
log.warning("vLLM failed to generation any completions - check the vLLM server logs for an error.")
return
choice = chunk.choices[0]
if choice.delta.tool_calls:
for delta_tool_call in choice.delta.tool_calls:
tool_call = convert_tool_call(delta_tool_call)
if delta_tool_call.index not in tool_call_bufs:
tool_call_bufs[delta_tool_call.index] = UnparseableToolCall()
tool_call_buf = tool_call_bufs[delta_tool_call.index]
tool_call_buf.tool_name += str(tool_call.tool_name)
tool_call_buf.call_id += tool_call.call_id
tool_call_buf.arguments += (
tool_call.arguments if isinstance(tool_call.arguments, str) else json.dumps(tool_call.arguments)
)
if choice.finish_reason:
chunks = _process_vllm_chat_completion_end_of_stream(
finish_reason=choice.finish_reason,
last_chunk_content=choice.delta.content,
current_event_type=event_type,
tool_call_bufs=tool_call_bufs,
)
for c in chunks:
yield c
end_of_stream_processed = True
elif not choice.delta.tool_calls:
yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent(
event_type=event_type,
delta=TextDelta(text=choice.delta.content or ""),
logprobs=None,
)
)
event_type = ChatCompletionResponseEventType.progress
if end_of_stream_processed:
return
# the stream ended without a chunk containing finish_reason - we have to generate the
# respective completion chunks manually
chunks = _process_vllm_chat_completion_end_of_stream(
finish_reason=None, last_chunk_content=None, current_event_type=event_type, tool_call_bufs=tool_call_bufs
)
for c in chunks:
yield c
class VLLMInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin, Inference, ModelsProtocolPrivate):
# automatically set by the resolver when instantiating the provider
__provider_id__: str
model_store: ModelStore | None = None
def __init__(self, config: VLLMInferenceAdapterConfig) -> None:
LiteLLMOpenAIMixin.__init__(
self,
model_entries=build_hf_repo_model_entries(),
litellm_provider_name="vllm",
api_key_from_config=config.api_token,
provider_data_api_key_field="vllm_api_token",
openai_compat_api_base=config.url,
)
self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
self.config = config
get_api_key = LiteLLMOpenAIMixin.get_api_key
def get_base_url(self) -> str: def get_base_url(self) -> str:
"""Get the base URL from config.""" """Get the base URL from config."""
@ -282,27 +57,6 @@ class VLLMInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin, Inference, ModelsPro
# Strictly respecting the refresh_models directive # Strictly respecting the refresh_models directive
return self.config.refresh_models return self.config.refresh_models
async def list_models(self) -> list[Model] | None:
models = []
async for m in self.client.models.list():
model_type = ModelType.llm # unclear how to determine embedding vs. llm models
models.append(
Model(
identifier=m.id,
provider_resource_id=m.id,
provider_id=self.__provider_id__,
metadata={},
model_type=model_type,
)
)
return models
async def shutdown(self) -> None:
pass
async def unregister_model(self, model_id: str) -> None:
pass
async def health(self) -> HealthResponse: async def health(self) -> HealthResponse:
""" """
Performs a health check by verifying connectivity to the remote vLLM server. Performs a health check by verifying connectivity to the remote vLLM server.
@ -324,63 +78,9 @@ class VLLMInferenceAdapter(OpenAIMixin, LiteLLMOpenAIMixin, Inference, ModelsPro
except Exception as e: except Exception as e:
return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}") return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}")
async def _get_model(self, model_id: str) -> Model:
if not self.model_store:
raise ValueError("Model store not set")
return await self.model_store.get_model(model_id)
def get_extra_client_params(self): def get_extra_client_params(self):
return {"http_client": httpx.AsyncClient(verify=self.config.tls_verify)} return {"http_client": httpx.AsyncClient(verify=self.config.tls_verify)}
async def register_model(self, model: Model) -> Model:
try:
model = await self.register_helper.register_model(model)
except ValueError:
pass # Ignore statically unknown model, will check live listing
try:
res = self.client.models.list()
except APIConnectionError as e:
raise ValueError(
f"Failed to connect to vLLM at {self.config.url}. Please check if vLLM is running and accessible at that URL."
) from e
available_models = [m.id async for m in res]
if model.provider_resource_id not in available_models:
raise ValueError(
f"Model {model.provider_resource_id} is not being served by vLLM. "
f"Available models: {', '.join(available_models)}"
)
return model
async def _get_params(self, request: ChatCompletionRequest) -> dict:
options = get_sampling_options(request.sampling_params)
if "max_tokens" not in options:
options["max_tokens"] = self.config.max_tokens
input_dict: dict[str, Any] = {}
# Only include the 'tools' param if there is any. It can break things if an empty list is sent to the vLLM.
if isinstance(request, ChatCompletionRequest) and request.tools:
input_dict = {"tools": _convert_to_vllm_tools_in_request(request.tools)}
input_dict["messages"] = [await convert_message_to_openai_dict(m, download=True) for m in request.messages]
if fmt := request.response_format:
if isinstance(fmt, JsonSchemaResponseFormat):
input_dict["extra_body"] = {"guided_json": fmt.json_schema}
elif isinstance(fmt, GrammarResponseFormat):
raise NotImplementedError("Grammar response format not supported yet")
else:
raise ValueError(f"Unknown response format {fmt.type}")
if request.logprobs and request.logprobs.top_k:
input_dict["logprobs"] = request.logprobs.top_k
return {
"model": request.model,
**input_dict,
"stream": request.stream,
**options,
}
async def openai_chat_completion( async def openai_chat_completion(
self, self,
model: str, model: str,
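Taken together, the removals above collapse this adapter onto OpenAIMixin. Reconstructed from the additions in this diff, the refactored class is roughly the sketch below; methods whose bodies are not shown in the hunks (e.g. health, openai_chat_completion) are omitted, and the get_base_url body is an assumption since the hunk truncates it.

```python
# Rough consolidated view of the refactored adapter, reconstructed from the additions above.
import httpx
from pydantic import ConfigDict

from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin

from .config import VLLMInferenceAdapterConfig


class VLLMInferenceAdapter(OpenAIMixin):
    config: VLLMInferenceAdapterConfig

    model_config = ConfigDict(arbitrary_types_allowed=True)
    provider_data_api_key_field: str = "vllm_api_token"

    def get_api_key(self) -> str:
        return self.config.api_token or ""

    def get_base_url(self) -> str:
        """Get the base URL from config."""
        return self.config.url  # assumed: the method body is truncated in the hunk above

    def get_extra_client_params(self):
        return {"http_client": httpx.AsyncClient(verify=self.config.tls_verify)}
```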


@ -7,10 +7,11 @@
import base64 import base64
import uuid import uuid
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from collections.abc import AsyncIterator from collections.abc import AsyncIterator, Iterable
from typing import Any from typing import Any
from openai import NOT_GIVEN, AsyncOpenAI from openai import NOT_GIVEN, AsyncOpenAI
from pydantic import BaseModel, ConfigDict
from llama_stack.apis.inference import ( from llama_stack.apis.inference import (
Model, Model,
@ -26,14 +27,14 @@ from llama_stack.apis.inference import (
from llama_stack.apis.models import ModelType from llama_stack.apis.models import ModelType
from llama_stack.core.request_headers import NeedsRequestProviderData from llama_stack.core.request_headers import NeedsRequestProviderData
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.providers.datatypes import ModelsProtocolPrivate from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
from llama_stack.providers.utils.inference.prompt_adapter import localize_image_content from llama_stack.providers.utils.inference.prompt_adapter import localize_image_content
logger = get_logger(name=__name__, category="providers::utils") logger = get_logger(name=__name__, category="providers::utils")
class OpenAIMixin(ModelsProtocolPrivate, NeedsRequestProviderData, ABC): class OpenAIMixin(NeedsRequestProviderData, ABC, BaseModel):
""" """
Mixin class that provides OpenAI-specific functionality for inference providers. Mixin class that provides OpenAI-specific functionality for inference providers.
This class handles direct OpenAI API calls using the AsyncOpenAI client. This class handles direct OpenAI API calls using the AsyncOpenAI client.
@ -42,12 +43,25 @@ class OpenAIMixin(ModelsProtocolPrivate, NeedsRequestProviderData, ABC):
- get_api_key(): Method to retrieve the API key - get_api_key(): Method to retrieve the API key
- get_base_url(): Method to retrieve the OpenAI-compatible API base URL - get_base_url(): Method to retrieve the OpenAI-compatible API base URL
The behavior of this class can be customized by child classes in the following ways:
- overwrite_completion_id: If True, overwrites the 'id' field in OpenAI responses
- download_images: If True, downloads images and converts to base64 for providers that require it
- embedding_model_metadata: A dictionary mapping model IDs to their embedding metadata
- provider_data_api_key_field: Optional field name in provider data to look for API key
- list_provider_model_ids: Method to list available models from the provider
- get_extra_client_params: Method to provide extra parameters to the AsyncOpenAI client
Expected Dependencies: Expected Dependencies:
- self.model_store: Injected by the Llama Stack distribution system at runtime. - self.model_store: Injected by the Llama Stack distribution system at runtime.
This provides model registry functionality for looking up registered models. This provides model registry functionality for looking up registered models.
The model_store is set in routing_tables/common.py during provider initialization. The model_store is set in routing_tables/common.py during provider initialization.
""" """
# Allow extra fields so the routing infra can inject model_store, __provider_id__, etc.
model_config = ConfigDict(extra="allow")
config: RemoteInferenceProviderConfig
# Allow subclasses to control whether to overwrite the 'id' field in OpenAI responses # Allow subclasses to control whether to overwrite the 'id' field in OpenAI responses
# is overwritten with a client-side generated id. # is overwritten with a client-side generated id.
# #
@ -73,9 +87,6 @@ class OpenAIMixin(ModelsProtocolPrivate, NeedsRequestProviderData, ABC):
# Optional field name in provider data to look for API key, which takes precedence # Optional field name in provider data to look for API key, which takes precedence
provider_data_api_key_field: str | None = None provider_data_api_key_field: str | None = None
# automatically set by the resolver when instantiating the provider
__provider_id__: str
@abstractmethod @abstractmethod
def get_api_key(self) -> str: def get_api_key(self) -> str:
""" """
@ -111,6 +122,38 @@ class OpenAIMixin(ModelsProtocolPrivate, NeedsRequestProviderData, ABC):
""" """
return {} return {}
async def list_provider_model_ids(self) -> Iterable[str]:
"""
List available models from the provider.
Child classes can override this method to provide a custom implementation
for listing models. The default implementation uses the AsyncOpenAI client
to list models from the OpenAI-compatible endpoint.
:return: An iterable of model IDs or None if not implemented
"""
return [m.id async for m in self.client.models.list()]
async def initialize(self) -> None:
"""
Initialize the OpenAI mixin.
This method provides a default implementation that does nothing.
Subclasses can override this method to perform initialization tasks
such as setting up clients, validating configurations, etc.
"""
pass
async def shutdown(self) -> None:
"""
Shutdown the OpenAI mixin.
This method provides a default implementation that does nothing.
Subclasses can override this method to perform cleanup tasks
such as closing connections, releasing resources, etc.
"""
pass
@property @property
def client(self) -> AsyncOpenAI: def client(self) -> AsyncOpenAI:
""" """
@ -371,7 +414,7 @@ class OpenAIMixin(ModelsProtocolPrivate, NeedsRequestProviderData, ABC):
async def register_model(self, model: Model) -> Model: async def register_model(self, model: Model) -> Model:
if not await self.check_model_availability(model.provider_model_id): if not await self.check_model_availability(model.provider_model_id):
raise ValueError(f"Model {model.provider_model_id} is not available from provider {self.__provider_id__}") raise ValueError(f"Model {model.provider_model_id} is not available from provider {self.__provider_id__}") # type: ignore[attr-defined]
return model return model
async def unregister_model(self, model_id: str) -> None: async def unregister_model(self, model_id: str) -> None:
@ -387,28 +430,42 @@ class OpenAIMixin(ModelsProtocolPrivate, NeedsRequestProviderData, ABC):
""" """
self._model_cache = {} self._model_cache = {}
async for m in self.client.models.list(): try:
if self.allowed_models and m.id not in self.allowed_models: iterable = await self.list_provider_model_ids()
logger.info(f"Skipping model {m.id} as it is not in the allowed models list") except Exception as e:
logger.error(f"{self.__class__.__name__}.list_provider_model_ids() failed with: {e}")
raise
if not hasattr(iterable, "__iter__"):
raise TypeError(
f"Failed to list models: {self.__class__.__name__}.list_provider_model_ids() must return an iterable of "
f"strings, but returned {type(iterable).__name__}"
)
provider_models_ids = list(iterable)
logger.info(f"{self.__class__.__name__}.list_provider_model_ids() returned {len(provider_models_ids)} models")
for provider_model_id in provider_models_ids:
if not isinstance(provider_model_id, str):
raise ValueError(f"Model ID {provider_model_id} from list_provider_model_ids() is not a string")
if self.allowed_models and provider_model_id not in self.allowed_models:
logger.info(f"Skipping model {provider_model_id} as it is not in the allowed models list")
continue continue
if metadata := self.embedding_model_metadata.get(m.id): if metadata := self.embedding_model_metadata.get(provider_model_id):
# This is an embedding model - augment with metadata
model = Model( model = Model(
provider_id=self.__provider_id__, # type: ignore[attr-defined] provider_id=self.__provider_id__, # type: ignore[attr-defined]
provider_resource_id=m.id, provider_resource_id=provider_model_id,
identifier=m.id, identifier=provider_model_id,
model_type=ModelType.embedding, model_type=ModelType.embedding,
metadata=metadata, metadata=metadata,
) )
else: else:
# This is an LLM
model = Model( model = Model(
provider_id=self.__provider_id__, # type: ignore[attr-defined] provider_id=self.__provider_id__, # type: ignore[attr-defined]
provider_resource_id=m.id, provider_resource_id=provider_model_id,
identifier=m.id, identifier=provider_model_id,
model_type=ModelType.llm, model_type=ModelType.llm,
) )
self._model_cache[m.id] = model self._model_cache[provider_model_id] = model
return list(self._model_cache.values()) return list(self._model_cache.values())
@ -425,3 +482,29 @@ class OpenAIMixin(ModelsProtocolPrivate, NeedsRequestProviderData, ABC):
async def should_refresh_models(self) -> bool: async def should_refresh_models(self) -> bool:
return False return False
#
# The model_dump implementations are to avoid serializing the extra fields,
# e.g. model_store, which are not pydantic.
#
def _filter_fields(self, **kwargs):
"""Helper to exclude extra fields from serialization."""
# Exclude any extra fields stored in __pydantic_extra__
if hasattr(self, "__pydantic_extra__") and self.__pydantic_extra__:
exclude = kwargs.get("exclude", set())
if not isinstance(exclude, set):
exclude = set(exclude) if exclude else set()
exclude.update(self.__pydantic_extra__.keys())
kwargs["exclude"] = exclude
return kwargs
def model_dump(self, **kwargs):
"""Override to exclude extra fields from serialization."""
kwargs = self._filter_fields(**kwargs)
return super().model_dump(**kwargs)
def model_dump_json(self, **kwargs):
"""Override to exclude extra fields from JSON serialization."""
kwargs = self._filter_fields(**kwargs)
return super().model_dump_json(**kwargs)
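The customization points listed in the OpenAIMixin docstring above boil down to two required hooks (get_api_key, get_base_url) plus optional class attributes. A minimal provider built on the new mixin might look like the hedged sketch below; the config class, field names, and endpoint are placeholders, not an existing provider.

```python
# Minimal sketch of a provider built on the new OpenAIMixin; all names here are hypothetical.
from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin


class ExampleRemoteConfig(RemoteInferenceProviderConfig):
    url: str = "https://api.example.com/v1"  # hypothetical endpoint
    api_key: str | None = None


class ExampleInferenceAdapter(OpenAIMixin):
    config: ExampleRemoteConfig

    # Optional: let per-request provider data supply the key instead of the static config
    provider_data_api_key_field: str = "example_api_key"

    def get_api_key(self) -> str:
        return self.config.api_key or ""

    def get_base_url(self) -> str:
        return self.config.url


# Construction mirrors the vLLM change above: pass the config as a keyword argument.
adapter = ExampleInferenceAdapter(config=ExampleRemoteConfig(api_key="sk-example"))
```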


@ -11,6 +11,43 @@ from typing import Any, TypeVar
from .strong_typing.schema import json_schema_type, register_schema # noqa: F401 from .strong_typing.schema import json_schema_type, register_schema # noqa: F401
class ExtraBodyField[T]:
"""
Marker annotation for parameters that arrive via extra_body in the client SDK.
These parameters:
- Will NOT appear in the generated client SDK method signature
- WILL be documented in OpenAPI spec under x-llama-stack-extra-body-params
- MUST be passed via the extra_body parameter in client SDK calls
- WILL be available in server-side method signature with proper typing
Example:
```python
async def create_openai_response(
self,
input: str,
model: str,
shields: Annotated[
list[str] | None, ExtraBodyField("List of shields to apply")
] = None,
) -> ResponseObject:
# shields is available here with proper typing
if shields:
print(f"Using shields: {shields}")
```
Client usage:
```python
client.responses.create(
input="hello", model="llama-3", extra_body={"shields": ["shield-1"]}
)
```
"""
def __init__(self, description: str | None = None):
self.description = description
@dataclass @dataclass
class WebMethod: class WebMethod:
level: str | None = None level: str | None = None
@ -26,7 +63,7 @@ class WebMethod:
deprecated: bool | None = False deprecated: bool | None = False
T = TypeVar("T", bound=Callable[..., Any]) CallableT = TypeVar("CallableT", bound=Callable[..., Any])
def webmethod( def webmethod(
@ -40,7 +77,7 @@ def webmethod(
descriptive_name: str | None = None, descriptive_name: str | None = None,
required_scope: str | None = None, required_scope: str | None = None,
deprecated: bool | None = False, deprecated: bool | None = False,
) -> Callable[[T], T]: ) -> Callable[[CallableT], CallableT]:
""" """
Decorator that supplies additional metadata to an endpoint operation function. Decorator that supplies additional metadata to an endpoint operation function.
@ -51,7 +88,7 @@ def webmethod(
:param required_scope: Required scope for this endpoint (e.g., 'monitoring.viewer'). :param required_scope: Required scope for this endpoint (e.g., 'monitoring.viewer').
""" """
def wrap(func: T) -> T: def wrap(func: CallableT) -> CallableT:
webmethod_obj = WebMethod( webmethod_obj = WebMethod(
route=route, route=route,
method=method, method=method,
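The TypeVar rename above is cosmetic; the decorator's contract is unchanged. For reference, a hedged usage sketch of @webmethod, where the route, scope value, and import path are illustrative rather than taken from this commit:

```python
# Hedged usage sketch of @webmethod; route, scope, and import path are assumptions.
from llama_stack.schema_utils import webmethod


@webmethod(route="/v1/example/ping", method="GET", required_scope="monitoring.viewer")
async def ping() -> dict:
    return {"status": "ok"}
```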


@ -22,10 +22,18 @@ from llama_stack.log import get_logger
logger = get_logger(__name__, category="testing") logger = get_logger(__name__, category="testing")
# Global state for the recording system # Global state for the recording system
# Note: Using module globals instead of ContextVars because the session-scoped
# client initialization happens in one async context, but tests run in different
# contexts, and we need the mode/storage to persist across all contexts.
_current_mode: str | None = None _current_mode: str | None = None
_current_storage: ResponseStorage | None = None _current_storage: ResponseStorage | None = None
_original_methods: dict[str, Any] = {} _original_methods: dict[str, Any] = {}
# Test context uses ContextVar since it changes per-test and needs async isolation
from contextvars import ContextVar
_test_context: ContextVar[str | None] = ContextVar("_test_context", default=None)
from openai.types.completion_choice import CompletionChoice from openai.types.completion_choice import CompletionChoice
# update the "finish_reason" field, since its type definition is wrong (no None is accepted) # update the "finish_reason" field, since its type definition is wrong (no None is accepted)
@ -33,22 +41,38 @@ CompletionChoice.model_fields["finish_reason"].annotation = Literal["stop", "len
CompletionChoice.model_rebuild() CompletionChoice.model_rebuild()
REPO_ROOT = Path(__file__).parent.parent.parent REPO_ROOT = Path(__file__).parent.parent.parent
DEFAULT_STORAGE_DIR = REPO_ROOT / "tests/integration/recordings" DEFAULT_STORAGE_DIR = REPO_ROOT / "tests/integration/common"
class InferenceMode(StrEnum): class InferenceMode(StrEnum):
LIVE = "live" LIVE = "live"
RECORD = "record" RECORD = "record"
REPLAY = "replay" REPLAY = "replay"
RECORD_IF_MISSING = "record-if-missing"
def normalize_request(method: str, url: str, headers: dict[str, Any], body: dict[str, Any]) -> str: def normalize_request(method: str, url: str, headers: dict[str, Any], body: dict[str, Any]) -> str:
"""Create a normalized hash of the request for consistent matching.""" """Create a normalized hash of the request for consistent matching.
Includes test_id from context to ensure test isolation - identical requests
from different tests will have different hashes.
Exception: Model list endpoints (/v1/models, /api/tags) exclude test_id since
they are infrastructure/shared and need to work across session setup and tests.
"""
# Extract just the endpoint path # Extract just the endpoint path
from urllib.parse import urlparse from urllib.parse import urlparse
parsed = urlparse(url) parsed = urlparse(url)
normalized = {"method": method.upper(), "endpoint": parsed.path, "body": body} normalized: dict[str, Any] = {
"method": method.upper(),
"endpoint": parsed.path,
"body": body,
}
# Include test_id for isolation, except for shared infrastructure endpoints
if parsed.path not in ("/api/tags", "/v1/models"):
normalized["test_id"] = _test_context.get()
# Create hash - sort_keys=True ensures deterministic ordering # Create hash - sort_keys=True ensures deterministic ordering
normalized_json = json.dumps(normalized, sort_keys=True) normalized_json = json.dumps(normalized, sort_keys=True)
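Because the test id is folded into the normalized payload, the same request issued from two different tests now hashes to two different recordings, while /api/tags and /v1/models stay shared. A small illustration of the idea (hashing details simplified; this is not the exact helper above):

```python
# Simplified illustration of test-scoped request hashing; not the real normalize_request.
import hashlib
import json


def toy_hash(method: str, path: str, body: dict, test_id: str | None) -> str:
    normalized: dict = {"method": method.upper(), "endpoint": path, "body": body}
    if path not in ("/api/tags", "/v1/models"):
        normalized["test_id"] = test_id
    return hashlib.sha256(json.dumps(normalized, sort_keys=True).encode()).hexdigest()[:12]


body = {"model": "llama3.2:3b", "messages": [{"role": "user", "content": "hi"}]}
print(toy_hash("POST", "/v1/chat/completions", body, "tests/a.py::test_one"))
print(toy_hash("POST", "/v1/chat/completions", body, "tests/b.py::test_two"))   # different hash
print(toy_hash("GET", "/v1/models", {}, "tests/a.py::test_one")
      == toy_hash("GET", "/v1/models", {}, "tests/b.py::test_two"))             # True: shared endpoint
```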
@ -67,7 +91,11 @@ def setup_inference_recording():
Currently, this is only supported for OpenAI and Ollama clients. These should cover the vast majority of use cases. Currently, this is only supported for OpenAI and Ollama clients. These should cover the vast majority of use cases.
Two environment variables are supported: Two environment variables are supported:
- LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', or 'replay'. Default is 'replay'. - LLAMA_STACK_TEST_INFERENCE_MODE: The mode to run in. Must be 'live', 'record', 'replay', or 'record-if-missing'. Default is 'replay'.
- 'live': Make all requests live without recording
- 'record': Record all requests (overwrites existing recordings)
- 'replay': Use only recorded responses (fails if recording not found)
- 'record-if-missing': Use recorded responses when available, record new ones when not found
- LLAMA_STACK_TEST_RECORDING_DIR: The directory to store the recordings in. Default is 'tests/integration/recordings'. - LLAMA_STACK_TEST_RECORDING_DIR: The directory to store the recordings in. Default is 'tests/integration/recordings'.
The recordings are stored as JSON files. The recordings are stored as JSON files.
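A hedged sketch of driving the new record-if-missing mode directly from Python via the context manager described above; the import path, server URL, and model id are assumptions, not values from this commit:

```python
# Sketch: exercising record-if-missing mode directly from Python.
import asyncio

from openai import AsyncOpenAI

from llama_stack.testing.inference_recorder import inference_recording  # assumed module path


async def demo() -> None:
    with inference_recording(mode="record-if-missing", storage_dir="tests/integration/recordings"):
        client = AsyncOpenAI(base_url="http://localhost:8321/v1", api_key="none")  # example endpoint
        resp = await client.chat.completions.create(
            model="llama3.2:3b",  # example model id
            messages=[{"role": "user", "content": "hello"}],
        )
        print(resp.choices[0].message.content)


asyncio.run(demo())
```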
@ -80,9 +108,43 @@ def setup_inference_recording():
return inference_recording(mode=mode, storage_dir=storage_dir) return inference_recording(mode=mode, storage_dir=storage_dir)
def _serialize_response(response: Any) -> Any: def _normalize_response_data(data: dict[str, Any], request_hash: str) -> dict[str, Any]:
"""Normalize fields that change between recordings but don't affect functionality.
This reduces noise in git diffs by making IDs deterministic and timestamps constant.
"""
# Only normalize ID for completion/chat responses, not for model objects
# Model objects have "object": "model" and the ID is the actual model identifier
if "id" in data and data.get("object") != "model":
data["id"] = f"rec-{request_hash[:12]}"
# Normalize timestamp to epoch (0) (for OpenAI-style responses)
# But not for model objects where created timestamp might be meaningful
if "created" in data and data.get("object") != "model":
data["created"] = 0
# Normalize Ollama-specific timestamp fields
if "created_at" in data:
data["created_at"] = "1970-01-01T00:00:00.000000Z"
# Normalize Ollama-specific duration fields (these vary based on system load)
if "total_duration" in data and data["total_duration"] is not None:
data["total_duration"] = 0
if "load_duration" in data and data["load_duration"] is not None:
data["load_duration"] = 0
if "prompt_eval_duration" in data and data["prompt_eval_duration"] is not None:
data["prompt_eval_duration"] = 0
if "eval_duration" in data and data["eval_duration"] is not None:
data["eval_duration"] = 0
return data
def _serialize_response(response: Any, request_hash: str = "") -> Any:
if hasattr(response, "model_dump"): if hasattr(response, "model_dump"):
data = response.model_dump(mode="json") data = response.model_dump(mode="json")
# Normalize fields to reduce noise
data = _normalize_response_data(data, request_hash)
return { return {
"__type__": f"{response.__class__.__module__}.{response.__class__.__qualname__}", "__type__": f"{response.__class__.__module__}.{response.__class__.__qualname__}",
"__data__": data, "__data__": data,
@ -120,61 +182,121 @@ def _deserialize_response(data: dict[str, Any]) -> Any:
class ResponseStorage: class ResponseStorage:
"""Handles SQLite index + JSON file storage/retrieval for inference recordings.""" """Handles SQLite index + JSON file storage/retrieval for inference recordings."""
def __init__(self, test_dir: Path): def __init__(self, base_dir: Path):
self.test_dir = test_dir self.base_dir = base_dir
self.responses_dir = self.test_dir / "responses" # Don't create responses_dir here - determine it per-test at runtime
self._ensure_directories() def _get_test_dir(self) -> Path:
"""Get the recordings directory in the test file's parent directory.
For test at "tests/integration/inference/test_foo.py::test_bar",
returns "tests/integration/inference/recordings/".
"""
test_id = _test_context.get()
if test_id:
# Extract the directory path from the test nodeid
# e.g., "tests/integration/inference/test_basic.py::test_foo[params]"
# -> get "tests/integration/inference"
test_file = test_id.split("::")[0] # Remove test function part
test_dir = Path(test_file).parent # Get parent directory
# Put recordings in a "recordings" subdirectory of the test's parent dir
# e.g., "tests/integration/inference" -> "tests/integration/inference/recordings"
return test_dir / "recordings"
else:
# Fallback for non-test contexts
return self.base_dir / "recordings"
def _ensure_directories(self): def _ensure_directories(self):
self.test_dir.mkdir(parents=True, exist_ok=True) """Ensure test-specific directories exist."""
self.responses_dir.mkdir(exist_ok=True) test_dir = self._get_test_dir()
test_dir.mkdir(parents=True, exist_ok=True)
return test_dir
def store_recording(self, request_hash: str, request: dict[str, Any], response: dict[str, Any]): def store_recording(self, request_hash: str, request: dict[str, Any], response: dict[str, Any]):
"""Store a request/response pair.""" """Store a request/response pair."""
# Generate unique response filename responses_dir = self._ensure_directories()
short_hash = request_hash[:12]
response_file = f"{short_hash}.json" # Use FULL hash (not truncated)
response_file = f"{request_hash}.json"
# Serialize response body if needed # Serialize response body if needed
serialized_response = dict(response) serialized_response = dict(response)
if "body" in serialized_response: if "body" in serialized_response:
if isinstance(serialized_response["body"], list): if isinstance(serialized_response["body"], list):
# Handle streaming responses (list of chunks) # Handle streaming responses (list of chunks)
serialized_response["body"] = [_serialize_response(chunk) for chunk in serialized_response["body"]] serialized_response["body"] = [
_serialize_response(chunk, request_hash) for chunk in serialized_response["body"]
]
else: else:
# Handle single response # Handle single response
serialized_response["body"] = _serialize_response(serialized_response["body"]) serialized_response["body"] = _serialize_response(serialized_response["body"], request_hash)
# If this is an Ollama /api/tags recording, include models digest in filename to distinguish variants # For model-list endpoints, include digest in filename to distinguish different model sets
endpoint = request.get("endpoint") endpoint = request.get("endpoint")
if endpoint in ("/api/tags", "/v1/models"): if endpoint in ("/api/tags", "/v1/models"):
digest = _model_identifiers_digest(endpoint, response) digest = _model_identifiers_digest(endpoint, response)
response_file = f"models-{short_hash}-{digest}.json" response_file = f"models-{request_hash}-{digest}.json"
response_path = self.responses_dir / response_file response_path = responses_dir / response_file
# Save response to JSON file # Save response to JSON file with metadata
with open(response_path, "w") as f: with open(response_path, "w") as f:
json.dump({"request": request, "response": serialized_response}, f, indent=2) json.dump(
{
"test_id": _test_context.get(), # Include for debugging
"request": request,
"response": serialized_response,
},
f,
indent=2,
)
f.write("\n") f.write("\n")
f.flush() f.flush()
def find_recording(self, request_hash: str) -> dict[str, Any] | None: def find_recording(self, request_hash: str) -> dict[str, Any] | None:
"""Find a recorded response by request hash.""" """Find a recorded response by request hash.
response_file = f"{request_hash[:12]}.json"
response_path = self.responses_dir / response_file
if not response_path.exists(): Uses fallback: first checks test-specific dir, then falls back to base recordings dir.
return None This handles cases where recordings happen during session setup (no test context) but
are requested during tests (with test context).
"""
response_file = f"{request_hash}.json"
return _recording_from_file(response_path) # Try test-specific directory first
test_dir = self._get_test_dir()
response_path = test_dir / response_file
def _model_list_responses(self, short_hash: str) -> list[dict[str, Any]]: if response_path.exists():
return _recording_from_file(response_path)
# Fallback to base recordings directory (for session-level recordings)
fallback_dir = self.base_dir / "recordings"
fallback_path = fallback_dir / response_file
if fallback_path.exists():
return _recording_from_file(fallback_path)
return None
def _model_list_responses(self, request_hash: str) -> list[dict[str, Any]]:
"""Find all model-list recordings with the given hash (different digests)."""
results: list[dict[str, Any]] = [] results: list[dict[str, Any]] = []
for path in self.responses_dir.glob(f"models-{short_hash}-*.json"):
data = _recording_from_file(path) # Check test-specific directory first
results.append(data) test_dir = self._get_test_dir()
if test_dir.exists():
for path in test_dir.glob(f"models-{request_hash}-*.json"):
data = _recording_from_file(path)
results.append(data)
# Also check fallback directory
fallback_dir = self.base_dir / "recordings"
if fallback_dir.exists():
for path in fallback_dir.glob(f"models-{request_hash}-*.json"):
data = _recording_from_file(path)
results.append(data)
return results return results
@ -195,6 +317,8 @@ def _recording_from_file(response_path) -> dict[str, Any]:
def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str: def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str:
"""Generate a digest from model identifiers for distinguishing different model sets."""
def _extract_model_identifiers(): def _extract_model_identifiers():
"""Extract a stable set of identifiers for model-list endpoints. """Extract a stable set of identifiers for model-list endpoints.
@ -217,7 +341,14 @@ def _model_identifiers_digest(endpoint: str, response: dict[str, Any]) -> str:
def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None: def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None:
"""Return a single, unioned recording for supported model-list endpoints.""" """Return a single, unioned recording for supported model-list endpoints.
Merges multiple recordings with different model sets (from different servers) into
a single response containing all models.
"""
if not records:
return None
seen: dict[str, dict[str, Any]] = {} seen: dict[str, dict[str, Any]] = {}
for rec in records: for rec in records:
body = rec["response"]["body"] body = rec["response"]["body"]
@ -246,7 +377,10 @@ def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]])
async def _patched_inference_method(original_method, self, client_type, endpoint, *args, **kwargs): async def _patched_inference_method(original_method, self, client_type, endpoint, *args, **kwargs):
global _current_mode, _current_storage global _current_mode, _current_storage
if _current_mode == InferenceMode.LIVE or _current_storage is None: mode = _current_mode
storage = _current_storage
if mode == InferenceMode.LIVE or storage is None:
if endpoint == "/v1/models": if endpoint == "/v1/models":
return original_method(self, *args, **kwargs) return original_method(self, *args, **kwargs)
else: else:
@ -277,13 +411,16 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
request_hash = normalize_request(method, url, headers, body) request_hash = normalize_request(method, url, headers, body)
if _current_mode == InferenceMode.REPLAY: # Try to find existing recording for REPLAY or RECORD_IF_MISSING modes
# Special handling for model-list endpoints: return union of all responses recording = None
if mode == InferenceMode.REPLAY or mode == InferenceMode.RECORD_IF_MISSING:
# Special handling for model-list endpoints: merge all recordings with this hash
if endpoint in ("/api/tags", "/v1/models"): if endpoint in ("/api/tags", "/v1/models"):
records = _current_storage._model_list_responses(request_hash[:12]) records = storage._model_list_responses(request_hash)
recording = _combine_model_list_responses(endpoint, records) recording = _combine_model_list_responses(endpoint, records)
else: else:
recording = _current_storage.find_recording(request_hash) recording = storage.find_recording(request_hash)
if recording: if recording:
response_body = recording["response"]["body"] response_body = recording["response"]["body"]
@ -296,7 +433,8 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
return replay_stream() return replay_stream()
else: else:
return response_body return response_body
else: elif mode == InferenceMode.REPLAY:
# REPLAY mode requires recording to exist
raise RuntimeError( raise RuntimeError(
f"No recorded response found for request hash: {request_hash}\n" f"No recorded response found for request hash: {request_hash}\n"
f"Request: {method} {url} {body}\n" f"Request: {method} {url} {body}\n"
@ -304,7 +442,7 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
f"To record this response, run with LLAMA_STACK_TEST_INFERENCE_MODE=record" f"To record this response, run with LLAMA_STACK_TEST_INFERENCE_MODE=record"
) )
elif _current_mode == InferenceMode.RECORD: if mode == InferenceMode.RECORD or (mode == InferenceMode.RECORD_IF_MISSING and not recording):
if endpoint == "/v1/models": if endpoint == "/v1/models":
response = original_method(self, *args, **kwargs) response = original_method(self, *args, **kwargs)
else: else:
@ -335,7 +473,7 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
# Store the recording immediately # Store the recording immediately
response_data = {"body": chunks, "is_streaming": True} response_data = {"body": chunks, "is_streaming": True}
_current_storage.store_recording(request_hash, request_data, response_data) storage.store_recording(request_hash, request_data, response_data)
# Return a generator that replays the stored chunks # Return a generator that replays the stored chunks
async def replay_recorded_stream(): async def replay_recorded_stream():
@ -345,11 +483,11 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
return replay_recorded_stream() return replay_recorded_stream()
else: else:
response_data = {"body": response, "is_streaming": False} response_data = {"body": response, "is_streaming": False}
_current_storage.store_recording(request_hash, request_data, response_data) storage.store_recording(request_hash, request_data, response_data)
return response return response
else: else:
raise AssertionError(f"Invalid mode: {_current_mode}") raise AssertionError(f"Invalid mode: {mode}")
def patch_inference_clients(): def patch_inference_clients():
@ -490,9 +628,9 @@ def inference_recording(mode: str, storage_dir: str | Path | None = None) -> Gen
try: try:
_current_mode = mode _current_mode = mode
if mode in ["record", "replay"]: if mode in ["record", "replay", "record-if-missing"]:
if storage_dir is None: if storage_dir is None:
raise ValueError("storage_dir is required for record and replay modes") raise ValueError("storage_dir is required for record, replay, and record-if-missing modes")
_current_storage = ResponseStorage(Path(storage_dir)) _current_storage = ResponseStorage(Path(storage_dir))
patch_inference_clients() patch_inference_clients()
@ -500,7 +638,7 @@ def inference_recording(mode: str, storage_dir: str | Path | None = None) -> Gen
finally: finally:
# Restore previous state # Restore previous state
if mode in ["record", "replay"]: if mode in ["record", "replay", "record-if-missing"]:
unpatch_inference_clients() unpatch_inference_clients()
_current_mode = prev_mode _current_mode = prev_mode


@ -20,11 +20,11 @@
"framer-motion": "^12.23.12", "framer-motion": "^12.23.12",
"llama-stack-client": "^0.2.23", "llama-stack-client": "^0.2.23",
"lucide-react": "^0.542.0", "lucide-react": "^0.542.0",
"next": "15.5.3", "next": "15.5.4",
"next-auth": "^4.24.11", "next-auth": "^4.24.11",
"next-themes": "^0.4.6", "next-themes": "^0.4.6",
"react": "^19.0.0", "react": "^19.0.0",
"react-dom": "^19.1.1", "react-dom": "^19.2.0",
"react-markdown": "^10.1.0", "react-markdown": "^10.1.0",
"remark-gfm": "^4.0.1", "remark-gfm": "^4.0.1",
"remeda": "^2.32.0", "remeda": "^2.32.0",
@ -2279,9 +2279,9 @@
} }
}, },
"node_modules/@next/env": { "node_modules/@next/env": {
"version": "15.5.3", "version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/env/-/env-15.5.3.tgz", "resolved": "https://registry.npmjs.org/@next/env/-/env-15.5.4.tgz",
"integrity": "sha512-RSEDTRqyihYXygx/OJXwvVupfr9m04+0vH8vyy0HfZ7keRto6VX9BbEk0J2PUk0VGy6YhklJUSrgForov5F9pw==", "integrity": "sha512-27SQhYp5QryzIT5uO8hq99C69eLQ7qkzkDPsk3N+GuS2XgOgoYEeOav7Pf8Tn4drECOVDsDg8oj+/DVy8qQL2A==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/@next/eslint-plugin-next": { "node_modules/@next/eslint-plugin-next": {
@ -2295,9 +2295,9 @@
} }
}, },
"node_modules/@next/swc-darwin-arm64": { "node_modules/@next/swc-darwin-arm64": {
"version": "15.5.3", "version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-15.5.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-15.5.4.tgz",
"integrity": "sha512-nzbHQo69+au9wJkGKTU9lP7PXv0d1J5ljFpvb+LnEomLtSbJkbZyEs6sbF3plQmiOB2l9OBtN2tNSvCH1nQ9Jg==", "integrity": "sha512-nopqz+Ov6uvorej8ndRX6HlxCYWCO3AHLfKK2TYvxoSB2scETOcfm/HSS3piPqc3A+MUgyHoqE6je4wnkjfrOA==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@ -2311,9 +2311,9 @@
} }
}, },
"node_modules/@next/swc-darwin-x64": { "node_modules/@next/swc-darwin-x64": {
"version": "15.5.3", "version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-15.5.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-15.5.4.tgz",
"integrity": "sha512-w83w4SkOOhekJOcA5HBvHyGzgV1W/XvOfpkrxIse4uPWhYTTRwtGEM4v/jiXwNSJvfRvah0H8/uTLBKRXlef8g==", "integrity": "sha512-QOTCFq8b09ghfjRJKfb68kU9k2K+2wsC4A67psOiMn849K9ZXgCSRQr0oVHfmKnoqCbEmQWG1f2h1T2vtJJ9mA==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -2327,9 +2327,9 @@
} }
}, },
"node_modules/@next/swc-linux-arm64-gnu": { "node_modules/@next/swc-linux-arm64-gnu": {
"version": "15.5.3", "version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-15.5.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-15.5.4.tgz",
"integrity": "sha512-+m7pfIs0/yvgVu26ieaKrifV8C8yiLe7jVp9SpcIzg7XmyyNE7toC1fy5IOQozmr6kWl/JONC51osih2RyoXRw==", "integrity": "sha512-eRD5zkts6jS3VfE/J0Kt1VxdFqTnMc3QgO5lFE5GKN3KDI/uUpSyK3CjQHmfEkYR4wCOl0R0XrsjpxfWEA++XA==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@ -2343,9 +2343,9 @@
} }
}, },
"node_modules/@next/swc-linux-arm64-musl": { "node_modules/@next/swc-linux-arm64-musl": {
"version": "15.5.3", "version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-15.5.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-15.5.4.tgz",
"integrity": "sha512-u3PEIzuguSenoZviZJahNLgCexGFhso5mxWCrrIMdvpZn6lkME5vc/ADZG8UUk5K1uWRy4hqSFECrON6UKQBbQ==", "integrity": "sha512-TOK7iTxmXFc45UrtKqWdZ1shfxuL4tnVAOuuJK4S88rX3oyVV4ZkLjtMT85wQkfBrOOvU55aLty+MV8xmcJR8A==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@ -2359,9 +2359,9 @@
} }
}, },
"node_modules/@next/swc-linux-x64-gnu": { "node_modules/@next/swc-linux-x64-gnu": {
"version": "15.5.3", "version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-15.5.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-15.5.4.tgz",
"integrity": "sha512-lDtOOScYDZxI2BENN9m0pfVPJDSuUkAD1YXSvlJF0DKwZt0WlA7T7o3wrcEr4Q+iHYGzEaVuZcsIbCps4K27sA==", "integrity": "sha512-7HKolaj+481FSW/5lL0BcTkA4Ueam9SPYWyN/ib/WGAFZf0DGAN8frNpNZYFHtM4ZstrHZS3LY3vrwlIQfsiMA==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -2375,9 +2375,9 @@
} }
}, },
"node_modules/@next/swc-linux-x64-musl": { "node_modules/@next/swc-linux-x64-musl": {
"version": "15.5.3", "version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-15.5.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-15.5.4.tgz",
"integrity": "sha512-9vWVUnsx9PrY2NwdVRJ4dUURAQ8Su0sLRPqcCCxtX5zIQUBES12eRVHq6b70bbfaVaxIDGJN2afHui0eDm+cLg==", "integrity": "sha512-nlQQ6nfgN0nCO/KuyEUwwOdwQIGjOs4WNMjEUtpIQJPR2NUfmGpW2wkJln1d4nJ7oUzd1g4GivH5GoEPBgfsdw==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -2391,9 +2391,9 @@
} }
}, },
"node_modules/@next/swc-win32-arm64-msvc": { "node_modules/@next/swc-win32-arm64-msvc": {
"version": "15.5.3", "version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-15.5.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-15.5.4.tgz",
"integrity": "sha512-1CU20FZzY9LFQigRi6jM45oJMU3KziA5/sSG+dXeVaTm661snQP6xu3ykGxxwU5sLG3sh14teO/IOEPVsQMRfA==", "integrity": "sha512-PcR2bN7FlM32XM6eumklmyWLLbu2vs+D7nJX8OAIoWy69Kef8mfiN4e8TUv2KohprwifdpFKPzIP1njuCjD0YA==",
"cpu": [ "cpu": [
"arm64" "arm64"
], ],
@ -2407,9 +2407,9 @@
} }
}, },
"node_modules/@next/swc-win32-x64-msvc": { "node_modules/@next/swc-win32-x64-msvc": {
"version": "15.5.3", "version": "15.5.4",
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-15.5.3.tgz", "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-15.5.4.tgz",
"integrity": "sha512-JMoLAq3n3y5tKXPQwCK5c+6tmwkuFDa2XAxz8Wm4+IVthdBZdZGh+lmiLUHg9f9IDwIQpUjp+ysd6OkYTyZRZw==", "integrity": "sha512-1ur2tSHZj8Px/KMAthmuI9FMp/YFusMMGoRNJaRZMOlSkgvLjzosSdQI0cJAKogdHl3qXUQKL9MGaYvKwA7DXg==",
"cpu": [ "cpu": [
"x64" "x64"
], ],
@ -3995,22 +3995,22 @@
} }
}, },
"node_modules/@types/react": { "node_modules/@types/react": {
"version": "19.1.4", "version": "19.2.0",
"resolved": "https://registry.npmjs.org/@types/react/-/react-19.1.4.tgz", "resolved": "https://registry.npmjs.org/@types/react/-/react-19.2.0.tgz",
"integrity": "sha512-EB1yiiYdvySuIITtD5lhW4yPyJ31RkJkkDw794LaQYrxCSaQV/47y5o1FMC4zF9ZyjUjzJMZwbovEnT5yHTW6g==", "integrity": "sha512-1LOH8xovvsKsCBq1wnT4ntDUdCJKmnEakhsuoUSy6ExlHCkGP2hqnatagYTgFk6oeL0VU31u7SNjunPN+GchtA==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"csstype": "^3.0.2" "csstype": "^3.0.2"
} }
}, },
"node_modules/@types/react-dom": { "node_modules/@types/react-dom": {
"version": "19.1.9", "version": "19.2.0",
"resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.1.9.tgz", "resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.2.0.tgz",
"integrity": "sha512-qXRuZaOsAdXKFyOhRBg6Lqqc0yay13vN7KrIg4L7N4aaHN68ma9OK3NE1BoDFgFOTfM7zg+3/8+2n8rLUH3OKQ==", "integrity": "sha512-brtBs0MnE9SMx7px208g39lRmC5uHZs96caOJfTjFcYSLHNamvaSMfJNagChVNkup2SdtOxKX1FDBkRSJe1ZAg==",
"devOptional": true, "devOptional": true,
"license": "MIT", "license": "MIT",
"peerDependencies": { "peerDependencies": {
"@types/react": "^19.0.0" "@types/react": "^19.2.0"
} }
}, },
"node_modules/@types/stack-utils": { "node_modules/@types/stack-utils": {
@ -11414,12 +11414,12 @@
} }
}, },
"node_modules/next": { "node_modules/next": {
"version": "15.5.3", "version": "15.5.4",
"resolved": "https://registry.npmjs.org/next/-/next-15.5.3.tgz", "resolved": "https://registry.npmjs.org/next/-/next-15.5.4.tgz",
"integrity": "sha512-r/liNAx16SQj4D+XH/oI1dlpv9tdKJ6cONYPwwcCC46f2NjpaRWY+EKCzULfgQYV6YKXjHBchff2IZBSlZmJNw==", "integrity": "sha512-xH4Yjhb82sFYQfY3vbkJfgSDgXvBB6a8xPs9i35k6oZJRoQRihZH+4s9Yo2qsWpzBmZ3lPXaJ2KPXLfkvW4LnA==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@next/env": "15.5.3", "@next/env": "15.5.4",
"@swc/helpers": "0.5.15", "@swc/helpers": "0.5.15",
"caniuse-lite": "^1.0.30001579", "caniuse-lite": "^1.0.30001579",
"postcss": "8.4.31", "postcss": "8.4.31",
@ -11432,14 +11432,14 @@
"node": "^18.18.0 || ^19.8.0 || >= 20.0.0" "node": "^18.18.0 || ^19.8.0 || >= 20.0.0"
}, },
"optionalDependencies": { "optionalDependencies": {
"@next/swc-darwin-arm64": "15.5.3", "@next/swc-darwin-arm64": "15.5.4",
"@next/swc-darwin-x64": "15.5.3", "@next/swc-darwin-x64": "15.5.4",
"@next/swc-linux-arm64-gnu": "15.5.3", "@next/swc-linux-arm64-gnu": "15.5.4",
"@next/swc-linux-arm64-musl": "15.5.3", "@next/swc-linux-arm64-musl": "15.5.4",
"@next/swc-linux-x64-gnu": "15.5.3", "@next/swc-linux-x64-gnu": "15.5.4",
"@next/swc-linux-x64-musl": "15.5.3", "@next/swc-linux-x64-musl": "15.5.4",
"@next/swc-win32-arm64-msvc": "15.5.3", "@next/swc-win32-arm64-msvc": "15.5.4",
"@next/swc-win32-x64-msvc": "15.5.3", "@next/swc-win32-x64-msvc": "15.5.4",
"sharp": "^0.34.3" "sharp": "^0.34.3"
}, },
"peerDependencies": { "peerDependencies": {
@ -12450,24 +12450,24 @@
} }
}, },
"node_modules/react": { "node_modules/react": {
"version": "19.1.1", "version": "19.2.0",
"resolved": "https://registry.npmjs.org/react/-/react-19.1.1.tgz", "resolved": "https://registry.npmjs.org/react/-/react-19.2.0.tgz",
"integrity": "sha512-w8nqGImo45dmMIfljjMwOGtbmC/mk4CMYhWIicdSflH91J9TyCyczcPFXJzrZ/ZXcgGRFeP6BU0BEJTw6tZdfQ==", "integrity": "sha512-tmbWg6W31tQLeB5cdIBOicJDJRR2KzXsV7uSK9iNfLWQ5bIZfxuPEHp7M8wiHyHnn0DD1i7w3Zmin0FtkrwoCQ==",
"license": "MIT", "license": "MIT",
"engines": { "engines": {
"node": ">=0.10.0" "node": ">=0.10.0"
} }
}, },
"node_modules/react-dom": { "node_modules/react-dom": {
"version": "19.1.1", "version": "19.2.0",
"resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.1.1.tgz", "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.2.0.tgz",
"integrity": "sha512-Dlq/5LAZgF0Gaz6yiqZCf6VCcZs1ghAJyrsu84Q/GT0gV+mCxbfmKNoGRKBYMJ8IEdGPqu49YWXD02GCknEDkw==", "integrity": "sha512-UlbRu4cAiGaIewkPyiRGJk0imDN2T3JjieT6spoL2UeSf5od4n5LB/mQ4ejmxhCFT1tYe8IvaFulzynWovsEFQ==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"scheduler": "^0.26.0" "scheduler": "^0.27.0"
}, },
"peerDependencies": { "peerDependencies": {
"react": "^19.1.1" "react": "^19.2.0"
} }
}, },
"node_modules/react-is": { "node_modules/react-is": {
@ -12982,9 +12982,9 @@
} }
}, },
"node_modules/scheduler": { "node_modules/scheduler": {
"version": "0.26.0", "version": "0.27.0",
"resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.26.0.tgz", "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.27.0.tgz",
"integrity": "sha512-NlHwttCI/l5gCPR3D1nNXtWABUmBwvZpEQiD4IXSbIDq8BzLIK/7Ir5gTFSGZDUu37K5cMNp0hFtzO38sC7gWA==", "integrity": "sha512-eNv+WrVbKu1f3vbYJT/xtiF5syA5HPIMtf9IgY/nKg0sWqzAUEvqY/xm7OcZc/qafLx/iO9FgOmeSAp4v5ti/Q==",
"license": "MIT" "license": "MIT"
}, },
"node_modules/semver": { "node_modules/semver": {


@ -25,11 +25,11 @@
"framer-motion": "^12.23.12", "framer-motion": "^12.23.12",
"llama-stack-client": "^0.2.23", "llama-stack-client": "^0.2.23",
"lucide-react": "^0.542.0", "lucide-react": "^0.542.0",
"next": "15.5.3", "next": "15.5.4",
"next-auth": "^4.24.11", "next-auth": "^4.24.11",
"next-themes": "^0.4.6", "next-themes": "^0.4.6",
"react": "^19.0.0", "react": "^19.0.0",
"react-dom": "^19.1.1", "react-dom": "^19.2.0",
"react-markdown": "^10.1.0", "react-markdown": "^10.1.0",
"remark-gfm": "^4.0.1", "remark-gfm": "^4.0.1",
"remeda": "^2.32.0", "remeda": "^2.32.0",


@ -99,6 +99,7 @@ unit = [
"coverage", "coverage",
"chromadb>=1.0.15", "chromadb>=1.0.15",
"moto[s3]>=5.1.10", "moto[s3]>=5.1.10",
"weaviate-client>=4.16.4",
] ]
# These are the core dependencies required for running integration tests. They are shared across all # These are the core dependencies required for running integration tests. They are shared across all
# providers. If a provider requires additional dependencies, please add them to your environment # providers. If a provider requires additional dependencies, please add them to your environment
@ -277,14 +278,10 @@ exclude = [
"^llama_stack/providers/remote/datasetio/huggingface/", "^llama_stack/providers/remote/datasetio/huggingface/",
"^llama_stack/providers/remote/datasetio/nvidia/", "^llama_stack/providers/remote/datasetio/nvidia/",
"^llama_stack/providers/remote/inference/bedrock/", "^llama_stack/providers/remote/inference/bedrock/",
"^llama_stack/providers/remote/inference/cerebras/",
"^llama_stack/providers/remote/inference/databricks/",
"^llama_stack/providers/remote/inference/fireworks/",
"^llama_stack/providers/remote/inference/nvidia/", "^llama_stack/providers/remote/inference/nvidia/",
"^llama_stack/providers/remote/inference/passthrough/", "^llama_stack/providers/remote/inference/passthrough/",
"^llama_stack/providers/remote/inference/runpod/", "^llama_stack/providers/remote/inference/runpod/",
"^llama_stack/providers/remote/inference/tgi/", "^llama_stack/providers/remote/inference/tgi/",
"^llama_stack/providers/remote/inference/together/",
"^llama_stack/providers/remote/inference/watsonx/", "^llama_stack/providers/remote/inference/watsonx/",
"^llama_stack/providers/remote/safety/bedrock/", "^llama_stack/providers/remote/safety/bedrock/",
"^llama_stack/providers/remote/safety/nvidia/", "^llama_stack/providers/remote/safety/nvidia/",

scripts/normalize_recordings.py Executable file

@ -0,0 +1,120 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
"""
Utility script to re-normalize existing recording files.

This script reads all recording JSON files and applies the normalization
to make IDs deterministic and timestamps constant. This reduces noise in
git diffs when recordings are re-recorded.

Usage:
    python scripts/normalize_recordings.py [--dry-run]
"""

import argparse
import json
from pathlib import Path


def normalize_response_data(data: dict, request_hash: str) -> dict:
    """Normalize fields that change between recordings but don't affect functionality."""
    # Only normalize ID for completion/chat responses, not for model objects
    # Model objects have "object": "model" and the ID is the actual model identifier
    if "id" in data and data.get("object") != "model":
        data["id"] = f"rec-{request_hash[:12]}"

    # Normalize timestamp to epoch (0) (for OpenAI-style responses)
    # But not for model objects where created timestamp might be meaningful
    if "created" in data and data.get("object") != "model":
        data["created"] = 0

    # Normalize Ollama-specific timestamp fields
    if "created_at" in data:
        data["created_at"] = "1970-01-01T00:00:00.000000Z"

    # Normalize Ollama-specific duration fields (these vary based on system load)
    if "total_duration" in data and data["total_duration"] is not None:
        data["total_duration"] = 0
    if "load_duration" in data and data["load_duration"] is not None:
        data["load_duration"] = 0
    if "prompt_eval_duration" in data and data["prompt_eval_duration"] is not None:
        data["prompt_eval_duration"] = 0
    if "eval_duration" in data and data["eval_duration"] is not None:
        data["eval_duration"] = 0

    return data


def normalize_recording_file(file_path: Path, dry_run: bool = False) -> bool:
    """Normalize a single recording file. Returns True if file was modified."""
    with open(file_path) as f:
        recording = json.load(f)

    # Extract request hash from filename (first 12 chars)
    request_hash = file_path.stem.split("-")[-1] if "-" in file_path.stem else file_path.stem

    modified = False
    old_recording = json.dumps(recording, sort_keys=True)

    # NOTE: We do NOT normalize request body here because that would change the request hash
    # and break recording lookups. The recorder will normalize tool_call_ids in future recordings.

    # Normalize response body
    if "response" in recording and "body" in recording["response"]:
        body = recording["response"]["body"]
        if isinstance(body, list):
            # Handle streaming responses (list of chunks)
            for chunk in body:
                if isinstance(chunk, dict) and "__data__" in chunk:
                    normalize_response_data(chunk["__data__"], request_hash)
        elif isinstance(body, dict) and "__data__" in body:
            # Handle single response
            normalize_response_data(body["__data__"], request_hash)

    # Check if anything changed
    new_recording = json.dumps(recording, sort_keys=True)
    modified = old_recording != new_recording

    if modified and not dry_run:
        with open(file_path, "w") as f:
            json.dump(recording, f, indent=2)
            f.write("\n")

    return modified


def main():
    parser = argparse.ArgumentParser(description="Normalize recording files to reduce git diff noise")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be changed without modifying files")
    args = parser.parse_args()

    recordings_dir = Path(__file__).parent.parent / "tests/integration/recordings/responses"
    if not recordings_dir.exists():
        print(f"Recordings directory not found: {recordings_dir}")
        return 1

    modified_count = 0
    total_count = 0

    for file_path in sorted(recordings_dir.glob("*.json")):
        total_count += 1
        was_modified = normalize_recording_file(file_path, dry_run=args.dry_run)
        if was_modified:
            modified_count += 1
            status = "[DRY RUN] Would normalize" if args.dry_run else "Normalized"
            print(f"{status}: {file_path.name}")

    print(f"\n{'[DRY RUN] ' if args.dry_run else ''}Summary: {modified_count}/{total_count} files modified")
    return 0


if __name__ == "__main__":
    exit(main())

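As a point of reference, here is a minimal sketch of what the normalization above does to one streaming chunk. The dict literal is illustrative (the values are borrowed from recordings further down in this diff), not read from disk:

```python
# Illustrative only: a condensed restatement of the normalization rules above,
# applied to a made-up streaming chunk so the before/after is visible.
import json

chunk = {
    "id": "chatcmpl-130",            # provider-assigned, changes every run
    "object": "chat.completion.chunk",
    "created": 1759437810,           # wall-clock timestamp, changes every run
    "model": "llama3.2:3b-instruct-fp16",
}
request_hash = "044dcd8fdeb1"        # taken from the recording's filename

if chunk.get("object") != "model":
    chunk["id"] = f"rec-{request_hash[:12]}"
    chunk["created"] = 0

print(json.dumps(chunk, indent=2))
# -> "id": "rec-044dcd8fdeb1", "created": 0 — matching the recording diffs below
```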
View file

@ -0,0 +1,15 @@
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true

  - name: Jaeger
    type: jaeger
    access: proxy
    url: http://jaeger:16686
    editable: true

View file

@ -0,0 +1,40 @@
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318

processors:
  batch:
    timeout: 1s
    send_batch_size: 1024

exporters:
  # Export traces to Jaeger
  otlp/jaeger:
    endpoint: jaeger:4317
    tls:
      insecure: true

  # Export metrics to Prometheus
  prometheus:
    endpoint: 0.0.0.0:9464
    namespace: llama_stack

  # Debug exporter for troubleshooting
  debug:
    verbosity: detailed

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [batch]
      exporters: [otlp/jaeger, debug]
    metrics:
      receivers: [otlp]
      processors: [batch]
      exporters: [prometheus, debug]

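The collector above accepts OTLP on 4317 (gRPC) and 4318 (HTTP). Below is a hedged sketch of pushing a single test span at it from Python; it assumes the `opentelemetry-sdk` and `opentelemetry-exporter-otlp` packages are installed and that the collector is reachable on localhost:

```python
# Sketch only: emit one test span to the collector configured above (assumes
# the collector is published on localhost:4317 as in the receivers block).
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

provider = TracerProvider()
provider.add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:4317", insecure=True))
)
trace.set_tracer_provider(provider)

with trace.get_tracer("smoke-test").start_as_current_span("telemetry-stack-check"):
    pass  # the span should surface in Jaeger via the otlp/jaeger exporter

provider.shutdown()  # flush the batch processor before exit
```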
View file

@ -0,0 +1,12 @@
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'otel-collector'
    static_configs:
      - targets: ['otel-collector:9464']

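Because the collector's prometheus exporter uses `namespace: llama_stack`, metrics scraped through the config above land in Prometheus with a `llama_stack_` prefix. A small sketch that lists them via the Prometheus HTTP API (the endpoint and port assume the default local setup from this stack):

```python
# Sketch: ask Prometheus (which scrapes the collector's 9464 endpoint per the
# config above) for the metric names it has seen, then filter by prefix.
import json
import urllib.request

url = "http://localhost:9090/api/v1/label/__name__/values"
with urllib.request.urlopen(url, timeout=5) as resp:
    names = json.load(resp)["data"]

print([n for n in names if n.startswith("llama_stack_")])
```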
View file

@ -17,6 +17,7 @@
set -Eeuo pipefail set -Eeuo pipefail
CONTAINER_RUNTIME=${CONTAINER_RUNTIME:-docker} CONTAINER_RUNTIME=${CONTAINER_RUNTIME:-docker}
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "🚀 Setting up telemetry stack for Llama Stack using Podman..." echo "🚀 Setting up telemetry stack for Llama Stack using Podman..."
@ -53,7 +54,7 @@ $CONTAINER_RUNTIME run -d --name otel-collector \
-p 4317:4317 \ -p 4317:4317 \
-p 9464:9464 \ -p 9464:9464 \
-p 13133:13133 \ -p 13133:13133 \
-v $(pwd)/otel-collector-config.yaml:/etc/otel-collector-config.yaml:Z \ -v "$SCRIPT_DIR/otel-collector-config.yaml:/etc/otel-collector-config.yaml:Z" \
docker.io/otel/opentelemetry-collector-contrib:latest \ docker.io/otel/opentelemetry-collector-contrib:latest \
--config /etc/otel-collector-config.yaml --config /etc/otel-collector-config.yaml
@ -62,7 +63,7 @@ echo "📈 Starting Prometheus..."
$CONTAINER_RUNTIME run -d --name prometheus \ $CONTAINER_RUNTIME run -d --name prometheus \
--network llama-telemetry \ --network llama-telemetry \
-p 9090:9090 \ -p 9090:9090 \
-v $(pwd)/prometheus.yml:/etc/prometheus/prometheus.yml:Z \ -v "$SCRIPT_DIR/prometheus.yml:/etc/prometheus/prometheus.yml:Z" \
docker.io/prom/prometheus:latest \ docker.io/prom/prometheus:latest \
--config.file=/etc/prometheus/prometheus.yml \ --config.file=/etc/prometheus/prometheus.yml \
--storage.tsdb.path=/prometheus \ --storage.tsdb.path=/prometheus \
@ -72,13 +73,15 @@ $CONTAINER_RUNTIME run -d --name prometheus \
--web.enable-lifecycle --web.enable-lifecycle
# Start Grafana # Start Grafana
# Note: Using 11.0.0 because grafana:latest arm64 image has a broken /run.sh (0 bytes)
echo "📊 Starting Grafana..." echo "📊 Starting Grafana..."
$CONTAINER_RUNTIME run -d --name grafana \ $CONTAINER_RUNTIME run -d --name grafana \
--network llama-telemetry \ --network llama-telemetry \
-p 3000:3000 \ -p 3000:3000 \
-e GF_SECURITY_ADMIN_PASSWORD=admin \ -e GF_SECURITY_ADMIN_PASSWORD=admin \
-e GF_USERS_ALLOW_SIGN_UP=false \ -e GF_USERS_ALLOW_SIGN_UP=false \
docker.io/grafana/grafana:latest -v "$SCRIPT_DIR/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \
docker.io/grafana/grafana:11.0.0
# Wait for services to start # Wait for services to start
echo "⏳ Waiting for services to start..." echo "⏳ Waiting for services to start..."

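A quick, optional way to confirm the three containers started by this script are answering. The ports come from the `-p` flags above; the health-check paths are assumptions about the stock Prometheus/Grafana images and the collector's health_check extension, not something defined in this diff:

```python
# Sanity check for the telemetry stack started by the setup script.
import urllib.request

checks = {
    "otel-collector (health_check extension)": "http://localhost:13133",
    "Prometheus": "http://localhost:9090/-/ready",
    "Grafana": "http://localhost:3000/api/health",
}

for name, url in checks.items():
    try:
        with urllib.request.urlopen(url, timeout=3) as resp:
            print(f"{name}: HTTP {resp.status}")
    except Exception as exc:  # connection refused, timeout, etc.
        print(f"{name}: not reachable ({exc})")
```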
View file

@ -125,21 +125,28 @@ pytest -s -v tests/integration/vector_io/ \
## Recording Modes ## Recording Modes
The testing system supports three modes controlled by environment variables: The testing system supports four modes controlled by environment variables:
### REPLAY Mode (Default) ### REPLAY Mode (Default)
Uses cached responses instead of making API calls: Uses cached responses instead of making API calls:
```bash ```bash
pytest tests/integration/ pytest tests/integration/
``` ```
### RECORD-IF-MISSING Mode (Recommended for adding new tests)
Records only when no recording exists, otherwise replays. This is the preferred mode for iterative development:
```bash
pytest tests/integration/inference/test_new_feature.py --inference-mode=record-if-missing
```
### RECORD Mode ### RECORD Mode
Captures API interactions for later replay: **Force-records all API interactions**, overwriting existing recordings. Use with caution as this will re-record everything:
```bash ```bash
pytest tests/integration/inference/test_new_feature.py --inference-mode=record pytest tests/integration/inference/test_new_feature.py --inference-mode=record
``` ```
### LIVE Mode ### LIVE Mode
Tests make real API calls (but not recorded): Tests make real API calls (not recorded):
```bash ```bash
pytest tests/integration/ --inference-mode=live pytest tests/integration/ --inference-mode=live
``` ```

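The recorder/replayer itself is not part of this diff; conceptually, replay mode keys each cached response on a hash of the request body, which is also why the normalization script earlier in this commit leaves request bodies untouched. A rough sketch of that idea, with the hash algorithm and filename layout as assumptions:

```python
# Conceptual sketch only. The real recorder/replayer lives elsewhere in the
# repo; the hash algorithm and filename layout below are assumptions made to
# illustrate why rewriting a request body would break replay lookups.
import hashlib
import json
from pathlib import Path

def request_key(body: dict) -> str:
    canonical = json.dumps(body, sort_keys=True)         # order-independent form
    return hashlib.sha256(canonical.encode()).hexdigest()[:12]

def find_recording(body: dict, responses_dir: Path) -> Path | None:
    key = request_key(body)
    matches = list(responses_dir.glob(f"*{key}*.json"))  # hash appears in the filename
    return matches[0] if matches else None

# Changing anything in `body` (e.g. a tool_call_id) changes `key`, so the
# previously recorded file would no longer be found in replay mode.
```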
View file

@ -0,0 +1,58 @@
{
"test_id": "tests/integration/agents/test_agents.py::test_custom_tool_infinite_loop[ollama/llama3.2:3b-instruct-fp16]",
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama-guard3:1b",
"messages": [
{
"role": "user",
"content": "Task: Check if there is unsafe content in 'Tool' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes.\nS2: Non-Violent Crimes.\nS3: Sex Crimes.\nS4: Child Exploitation.\nS5: Defamation.\nS6: Specialized Advice.\nS7: Privacy.\nS8: Intellectual Property.\nS9: Indiscriminate Weapons.\nS10: Hate.\nS11: Self-Harm.\nS12: Sexual Content.\nS13: Elections.\nS14: Code Interpreter Abuse.\n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\nUser: Get the boiling point of polyjuice with a tool call.\n\nAssistant: \n\nTool: Error when running tool: get_boiling_point() missing 1 required positional argument: 'liquid_name'\n\n<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST Tool message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories."
}
],
"stream": false,
"temperature": 0.0
},
"endpoint": "/v1/chat/completions",
"model": "llama-guard3:1b"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-000506671ad4",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "safe",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 0,
"model": "llama-guard3:1b",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 2,
"prompt_tokens": 422,
"total_tokens": 424,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@ -28,7 +28,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -43,7 +43,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -54,7 +54,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -69,7 +69,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -80,7 +80,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -95,7 +95,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -106,7 +106,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -121,7 +121,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -132,7 +132,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -147,7 +147,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -158,7 +158,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -173,7 +173,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -184,7 +184,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -199,7 +199,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -210,7 +210,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -225,7 +225,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -236,7 +236,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -251,7 +251,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -262,7 +262,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -277,7 +277,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -288,7 +288,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -303,7 +303,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -314,7 +314,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -329,7 +329,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -340,7 +340,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -355,7 +355,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -366,7 +366,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -381,7 +381,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -392,7 +392,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -407,7 +407,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -418,7 +418,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -433,7 +433,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -444,7 +444,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -459,7 +459,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -470,7 +470,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -485,7 +485,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437810, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -496,7 +496,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -511,7 +511,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437811, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -522,7 +522,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-130", "id": "rec-044dcd8fdeb1",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -537,7 +537,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759437811, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,

View file

@ -0,0 +1,58 @@
{
"test_id": "tests/integration/agents/test_agents.py::test_custom_tool[ollama/llama3.2:3b-instruct-fp16]",
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama-guard3:1b",
"messages": [
{
"role": "user",
"content": "Task: Check if there is unsafe content in 'Tool' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes.\nS2: Non-Violent Crimes.\nS3: Sex Crimes.\nS4: Child Exploitation.\nS5: Defamation.\nS6: Specialized Advice.\nS7: Privacy.\nS8: Intellectual Property.\nS9: Indiscriminate Weapons.\nS10: Hate.\nS11: Self-Harm.\nS12: Sexual Content.\nS13: Elections.\nS14: Code Interpreter Abuse.\n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\nUser: What is the boiling point of the liquid polyjuice in celsius?\n\nAssistant: \n\nTool: Error when running tool: get_boiling_point() got an unexpected keyword argument 'liquid'\n\n<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST Tool message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories."
}
],
"stream": false,
"temperature": 0.0
},
"endpoint": "/v1/chat/completions",
"model": "llama-guard3:1b"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-06fbbb88ed5e",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "safe",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 0,
"model": "llama-guard3:1b",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 2,
"prompt_tokens": 421,
"total_tokens": 423,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@ -73,7 +73,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-67", "id": "rec-4a32ce3da3ce",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -88,7 +88,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759441160, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -99,7 +99,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-67", "id": "rec-4a32ce3da3ce",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -114,7 +114,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759441160, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -125,7 +125,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-67", "id": "rec-4a32ce3da3ce",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -140,7 +140,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759441160, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -151,7 +151,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-67", "id": "rec-4a32ce3da3ce",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -166,7 +166,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759441160, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -177,7 +177,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-67", "id": "rec-4a32ce3da3ce",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -192,7 +192,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759441160, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -203,7 +203,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-67", "id": "rec-4a32ce3da3ce",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -218,7 +218,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759441160, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -229,7 +229,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-67", "id": "rec-4a32ce3da3ce",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -244,7 +244,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759441160, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -255,7 +255,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-67", "id": "rec-4a32ce3da3ce",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -270,7 +270,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759441160, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -281,7 +281,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-67", "id": "rec-4a32ce3da3ce",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -296,7 +296,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759441160, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -307,7 +307,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-67", "id": "rec-4a32ce3da3ce",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -322,7 +322,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759441160, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -333,7 +333,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-67", "id": "rec-4a32ce3da3ce",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -348,7 +348,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759441160, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -359,7 +359,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-67", "id": "rec-4a32ce3da3ce",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -374,7 +374,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759441160, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,
@ -385,7 +385,7 @@
{ {
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk", "__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": { "__data__": {
"id": "chatcmpl-67", "id": "rec-4a32ce3da3ce",
"choices": [ "choices": [
{ {
"delta": { "delta": {
@ -400,7 +400,7 @@
"logprobs": null "logprobs": null
} }
], ],
"created": 1759441161, "created": 0,
"model": "llama3.2:3b-instruct-fp16", "model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk", "object": "chat.completion.chunk",
"service_tier": null, "service_tier": null,

View file

@ -21,7 +21,7 @@
"body": { "body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion", "__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": { "__data__": {
"id": "chatcmpl-912", "id": "rec-b58e35a624b0",
"choices": [ "choices": [
{ {
"finish_reason": "stop", "finish_reason": "stop",
@ -38,7 +38,7 @@
} }
} }
], ],
"created": 1759437811, "created": 0,
"model": "llama-guard3:1b", "model": "llama-guard3:1b",
"object": "chat.completion", "object": "chat.completion",
"service_tier": null, "service_tier": null,

View file

@ -0,0 +1,104 @@
{
"test_id": "tests/integration/agents/test_agents.py::test_create_turn_response[ollama/llama3.2:3b-instruct-fp16-client_tools1]",
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Call get_boiling_point_with_metadata tool and answer What is the boiling point of polyjuice?"
}
],
"max_tokens": 512,
"stream": true,
"temperature": 0.0001,
"tool_choice": "auto",
"tools": [
{
"type": "function",
"function": {
"name": "get_boiling_point_with_metadata",
"description": "Returns the boiling point of a liquid in Celcius or Fahrenheit"
}
}
],
"top_p": 0.9
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-176bcef706a9",
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": [
{
"index": 0,
"id": "call_wxinam9c",
"function": {
"arguments": "{}",
"name": "get_boiling_point_with_metadata"
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-176bcef706a9",
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": "tool_calls",
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
}
],
"is_streaming": true
}
}

View file

@ -0,0 +1,58 @@
{
"test_id": "tests/integration/agents/test_agents.py::test_tool_choice_none[ollama/llama3.2:3b-instruct-fp16]",
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama-guard3:1b",
"messages": [
{
"role": "user",
"content": "Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes.\nS2: Non-Violent Crimes.\nS3: Sex Crimes.\nS4: Child Exploitation.\nS5: Defamation.\nS6: Specialized Advice.\nS7: Privacy.\nS8: Intellectual Property.\nS9: Indiscriminate Weapons.\nS10: Hate.\nS11: Self-Harm.\nS12: Sexual Content.\nS13: Elections.\nS14: Code Interpreter Abuse.\n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\nUser: What is the boiling point of the liquid polyjuice in celsius?\n\n<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST User message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories."
}
],
"stream": false,
"temperature": 0.0
},
"endpoint": "/v1/chat/completions",
"model": "llama-guard3:1b"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "rec-1a0d3109cf92",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "safe",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 0,
"model": "llama-guard3:1b",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": {
"completion_tokens": 2,
"prompt_tokens": 398,
"total_tokens": 400,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@ -0,0 +1,388 @@
{
"request": {
"method": "POST",
"url": "http://0.0.0.0:11434/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "llama3.2:3b-instruct-fp16",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": "Call get_boiling_point tool and answer What is the boiling point of polyjuice?"
},
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"id": "toolcall-1d82e943-0",
"type": "function",
"function": {
"name": "get_boiling_point",
"arguments": "{\"celcius\":null,\"liquid_name\":\"polyjuice\"}"
}
}
]
},
{
"role": "tool",
"tool_call_id": "toolcall-1d82e943-0",
"content": "-212"
}
],
"max_tokens": 512,
"stream": true,
"temperature": 0.0001,
"tool_choice": "auto",
"tools": [
{
"type": "function",
"function": {
"name": "get_boiling_point",
"description": "Returns the boiling point of a liquid in Celcius or Fahrenheit.",
"parameters": {
"type": "object",
"properties": {
"liquid_name": {
"type": "string",
"description": "The name of the liquid"
},
"celcius": {
"type": "boolean",
"description": "Whether to return the boiling point in Celcius"
}
},
"required": [
"liquid_name"
]
}
}
}
],
"top_p": 0.9
},
"endpoint": "/v1/chat/completions",
"model": "llama3.2:3b-instruct-fp16"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": "The",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": " boiling",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": " point",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": " of",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": " poly",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": "ju",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": "ice",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": " is",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": " -",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": "212",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": ".",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "rec-1d82e9439ae3",
"choices": [
{
"delta": {
"content": "",
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": null
},
"finish_reason": "stop",
"index": 0,
"logprobs": null
}
],
"created": 0,
"model": "llama3.2:3b-instruct-fp16",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "fp_ollama",
"usage": null
}
}
],
"is_streaming": true
}
}

Some files were not shown because too many files have changed in this diff