Merge branch 'llamastack:main' into llama_stack_how_to_documentation

Omar Abdelwahab, 2025-10-08 12:44:24 -07:00 (committed by GitHub)
commit c83343de84
GPG key ID: B5690EEEBB952194
1030 changed files with 249861 additions and 104196 deletions

.github/CODEOWNERS
@@ -2,4 +2,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1
* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1 @franciscojavierarceo

.github/TRIAGERS.md
@@ -1,2 +1 @@
# This file documents Triage members in the Llama Stack community
@franciscojavierarceo

@@ -85,8 +85,8 @@ jobs:
cat $run_dir/run.yaml
# avoid line breaks in the server log, especially because we grep it below.
export COLUMNS=1984
nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &
export LLAMA_STACK_LOG_WIDTH=200
nohup uv run llama stack run $run_dir/run.yaml > server.log 2>&1 &
- name: Wait for Llama Stack server to be ready
run: |

@@ -42,18 +42,27 @@ jobs:
run-replay-mode-tests:
runs-on: ubuntu-latest
name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.setup, matrix.python-version, matrix.client-version, matrix.suite) }}
name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.config.setup, matrix.python-version, matrix.client-version, matrix.config.suite) }}
strategy:
fail-fast: false
matrix:
client-type: [library, server]
# Use vllm on weekly schedule, otherwise use test-setup input (defaults to ollama)
setup: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-setup || 'ollama')) }}
# Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
suite: [base, vision]
# Define (setup, suite) pairs - they are always matched and cannot be independent
# Weekly schedule (Sun 1 AM): vllm+base
# Input test-setup=ollama-vision: ollama-vision+vision
# Default (including test-setup=ollama): both ollama+base and ollama-vision+vision
config: >-
${{
github.event.schedule == '1 0 * * 0'
&& fromJSON('[{"setup": "vllm", "suite": "base"}]')
|| github.event.inputs.test-setup == 'ollama-vision'
&& fromJSON('[{"setup": "ollama-vision", "suite": "vision"}]')
|| fromJSON('[{"setup": "ollama", "suite": "base"}, {"setup": "ollama-vision", "suite": "vision"}]')
}}
steps:
- name: Checkout repository
@@ -64,14 +73,14 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
client-version: ${{ matrix.client-version }}
setup: ${{ matrix.setup }}
suite: ${{ matrix.suite }}
setup: ${{ matrix.config.setup }}
suite: ${{ matrix.config.suite }}
inference-mode: 'replay'
- name: Run tests
uses: ./.github/actions/run-and-record-tests
with:
stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || 'server:ci-tests' }}
setup: ${{ matrix.setup }}
setup: ${{ matrix.config.setup }}
inference-mode: 'replay'
suite: ${{ matrix.suite }}
suite: ${{ matrix.config.suite }}

@@ -18,7 +18,7 @@ jobs:
steps:
- name: Check comment author and get PR details
id: check_author
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
@@ -78,7 +78,7 @@ jobs:
- name: React to comment
if: steps.check_author.outputs.authorized == 'true'
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
@@ -91,7 +91,7 @@ jobs:
- name: Comment starting
if: steps.check_author.outputs.authorized == 'true'
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
@@ -189,7 +189,7 @@ jobs:
- name: Comment success with changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'true'
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
@@ -202,7 +202,7 @@ jobs:
- name: Comment success without changes
if: steps.check_author.outputs.authorized == 'true' && steps.changes.outputs.has_changes == 'false' && steps.precommit.outcome == 'success'
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
@@ -215,7 +215,7 @@ jobs:
- name: Comment failure
if: failure()
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |

@@ -112,7 +112,7 @@ jobs:
fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then
echo "Entrypoint is not correct"
exit 1
fi
@@ -150,7 +150,7 @@ jobs:
fi
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[python -m llama_stack.core.server.server /app/run.yaml]" ]; then
if [ "$entrypoint" != "[llama stack run /app/run.yaml]" ]; then
echo "Entrypoint is not correct"
exit 1
fi

@@ -24,7 +24,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Install uv
uses: astral-sh/setup-uv@b75a909f75acd358c2196fb9a5f1299a9a8868a4 # v6.7.0
uses: astral-sh/setup-uv@d0cc045d04ccac9d8b7881df0226f9e82c39688e # v6.8.0
with:
python-version: ${{ matrix.python-version }}
activate-environment: true

@@ -24,7 +24,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Stale Action
uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
with:
stale-issue-label: 'stale'
stale-issue-message: >

@@ -59,7 +59,7 @@ jobs:
# Use the virtual environment created by the build step (name comes from build config)
source ramalama-stack-test/bin/activate
uv pip list
nohup llama stack run tests/external/ramalama-stack/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
nohup llama stack run tests/external/ramalama-stack/run.yaml > server.log 2>&1 &
- name: Wait for Llama Stack server to be ready
run: |

@@ -59,7 +59,7 @@ jobs:
# Use the virtual environment created by the build step (name comes from build config)
source ci-test/bin/activate
uv pip list
nohup llama stack run tests/external/run-byoa.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
nohup llama stack run tests/external/run-byoa.yaml > server.log 2>&1 &
- name: Wait for Llama Stack server to be ready
run: |

@@ -7,7 +7,7 @@
[![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
[![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)
[**Quick Start**](https://llamastack.github.io/latest/getting_started/index.html) | [**Documentation**](https://llamastack.github.io/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
[**Quick Start**](https://llamastack.github.io/docs/getting_started/quickstart) | [**Documentation**](https://llamastack.github.io/docs) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
### ✨🎉 Llama 4 Support 🎉✨

@@ -52,7 +52,7 @@ You can access the HuggingFace trainer via the `starter` distribution:
```bash
llama stack build --distro starter --image-type venv
llama stack run --image-type venv ~/.llama/distributions/starter/starter-run.yaml
llama stack run ~/.llama/distributions/starter/starter-run.yaml
```
### Usage Example

@@ -187,21 +187,21 @@ Configure telemetry behavior using environment variables:
- **`OTEL_SERVICE_NAME`**: Service name for telemetry (default: empty string)
- **`TELEMETRY_SINKS`**: Comma-separated list of sinks (default: `console,sqlite`)
## Visualization with Jaeger
### Quick Setup: Complete Telemetry Stack
The `otel_trace` sink works with any service compatible with the OpenTelemetry collector. Traces and metrics use separate endpoints but can share the same collector.
### Starting Jaeger
Start a Jaeger instance with OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686:
Use the automated setup script to launch the complete telemetry stack (Jaeger, OpenTelemetry Collector, Prometheus, and Grafana):
```bash
docker run --pull always --rm --name jaeger \
-p 16686:16686 -p 4318:4318 \
jaegertracing/jaeger:2.1.0
./scripts/telemetry/setup_telemetry.sh
```
Once running, you can visualize traces by navigating to [http://localhost:16686/](http://localhost:16686/).
This sets up:
- **Jaeger UI**: http://localhost:16686 (traces visualization)
- **Prometheus**: http://localhost:9090 (metrics)
- **Grafana**: http://localhost:3000 (dashboards with auto-configured data sources)
- **OTEL Collector**: http://localhost:4318 (OTLP endpoint)
Once running, you can visualize traces by navigating to [Grafana](http://localhost:3000/) and logging in with username `admin` and password `admin`.
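To export telemetry by hand instead, a minimal sketch (assuming the server honors the standard `OTEL_EXPORTER_OTLP_ENDPOINT` variable; the sink and service-name variables are documented above):
```bash
# Enable the OTLP trace sink alongside the defaults, name the service,
# and point the exporter at the collector started by the setup script.
export TELEMETRY_SINKS=console,sqlite,otel_trace
export OTEL_SERVICE_NAME=my-llama-stack
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318  # assumed OTLP endpoint
llama stack run ~/.llama/distributions/starter/starter-run.yaml
```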
## Querying Metrics

@@ -219,13 +219,10 @@ group_tools = client.tools.list_tools(toolgroup_id="search_tools")
<TabItem value="setup" label="Setup & Configuration">
1. Start by registering a Tavily API key at [Tavily](https://tavily.com/).
2. [Optional] Provide the API key directly to the Llama Stack server
2. [Optional] Set the API key in your environment before starting the Llama Stack server
```bash
export TAVILY_SEARCH_API_KEY="your key"
```
```bash
--env TAVILY_SEARCH_API_KEY=${TAVILY_SEARCH_API_KEY}
```
</TabItem>
<TabItem value="implementation" label="Implementation">
@@ -273,9 +270,9 @@ for log in EventLogger().log(response):
<TabItem value="setup" label="Setup & Configuration">
1. Start by registering for a WolframAlpha API key at [WolframAlpha Developer Portal](https://developer.wolframalpha.com/access).
2. Provide the API key either when starting the Llama Stack server:
2. Provide the API key either by setting it in your environment before starting the Llama Stack server:
```bash
--env WOLFRAM_ALPHA_API_KEY=${WOLFRAM_ALPHA_API_KEY}
export WOLFRAM_ALPHA_API_KEY="your key"
```
or from the client side:
```python

@@ -357,7 +357,7 @@ server:
8. Run the server:
```bash
python -m llama_stack.core.server.server --yaml-config ~/.llama/run-byoa.yaml
llama stack run ~/.llama/run-byoa.yaml
```
9. Test the API:
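As a quick smoke test (a sketch, assuming the default port 8321; your adapter's own routes will differ):
```bash
curl http://localhost:8321/v1/health
```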

@@ -76,7 +76,7 @@ Integration tests are located in [tests/integration](https://github.com/meta-lla
Consult [tests/integration/README.md](https://github.com/meta-llama/llama-stack/blob/main/tests/integration/README.md) for more details on how to run the tests.
Note that each provider's `sample_run_config()` method (in the configuration class for that provider)
typically references some environment variables for specifying API keys and the like. You can set these in the environment or pass these via the `--env` flag to the test command.
typically references some environment variables for specifying API keys and the like. You can set these in the environment before running the test command.
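For example (a sketch; the key name is illustrative, and the exact pytest invocation is documented in the README above):
```bash
# Hypothetical provider key; see the provider's sample_run_config()
# for the variables it actually reads.
export TOGETHER_API_KEY="your-key"
uv run pytest -sv tests/integration/inference
```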
### 2. Unit Testing

@@ -170,7 +170,7 @@ spec:
- name: llama-stack
image: localhost/llama-stack-run-k8s:latest
imagePullPolicy: IfNotPresent
command: ["python", "-m", "llama_stack.core.server.server", "--config", "/app/config.yaml"]
command: ["llama", "stack", "run", "/app/config.yaml"]
ports:
- containerPort: 5000
volumeMounts:

@@ -289,10 +289,10 @@ After this step is successful, you should be able to find the built container im
docker run -d \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-e INFERENCE_MODEL=$INFERENCE_MODEL \
-e OLLAMA_URL=http://host.docker.internal:11434 \
localhost/distribution-ollama:dev \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434
--port $LLAMA_STACK_PORT
```
Here are the docker flags and their uses:
@@ -305,12 +305,12 @@ Here are the docker flags and their uses:
* `localhost/distribution-ollama:dev`: The name and tag of the container image to run
* `-e INFERENCE_MODEL=$INFERENCE_MODEL`: Sets the INFERENCE_MODEL environment variable in the container
* `-e OLLAMA_URL=http://host.docker.internal:11434`: Sets the OLLAMA_URL environment variable in the container
* `--port $LLAMA_STACK_PORT`: Port number for the server to listen on
* `--env INFERENCE_MODEL=$INFERENCE_MODEL`: Sets the model to use for inference
* `--env OLLAMA_URL=http://host.docker.internal:11434`: Configures the URL for the Ollama service
</TabItem>
</Tabs>
@@ -320,23 +320,22 @@ Now, let's start the Llama Stack Distribution Server. You will need the YAML con
```
llama stack run -h
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--env KEY=VALUE]
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME]
[--image-type {venv}] [--enable-ui]
[config | template]
[config | distro]
Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
positional arguments:
config | template Path to config file to use for the run or name of known template (`llama stack list` for a list). (default: None)
config | distro Path to config file to use for the run or name of known distro (`llama stack list` for a list). (default: None)
options:
-h, --help show this help message and exit
--port PORT Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321)
--image-name IMAGE_NAME
Name of the image to run. Defaults to the current environment (default: None)
--env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: None)
[DEPRECATED] This flag is no longer supported. Please activate your virtual environment before running. (default: None)
--image-type {venv}
Image Type used during the build. This should be venv. (default: None)
[DEPRECATED] This flag is no longer supported. Please activate your virtual environment before running. (default: None)
--enable-ui Start the UI server (default: False)
```
@@ -348,9 +347,6 @@ llama stack run tgi
# Start using config file
llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
# Start using a venv
llama stack run --image-type venv ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
```
```
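With `--image-type` deprecated, activate the environment yourself before invoking the CLI (a sketch; the venv name comes from your build config and is illustrative here):
```bash
# Activate the build's virtual environment, then run as usual.
source my-local-stack/bin/activate
llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
```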

@@ -101,7 +101,7 @@ A few things to note:
- The id is a string you can choose freely.
- You can instantiate any number of provider instances of the same type.
- The configuration dictionary is provider-specific.
- Notice that configuration can reference environment variables (with default values), which are expanded at runtime. When you run a stack server (via docker or via `llama stack run`), you can specify `--env OLLAMA_URL=http://my-server:11434` to override the default value.
- Notice that configuration can reference environment variables (with default values), which are expanded at runtime. When you run a stack server, you can set environment variables in your shell before running `llama stack run` to override the default values.
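For example, if a provider's config contains `url: ${env.OLLAMA_URL:=http://localhost:11434}` (assuming the default-value form of the substitution syntax covered below), you can override it from the shell:
```bash
# Export the variable before starting the server; the YAML default
# applies only when OLLAMA_URL is unset.
export OLLAMA_URL=http://my-server:11434
llama stack run run.yaml
```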
### Environment Variable Substitution
@@ -173,13 +173,10 @@ optional_token: ${env.OPTIONAL_TOKEN:+}
#### Runtime Override
You can override environment variables at runtime when starting the server:
You can override environment variables at runtime by setting them in your shell before starting the server:
```bash
# Override specific environment variables
llama stack run --config run.yaml --env API_KEY=sk-123 --env BASE_URL=https://custom-api.com
# Or set them in your shell
# Set environment variables in your shell
export API_KEY=sk-123
export BASE_URL=https://custom-api.com
llama stack run --config run.yaml

@@ -52,7 +52,7 @@ spec:
value: "${SAFETY_MODEL}"
- name: TAVILY_SEARCH_API_KEY
value: "${TAVILY_SEARCH_API_KEY}"
command: ["python", "-m", "llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8321"]
command: ["llama", "stack", "run", "/etc/config/stack_run_config.yaml", "--port", "8321"]
ports:
- containerPort: 8321
volumeMounts:

@@ -69,10 +69,10 @@ docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
-e WATSONX_API_KEY=$WATSONX_API_KEY \
-e WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
-e WATSONX_BASE_URL=$WATSONX_BASE_URL \
llamastack/distribution-watsonx \
--config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env WATSONX_API_KEY=$WATSONX_API_KEY \
--env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \
--env WATSONX_BASE_URL=$WATSONX_BASE_URL
--port $LLAMA_STACK_PORT
```

@@ -129,11 +129,11 @@ docker run -it \
# NOTE: mount the llama-stack / llama-model directories if testing local changes else not needed
-v $HOME/git/llama-stack:/app/llama-stack-source -v $HOME/git/llama-models:/app/llama-models-source \
# localhost/distribution-dell:dev if building / testing locally
llamastack/distribution-dell\
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env CHROMA_URL=$CHROMA_URL
-e INFERENCE_MODEL=$INFERENCE_MODEL \
-e DEH_URL=$DEH_URL \
-e CHROMA_URL=$CHROMA_URL \
llamastack/distribution-dell \
--port $LLAMA_STACK_PORT
```
@@ -154,14 +154,14 @@ docker run \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v $HOME/.llama:/root/.llama \
-v ./llama_stack/distributions/tgi/run-with-safety.yaml:/root/my-run.yaml \
-e INFERENCE_MODEL=$INFERENCE_MODEL \
-e DEH_URL=$DEH_URL \
-e SAFETY_MODEL=$SAFETY_MODEL \
-e DEH_SAFETY_URL=$DEH_SAFETY_URL \
-e CHROMA_URL=$CHROMA_URL \
llamastack/distribution-dell \
--config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env DEH_SAFETY_URL=$DEH_SAFETY_URL \
--env CHROMA_URL=$CHROMA_URL
--port $LLAMA_STACK_PORT
```
### Via venv
@@ -170,21 +170,21 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
```bash
llama stack build --distro dell --image-type venv
llama stack run dell
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env CHROMA_URL=$CHROMA_URL
INFERENCE_MODEL=$INFERENCE_MODEL \
DEH_URL=$DEH_URL \
CHROMA_URL=$CHROMA_URL \
llama stack run dell \
--port $LLAMA_STACK_PORT
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
INFERENCE_MODEL=$INFERENCE_MODEL \
DEH_URL=$DEH_URL \
SAFETY_MODEL=$SAFETY_MODEL \
DEH_SAFETY_URL=$DEH_SAFETY_URL \
CHROMA_URL=$CHROMA_URL \
llama stack run ./run-with-safety.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env DEH_SAFETY_URL=$DEH_SAFETY_URL \
--env CHROMA_URL=$CHROMA_URL
--port $LLAMA_STACK_PORT
```

@@ -84,9 +84,9 @@ docker run \
--gpu all \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
llamastack/distribution-meta-reference-gpu \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
--port $LLAMA_STACK_PORT
```
If you are using Llama Stack Safety / Shield APIs, use:
@@ -98,10 +98,10 @@ docker run \
--gpu all \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-e SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
llamastack/distribution-meta-reference-gpu \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
--port $LLAMA_STACK_PORT
```
### Via venv
@@ -110,16 +110,16 @@ Make sure you have done `uv pip install llama-stack` and have the Llama Stack CL
```bash
llama stack build --distro meta-reference-gpu --image-type venv
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
llama stack run distributions/meta-reference-gpu/run.yaml \
--port 8321 \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
--port 8321
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
--port 8321 \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
--port 8321
```

@@ -129,10 +129,10 @@ docker run \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
-e NVIDIA_API_KEY=$NVIDIA_API_KEY \
llamastack/distribution-nvidia \
--config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
--port $LLAMA_STACK_PORT
```
### Via venv
@@ -142,10 +142,10 @@ If you've set up your local development environment, you can also build the imag
```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
llama stack build --distro nvidia --image-type venv
NVIDIA_API_KEY=$NVIDIA_API_KEY \
INFERENCE_MODEL=$INFERENCE_MODEL \
llama stack run ./run.yaml \
--port 8321 \
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
--env INFERENCE_MODEL=$INFERENCE_MODEL
--port 8321
```
## Example Notebooks

@@ -86,9 +86,9 @@ docker run -it \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-e OLLAMA_URL=http://host.docker.internal:11434 \
llamastack/distribution-starter \
--port $LLAMA_STACK_PORT \
--env OLLAMA_URL=http://host.docker.internal:11434
--port $LLAMA_STACK_PORT
```
Note: to start the container with Podman, you can do the same but replace `docker` at the start of the command with
`podman`. If you are using `podman` older than `4.7.0`, please also replace `host.docker.internal` in the `OLLAMA_URL`
@@ -106,9 +106,9 @@ docker run -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
--network=host \
-e OLLAMA_URL=http://localhost:11434 \
llamastack/distribution-starter \
--port $LLAMA_STACK_PORT \
--env OLLAMA_URL=http://localhost:11434
--port $LLAMA_STACK_PORT
```
:::
You will see output like below:

@@ -1,4 +1,7 @@
---
description: "Files
This API is used to upload documents that can be used with other Llama Stack APIs."
sidebar_label: Files
title: Files
---
@@ -7,4 +10,8 @@ title: Files
## Overview
Files
This API is used to upload documents that can be used with other Llama Stack APIs.
This section contains documentation for all available providers for the **files** API.

@@ -1,5 +1,7 @@
---
description: "Llama Stack Inference API for generating completions, chat completions, and embeddings.
description: "Inference
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
@@ -12,7 +14,9 @@ title: Inference
## Overview
Llama Stack Inference API for generating completions, chat completions, and embeddings.
Inference
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.

@@ -15,6 +15,7 @@ Anthropic inference provider for accessing Claude models and Anthropic's AI serv
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `str \| None` | No | | API key for Anthropic models |
## Sample Configuration

@@ -22,6 +22,7 @@ https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `<class 'pydantic.types.SecretStr'>` | No | | Azure API key for Azure |
| `api_base` | `<class 'pydantic.networks.HttpUrl'>` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) |
| `api_version` | `str \| None` | No | | Azure API version for Azure (e.g., 2024-12-01-preview) |

@@ -15,6 +15,7 @@ AWS Bedrock inference provider for accessing various AI models through AWS's man
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `aws_access_key_id` | `str \| None` | No | | The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID |
| `aws_secret_access_key` | `str \| None` | No | | The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY |
| `aws_session_token` | `str \| None` | No | | The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN |

@@ -15,6 +15,7 @@ Cerebras inference provider for running models on Cerebras Cloud platform.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `base_url` | `<class 'str'>` | No | https://api.cerebras.ai | Base URL for the Cerebras API |
| `api_key` | `<class 'pydantic.types.SecretStr'>` | No | | Cerebras API Key |

@@ -15,7 +15,8 @@ Databricks inference provider for running models on Databricks' unified analytic
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `url` | `<class 'str'>` | No | | The URL for the Databricks model serving endpoint |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `str \| None` | No | | The URL for the Databricks model serving endpoint |
| `api_token` | `<class 'pydantic.types.SecretStr'>` | No | | The Databricks API token |
## Sample Configuration

@@ -15,6 +15,7 @@ Fireworks AI inference provider for Llama models and other AI models on the Fire
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks server |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The Fireworks.ai API Key |

@@ -15,6 +15,7 @@ Google Gemini inference provider for accessing Gemini models and Google's AI ser
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `str \| None` | No | | API key for Gemini models |
## Sample Configuration

@@ -15,6 +15,7 @@ Groq inference provider for ultra-fast inference using Groq's LPU technology.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `str \| None` | No | | The Groq API key |
| `url` | `<class 'str'>` | No | https://api.groq.com | The URL for the Groq AI server |

@@ -15,6 +15,7 @@ Llama OpenAI-compatible provider for using Llama models with OpenAI API format.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `str \| None` | No | | The Llama API key |
| `openai_compat_api_base` | `<class 'str'>` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |

@@ -15,6 +15,7 @@ NVIDIA inference provider for accessing NVIDIA NIM models and AI services.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | https://integrate.api.nvidia.com | A base url for accessing the NVIDIA NIM |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The NVIDIA API key, only needed if using the hosted service |
| `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |

@@ -15,8 +15,8 @@ Ollama inference provider for running local models through the Ollama runtime.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | http://localhost:11434 | |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically |
## Sample Configuration

@@ -15,6 +15,7 @@ OpenAI inference provider for accessing GPT models and other OpenAI services.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `str \| None` | No | | API key for OpenAI models |
| `base_url` | `<class 'str'>` | No | https://api.openai.com/v1 | Base URL for OpenAI API |

@@ -15,6 +15,7 @@ Passthrough inference provider for connecting to any external inference service
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | | The URL for the passthrough endpoint |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | API Key for the passthrough endpoint |

@@ -15,6 +15,7 @@ RunPod inference provider for running models on RunPod's cloud GPU platform.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `str \| None` | No | | The URL for the Runpod model serving endpoint |
| `api_token` | `str \| None` | No | | The API token |

@@ -15,6 +15,7 @@ SambaNova inference provider for running models on SambaNova's dataflow architec
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The SambaNova cloud API Key |

@@ -15,6 +15,7 @@ Text Generation Inference (TGI) provider for HuggingFace model serving.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | | The URL for the TGI serving endpoint |
## Sample Configuration

@@ -15,6 +15,7 @@ Together AI inference provider for open-source models and collaborative AI devel
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | https://api.together.xyz/v1 | The URL for the Together AI server |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The Together AI API Key |

@@ -54,6 +54,7 @@ Available Models:
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `project` | `<class 'str'>` | No | | Google Cloud project ID for Vertex AI |
| `location` | `<class 'str'>` | No | us-central1 | Google Cloud location for Vertex AI |

@@ -15,11 +15,11 @@ Remote vLLM inference provider for connecting to vLLM servers.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `str \| None` | No | | The URL for the vLLM model serving endpoint |
| `max_tokens` | `<class 'int'>` | No | 4096 | Maximum number of tokens to generate. |
| `api_token` | `str \| None` | No | fake | The API token |
| `tls_verify` | `bool \| str` | No | True | Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically |
## Sample Configuration

@@ -15,9 +15,10 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The watsonx API key |
| `project_id` | `str \| None` | No | | The Project ID key |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The watsonx.ai API key |
| `project_id` | `str \| None` | No | | The watsonx.ai project ID |
| `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |
## Sample Configuration

@@ -1,4 +1,7 @@
---
description: "Safety
OpenAI-compatible Moderations API."
sidebar_label: Safety
title: Safety
---
@@ -7,4 +10,8 @@ title: Safety
## Overview
Safety
OpenAI-compatible Moderations API.
This section contains documentation for all available providers for the **safety** API.

@@ -15,6 +15,7 @@ AWS Bedrock safety provider for content moderation using AWS's safety services.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `aws_access_key_id` | `str \| None` | No | | The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID |
| `aws_secret_access_key` | `str \| None` | No | | The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY |
| `aws_session_token` | `str \| None` | No | | The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN |

@@ -123,12 +123,12 @@
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
"\n",
"# this command installs all the dependencies needed for the llama stack server with the together inference provider\n",
"!uv run --with llama-stack llama stack build --distro together --image-type venv\n",
"!uv run --with llama-stack llama stack build --distro together\n",
"\n",
"def run_llama_stack_server_background():\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
" \"uv run --with llama-stack llama stack run together --image-type venv\",\n",
" \"uv run --with llama-stack llama stack run together\",\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",

@@ -233,12 +233,12 @@
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
"\n",
"# this command installs all the dependencies needed for the llama stack server\n",
"!uv run --with llama-stack llama stack build --distro meta-reference-gpu --image-type venv\n",
"!uv run --with llama-stack llama stack build --distro meta-reference-gpu\n",
"\n",
"def run_llama_stack_server_background():\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
" f\"uv run --with llama-stack llama stack run meta-reference-gpu --image-type venv --env INFERENCE_MODEL={model_id}\",\n",
" f\"INFERENCE_MODEL={model_id} uv run --with llama-stack llama stack run meta-reference-gpu\",\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",

@@ -223,12 +223,12 @@
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
"\n",
"# this command installs all the dependencies needed for the llama stack server\n",
"!uv run --with llama-stack llama stack build --distro llama_api --image-type venv\n",
"!uv run --with llama-stack llama stack build --distro llama_api\n",
"\n",
"def run_llama_stack_server_background():\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
" \"uv run --with llama-stack llama stack run llama_api --image-type venv\",\n",
" \"uv run --with llama-stack llama stack run llama_api\",\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",

@@ -145,12 +145,12 @@
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
"\n",
"# this command installs all the dependencies needed for the llama stack server with the ollama inference provider\n",
"!uv run --with llama-stack llama stack build --distro starter --image-type venv\n",
"!uv run --with llama-stack llama stack build --distro starter\n",
"\n",
"def run_llama_stack_server_background():\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
" f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter --image-type venv\n",
" f\"OLLAMA_URL=http://localhost:11434 uv run --with llama-stack llama stack run starter\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",

@@ -1443,8 +1443,8 @@
"tags": [
"Inference"
],
"summary": "List all chat completions.",
"description": "List all chat completions.",
"summary": "List chat completions.",
"description": "List chat completions.",
"parameters": [
{
"name": "after",
@@ -1520,8 +1520,8 @@
"tags": [
"Inference"
],
"summary": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
"description": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
"summary": "Create chat completions.",
"description": "Create chat completions.\nGenerate an OpenAI-compatible chat completion for the given messages using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@@ -1565,8 +1565,8 @@
"tags": [
"Inference"
],
"summary": "Describe a chat completion by its ID.",
"description": "Describe a chat completion by its ID.",
"summary": "Get chat completion.",
"description": "Get chat completion.\nDescribe a chat completion by its ID.",
"parameters": [
{
"name": "completion_id",
@@ -1610,8 +1610,8 @@
"tags": [
"Inference"
],
"summary": "Generate an OpenAI-compatible completion for the given prompt using the specified model.",
"description": "Generate an OpenAI-compatible completion for the given prompt using the specified model.",
"summary": "Create completion.",
"description": "Create completion.\nGenerate an OpenAI-compatible completion for the given prompt using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@@ -1655,8 +1655,8 @@
"tags": [
"Inference"
],
"summary": "Generate OpenAI-compatible embeddings for the given input using the specified model.",
"description": "Generate OpenAI-compatible embeddings for the given input using the specified model.",
"summary": "Create embeddings.",
"description": "Create embeddings.\nGenerate OpenAI-compatible embeddings for the given input using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@@ -1700,8 +1700,8 @@
"tags": [
"Files"
],
"summary": "Returns a list of files that belong to the user's organization.",
"description": "Returns a list of files that belong to the user's organization.",
"summary": "List files.",
"description": "List files.\nReturns a list of files that belong to the user's organization.",
"parameters": [
{
"name": "after",
@@ -1770,8 +1770,8 @@
"tags": [
"Files"
],
"summary": "Upload a file that can be used across various endpoints.",
"description": "Upload a file that can be used across various endpoints.\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.",
"summary": "Upload file.",
"description": "Upload file.\nUpload a file that can be used across various endpoints.\n\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.",
"parameters": [],
"requestBody": {
"content": {
@@ -1831,8 +1831,8 @@
"tags": [
"Files"
],
"summary": "Returns information about a specific file.",
"description": "Returns information about a specific file.",
"summary": "Retrieve file.",
"description": "Retrieve file.\nReturns information about a specific file.",
"parameters": [
{
"name": "file_id",
@@ -1874,8 +1874,8 @@
"tags": [
"Files"
],
"summary": "Delete a file.",
"description": "Delete a file.",
"summary": "Delete file.",
"description": "Delete file.",
"parameters": [
{
"name": "file_id",
@@ -1919,8 +1919,8 @@
"tags": [
"Files"
],
"summary": "Returns the contents of the specified file.",
"description": "Returns the contents of the specified file.",
"summary": "Retrieve file content.",
"description": "Retrieve file content.\nReturns the contents of the specified file.",
"parameters": [
{
"name": "file_id",
@@ -1999,8 +1999,8 @@
"tags": [
"Safety"
],
"summary": "Classifies if text and/or image inputs are potentially harmful.",
"description": "Classifies if text and/or image inputs are potentially harmful.",
"summary": "Create moderation.",
"description": "Create moderation.\nClassifies if text and/or image inputs are potentially harmful.",
"parameters": [],
"requestBody": {
"content": {
@@ -2044,8 +2044,8 @@
"tags": [
"Agents"
],
"summary": "List all OpenAI responses.",
"description": "List all OpenAI responses.",
"summary": "List all responses.",
"description": "List all responses.",
"parameters": [
{
"name": "after",
@@ -2119,8 +2119,8 @@
"tags": [
"Agents"
],
"summary": "Create a new OpenAI response.",
"description": "Create a new OpenAI response.",
"summary": "Create a model response.",
"description": "Create a model response.",
"parameters": [],
"requestBody": {
"content": {
@@ -2184,8 +2184,8 @@
"tags": [
"Agents"
],
"summary": "Retrieve an OpenAI response by its ID.",
"description": "Retrieve an OpenAI response by its ID.",
"summary": "Get a model response.",
"description": "Get a model response.",
"parameters": [
{
"name": "response_id",
@@ -2227,8 +2227,8 @@
"tags": [
"Agents"
],
"summary": "Delete an OpenAI response by its ID.",
"description": "Delete an OpenAI response by its ID.",
"summary": "Delete a response.",
"description": "Delete a response.",
"parameters": [
{
"name": "response_id",
@@ -2272,8 +2272,8 @@
"tags": [
"Agents"
],
"summary": "List input items for a given OpenAI response.",
"description": "List input items for a given OpenAI response.",
"summary": "List input items.",
"description": "List input items.",
"parameters": [
{
"name": "response_id",
@@ -13366,12 +13366,13 @@
},
{
"name": "Files",
"description": ""
"description": "This API is used to upload documents that can be used with other Llama Stack APIs.",
"x-displayName": "Files"
},
{
"name": "Inference",
"description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Llama Stack Inference API for generating completions, chat completions, and embeddings."
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Inference"
},
{
"name": "Models",
@@ -13383,7 +13384,8 @@
},
{
"name": "Safety",
"description": ""
"description": "OpenAI-compatible Moderations API.",
"x-displayName": "Safety"
},
{
"name": "Telemetry",

@@ -1033,8 +1033,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: List all chat completions.
description: List all chat completions.
summary: List chat completions.
description: List chat completions.
parameters:
- name: after
in: query
@@ -1087,10 +1087,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate an OpenAI-compatible chat completion for the given messages using
the specified model.
summary: Create chat completions.
description: >-
Create chat completions.
Generate an OpenAI-compatible chat completion for the given messages using
the specified model.
parameters: []
@@ -1122,8 +1122,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: Describe a chat completion by its ID.
description: Describe a chat completion by its ID.
summary: Get chat completion.
description: >-
Get chat completion.
Describe a chat completion by its ID.
parameters:
- name: completion_id
in: path
@@ -1153,10 +1156,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate an OpenAI-compatible completion for the given prompt using the specified
model.
summary: Create completion.
description: >-
Create completion.
Generate an OpenAI-compatible completion for the given prompt using the specified
model.
parameters: []
@@ -1189,10 +1192,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate OpenAI-compatible embeddings for the given input using the specified
model.
summary: Create embeddings.
description: >-
Create embeddings.
Generate OpenAI-compatible embeddings for the given input using the specified
model.
parameters: []
@@ -1225,9 +1228,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns a list of files that belong to the user's organization.
summary: List files.
description: >-
List files.
Returns a list of files that belong to the user's organization.
parameters:
- name: after
@@ -1285,11 +1289,13 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Upload a file that can be used across various endpoints.
summary: Upload file.
description: >-
Upload file.
Upload a file that can be used across various endpoints.
The file upload should be a multipart form request with:
- file: The File object (not file name) to be uploaded.
@@ -1338,9 +1344,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns information about a specific file.
summary: Retrieve file.
description: >-
Retrieve file.
Returns information about a specific file.
parameters:
- name: file_id
@@ -1372,8 +1379,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: Delete a file.
description: Delete a file.
summary: Delete file.
description: Delete file.
parameters:
- name: file_id
in: path
@@ -1405,9 +1412,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns the contents of the specified file.
summary: Retrieve file content.
description: >-
Retrieve file content.
Returns the contents of the specified file.
parameters:
- name: file_id
@@ -1464,9 +1472,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Safety
summary: >-
Classifies if text and/or image inputs are potentially harmful.
summary: Create moderation.
description: >-
Create moderation.
Classifies if text and/or image inputs are potentially harmful.
parameters: []
requestBody:
@@ -1497,8 +1506,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: List all OpenAI responses.
description: List all OpenAI responses.
summary: List all responses.
description: List all responses.
parameters:
- name: after
in: query
@@ -1549,8 +1558,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Create a new OpenAI response.
description: Create a new OpenAI response.
summary: Create a model response.
description: Create a model response.
parameters: []
requestBody:
content:
@@ -1592,8 +1601,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Retrieve an OpenAI response by its ID.
description: Retrieve an OpenAI response by its ID.
summary: Get a model response.
description: Get a model response.
parameters:
- name: response_id
in: path
@@ -1623,8 +1632,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Delete an OpenAI response by its ID.
description: Delete an OpenAI response by its ID.
summary: Delete a response.
description: Delete a response.
parameters:
- name: response_id
in: path
@@ -1654,10 +1663,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: >-
List input items for a given OpenAI response.
description: >-
List input items for a given OpenAI response.
summary: List input items.
description: List input items.
parameters:
- name: response_id
in: path
@@ -10011,9 +10018,16 @@ tags:
x-displayName: >-
Llama Stack Evaluation API for running evaluations on model and agent candidates.
- name: Files
description: ''
description: >-
This API is used to upload documents that can be used with other Llama Stack
APIs.
x-displayName: Files
- name: Inference
description: >-
Llama Stack Inference API for generating completions, chat completions, and
embeddings.
This API provides the raw interface to the underlying models. Two kinds of models
are supported:
@@ -10021,15 +10035,14 @@ tags:
- Embedding models: these models generate embeddings to be used for semantic
search.
x-displayName: >-
Llama Stack Inference API for generating completions, chat completions, and
embeddings.
x-displayName: Inference
- name: Models
description: ''
- name: PostTraining (Coming Soon)
description: ''
- name: Safety
description: ''
description: OpenAI-compatible Moderations API.
x-displayName: Safety
- name: Telemetry
description: ''
- name: VectorIO

@@ -69,8 +69,8 @@
"tags": [
"Inference"
],
"summary": "List all chat completions.",
"description": "List all chat completions.",
"summary": "List chat completions.",
"description": "List chat completions.",
"parameters": [
{
"name": "after",
@@ -146,8 +146,8 @@
"tags": [
"Inference"
],
"summary": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
"description": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
"summary": "Create chat completions.",
"description": "Create chat completions.\nGenerate an OpenAI-compatible chat completion for the given messages using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@@ -191,8 +191,8 @@
"tags": [
"Inference"
],
"summary": "Describe a chat completion by its ID.",
"description": "Describe a chat completion by its ID.",
"summary": "Get chat completion.",
"description": "Get chat completion.\nDescribe a chat completion by its ID.",
"parameters": [
{
"name": "completion_id",
@@ -236,8 +236,8 @@
"tags": [
"Inference"
],
"summary": "Generate an OpenAI-compatible completion for the given prompt using the specified model.",
"description": "Generate an OpenAI-compatible completion for the given prompt using the specified model.",
"summary": "Create completion.",
"description": "Create completion.\nGenerate an OpenAI-compatible completion for the given prompt using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -758,8 +758,8 @@
"tags": [
"Inference"
],
"summary": "Generate OpenAI-compatible embeddings for the given input using the specified model.",
"description": "Generate OpenAI-compatible embeddings for the given input using the specified model.",
"summary": "Create embeddings.",
"description": "Create embeddings.\nGenerate OpenAI-compatible embeddings for the given input using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -803,8 +803,8 @@
"tags": [
"Files"
],
"summary": "Returns a list of files that belong to the user's organization.",
"description": "Returns a list of files that belong to the user's organization.",
"summary": "List files.",
"description": "List files.\nReturns a list of files that belong to the user's organization.",
"parameters": [
{
"name": "after",
@ -873,8 +873,8 @@
"tags": [
"Files"
],
"summary": "Upload a file that can be used across various endpoints.",
"description": "Upload a file that can be used across various endpoints.\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.",
"summary": "Upload file.",
"description": "Upload file.\nUpload a file that can be used across various endpoints.\n\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.",
"parameters": [],
"requestBody": {
"content": {
@ -934,8 +934,8 @@
"tags": [
"Files"
],
"summary": "Returns information about a specific file.",
"description": "Returns information about a specific file.",
"summary": "Retrieve file.",
"description": "Retrieve file.\nReturns information about a specific file.",
"parameters": [
{
"name": "file_id",
@ -977,8 +977,8 @@
"tags": [
"Files"
],
"summary": "Delete a file.",
"description": "Delete a file.",
"summary": "Delete file.",
"description": "Delete file.",
"parameters": [
{
"name": "file_id",
@ -1022,8 +1022,8 @@
"tags": [
"Files"
],
"summary": "Returns the contents of the specified file.",
"description": "Returns the contents of the specified file.",
"summary": "Retrieve file content.",
"description": "Retrieve file content.\nReturns the contents of the specified file.",
"parameters": [
{
"name": "file_id",
@ -1067,8 +1067,8 @@
"tags": [
"Inspect"
],
"summary": "Get the current health status of the service.",
"description": "Get the current health status of the service.",
"summary": "Get health status.",
"description": "Get health status.\nGet the current health status of the service.",
"parameters": [],
"deprecated": false
}
@ -1102,8 +1102,8 @@
"tags": [
"Inspect"
],
"summary": "List all available API routes with their methods and implementing providers.",
"description": "List all available API routes with their methods and implementing providers.",
"summary": "List routes.",
"description": "List routes.\nList all available API routes with their methods and implementing providers.",
"parameters": [],
"deprecated": false
}
@ -1170,8 +1170,8 @@
"tags": [
"Models"
],
"summary": "Register a model.",
"description": "Register a model.",
"summary": "Register model.",
"description": "Register model.\nRegister a model.",
"parameters": [],
"requestBody": {
"content": {
@ -1215,8 +1215,8 @@
"tags": [
"Models"
],
"summary": "Get a model by its identifier.",
"description": "Get a model by its identifier.",
"summary": "Get model.",
"description": "Get model.\nGet a model by its identifier.",
"parameters": [
{
"name": "model_id",
@ -1251,8 +1251,8 @@
"tags": [
"Models"
],
"summary": "Unregister a model.",
"description": "Unregister a model.",
"summary": "Unregister model.",
"description": "Unregister model.\nUnregister a model.",
"parameters": [
{
"name": "model_id",
@ -1296,8 +1296,8 @@
"tags": [
"Safety"
],
"summary": "Classifies if text and/or image inputs are potentially harmful.",
"description": "Classifies if text and/or image inputs are potentially harmful.",
"summary": "Create moderation.",
"description": "Create moderation.\nClassifies if text and/or image inputs are potentially harmful.",
"parameters": [],
"requestBody": {
"content": {
@ -1374,8 +1374,8 @@
"tags": [
"Prompts"
],
"summary": "Create a new prompt.",
"description": "Create a new prompt.",
"summary": "Create prompt.",
"description": "Create prompt.\nCreate a new prompt.",
"parameters": [],
"requestBody": {
"content": {
@ -1419,8 +1419,8 @@
"tags": [
"Prompts"
],
"summary": "Get a prompt by its identifier and optional version.",
"description": "Get a prompt by its identifier and optional version.",
"summary": "Get prompt.",
"description": "Get prompt.\nGet a prompt by its identifier and optional version.",
"parameters": [
{
"name": "prompt_id",
@ -1471,8 +1471,8 @@
"tags": [
"Prompts"
],
"summary": "Update an existing prompt (increments version).",
"description": "Update an existing prompt (increments version).",
"summary": "Update prompt.",
"description": "Update prompt.\nUpdate an existing prompt (increments version).",
"parameters": [
{
"name": "prompt_id",
@ -1517,8 +1517,8 @@
"tags": [
"Prompts"
],
"summary": "Delete a prompt.",
"description": "Delete a prompt.",
"summary": "Delete prompt.",
"description": "Delete prompt.\nDelete a prompt.",
"parameters": [
{
"name": "prompt_id",
@ -1562,8 +1562,8 @@
"tags": [
"Prompts"
],
"summary": "Set which version of a prompt should be the default in get_prompt (latest).",
"description": "Set which version of a prompt should be the default in get_prompt (latest).",
"summary": "Set prompt version.",
"description": "Set prompt version.\nSet which version of a prompt should be the default in get_prompt (latest).",
"parameters": [
{
"name": "prompt_id",
@ -1617,8 +1617,8 @@
"tags": [
"Prompts"
],
"summary": "List all versions of a specific prompt.",
"description": "List all versions of a specific prompt.",
"summary": "List prompt versions.",
"description": "List prompt versions.\nList all versions of a specific prompt.",
"parameters": [
{
"name": "prompt_id",
@ -1662,8 +1662,8 @@
"tags": [
"Providers"
],
"summary": "List all available providers.",
"description": "List all available providers.",
"summary": "List providers.",
"description": "List providers.\nList all available providers.",
"parameters": [],
"deprecated": false
}
@ -1697,8 +1697,8 @@
"tags": [
"Providers"
],
"summary": "Get detailed information about a specific provider.",
"description": "Get detailed information about a specific provider.",
"summary": "Get provider.",
"description": "Get provider.\nGet detailed information about a specific provider.",
"parameters": [
{
"name": "provider_id",
@ -1742,8 +1742,8 @@
"tags": [
"Agents"
],
"summary": "List all OpenAI responses.",
"description": "List all OpenAI responses.",
"summary": "List all responses.",
"description": "List all responses.",
"parameters": [
{
"name": "after",
@ -1817,8 +1817,8 @@
"tags": [
"Agents"
],
"summary": "Create a new OpenAI response.",
"description": "Create a new OpenAI response.",
"summary": "Create a model response.",
"description": "Create a model response.",
"parameters": [],
"requestBody": {
"content": {
@ -1882,8 +1882,8 @@
"tags": [
"Agents"
],
"summary": "Retrieve an OpenAI response by its ID.",
"description": "Retrieve an OpenAI response by its ID.",
"summary": "Get a model response.",
"description": "Get a model response.",
"parameters": [
{
"name": "response_id",
@ -1925,8 +1925,8 @@
"tags": [
"Agents"
],
"summary": "Delete an OpenAI response by its ID.",
"description": "Delete an OpenAI response by its ID.",
"summary": "Delete a response.",
"description": "Delete a response.",
"parameters": [
{
"name": "response_id",
@ -1970,8 +1970,8 @@
"tags": [
"Agents"
],
"summary": "List input items for a given OpenAI response.",
"description": "List input items for a given OpenAI response.",
"summary": "List input items.",
"description": "List input items.",
"parameters": [
{
"name": "response_id",
@ -2063,8 +2063,8 @@
"tags": [
"Safety"
],
"summary": "Run a shield.",
"description": "Run a shield.",
"summary": "Run shield.",
"description": "Run shield.\nRun a shield.",
"parameters": [],
"requestBody": {
"content": {
@ -4196,8 +4196,8 @@
"tags": [
"Inspect"
],
"summary": "Get the version of the service.",
"description": "Get the version of the service.",
"summary": "Get version.",
"description": "Get version.\nGet the version of the service.",
"parameters": [],
"deprecated": false
}
@ -12914,16 +12914,18 @@
},
{
"name": "Files",
"description": ""
"description": "This API is used to upload documents that can be used with other Llama Stack APIs.",
"x-displayName": "Files"
},
{
"name": "Inference",
"description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Llama Stack Inference API for generating completions, chat completions, and embeddings."
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Inference"
},
{
"name": "Inspect",
"description": ""
"description": "APIs for inspecting the Llama Stack service, including health status, available API routes with methods and implementing providers.",
"x-displayName": "Inspect"
},
{
"name": "Models",
@ -12931,17 +12933,18 @@
},
{
"name": "Prompts",
"description": "",
"x-displayName": "Protocol for prompt management operations."
"description": "Protocol for prompt management operations.",
"x-displayName": "Prompts"
},
{
"name": "Providers",
"description": "",
"x-displayName": "Providers API for inspecting, listing, and modifying providers and their configurations."
"description": "Providers API for inspecting, listing, and modifying providers and their configurations.",
"x-displayName": "Providers"
},
{
"name": "Safety",
"description": ""
"description": "OpenAI-compatible Moderations API.",
"x-displayName": "Safety"
},
{
"name": "Scoring",
View file
@ -33,8 +33,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: List all chat completions.
description: List all chat completions.
summary: List chat completions.
description: List chat completions.
parameters:
- name: after
in: query
@ -87,10 +87,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate an OpenAI-compatible chat completion for the given messages using
the specified model.
summary: Create chat completions.
description: >-
Create chat completions.
Generate an OpenAI-compatible chat completion for the given messages using
the specified model.
parameters: []
@ -122,8 +122,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: Describe a chat completion by its ID.
description: Describe a chat completion by its ID.
summary: Get chat completion.
description: >-
Get chat completion.
Describe a chat completion by its ID.
parameters:
- name: completion_id
in: path
@ -153,10 +156,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate an OpenAI-compatible completion for the given prompt using the specified
model.
summary: Create completion.
description: >-
Create completion.
Generate an OpenAI-compatible completion for the given prompt using the specified
model.
parameters: []
@ -603,10 +606,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate OpenAI-compatible embeddings for the given input using the specified
model.
summary: Create embeddings.
description: >-
Create embeddings.
Generate OpenAI-compatible embeddings for the given input using the specified
model.
parameters: []
@ -639,9 +642,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns a list of files that belong to the user's organization.
summary: List files.
description: >-
List files.
Returns a list of files that belong to the user's organization.
parameters:
- name: after
@ -699,11 +703,13 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Upload a file that can be used across various endpoints.
summary: Upload file.
description: >-
Upload file.
Upload a file that can be used across various endpoints.
The file upload should be a multipart form request with:
- file: The File object (not file name) to be uploaded.
@ -752,9 +758,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns information about a specific file.
summary: Retrieve file.
description: >-
Retrieve file.
Returns information about a specific file.
parameters:
- name: file_id
@ -786,8 +793,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: Delete a file.
description: Delete a file.
summary: Delete file.
description: Delete file.
parameters:
- name: file_id
in: path
@ -819,9 +826,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns the contents of the specified file.
summary: Retrieve file content.
description: >-
Retrieve file content.
Returns the contents of the specified file.
parameters:
- name: file_id
@ -854,9 +862,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inspect
summary: >-
Get the current health status of the service.
summary: Get health status.
description: >-
Get health status.
Get the current health status of the service.
parameters: []
deprecated: false
@ -882,9 +891,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inspect
summary: >-
List all available API routes with their methods and implementing providers.
summary: List routes.
description: >-
List routes.
List all available API routes with their methods and implementing providers.
parameters: []
deprecated: false
@ -933,8 +943,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Models
summary: Register a model.
description: Register a model.
summary: Register model.
description: >-
Register model.
Register a model.
parameters: []
requestBody:
content:
@ -964,8 +977,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Models
summary: Get a model by its identifier.
description: Get a model by its identifier.
summary: Get model.
description: >-
Get model.
Get a model by its identifier.
parameters:
- name: model_id
in: path
@ -990,8 +1006,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Models
summary: Unregister a model.
description: Unregister a model.
summary: Unregister model.
description: >-
Unregister model.
Unregister a model.
parameters:
- name: model_id
in: path
@ -1022,9 +1041,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Safety
summary: >-
Classifies if text and/or image inputs are potentially harmful.
summary: Create moderation.
description: >-
Create moderation.
Classifies if text and/or image inputs are potentially harmful.
parameters: []
requestBody:
@ -1080,8 +1100,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: Create a new prompt.
description: Create a new prompt.
summary: Create prompt.
description: >-
Create prompt.
Create a new prompt.
parameters: []
requestBody:
content:
@ -1111,9 +1134,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: >-
Get a prompt by its identifier and optional version.
summary: Get prompt.
description: >-
Get prompt.
Get a prompt by its identifier and optional version.
parameters:
- name: prompt_id
@ -1151,9 +1175,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: >-
Update an existing prompt (increments version).
summary: Update prompt.
description: >-
Update prompt.
Update an existing prompt (increments version).
parameters:
- name: prompt_id
@ -1185,8 +1210,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: Delete a prompt.
description: Delete a prompt.
summary: Delete prompt.
description: >-
Delete prompt.
Delete a prompt.
parameters:
- name: prompt_id
in: path
@ -1217,9 +1245,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: >-
Set which version of a prompt should be the default in get_prompt (latest).
summary: Set prompt version.
description: >-
Set prompt version.
Set which version of a prompt should be the default in get_prompt (latest).
parameters:
- name: prompt_id
@ -1257,8 +1286,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: List all versions of a specific prompt.
description: List all versions of a specific prompt.
summary: List prompt versions.
description: >-
List prompt versions.
List all versions of a specific prompt.
parameters:
- name: prompt_id
in: path
@ -1290,8 +1322,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Providers
summary: List all available providers.
description: List all available providers.
summary: List providers.
description: >-
List providers.
List all available providers.
parameters: []
deprecated: false
/v1/providers/{provider_id}:
@ -1316,9 +1351,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Providers
summary: >-
Get detailed information about a specific provider.
summary: Get provider.
description: >-
Get provider.
Get detailed information about a specific provider.
parameters:
- name: provider_id
@ -1349,8 +1385,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: List all OpenAI responses.
description: List all OpenAI responses.
summary: List all responses.
description: List all responses.
parameters:
- name: after
in: query
@ -1401,8 +1437,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Create a new OpenAI response.
description: Create a new OpenAI response.
summary: Create a model response.
description: Create a model response.
parameters: []
requestBody:
content:
@ -1444,8 +1480,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Retrieve an OpenAI response by its ID.
description: Retrieve an OpenAI response by its ID.
summary: Get a model response.
description: Get a model response.
parameters:
- name: response_id
in: path
@ -1475,8 +1511,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Delete an OpenAI response by its ID.
description: Delete an OpenAI response by its ID.
summary: Delete a response.
description: Delete a response.
parameters:
- name: response_id
in: path
@ -1506,10 +1542,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: >-
List input items for a given OpenAI response.
description: >-
List input items for a given OpenAI response.
summary: List input items.
description: List input items.
parameters:
- name: response_id
in: path
@ -1578,8 +1612,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Safety
summary: Run a shield.
description: Run a shield.
summary: Run shield.
description: >-
Run shield.
Run a shield.
parameters: []
requestBody:
content:
@ -3135,8 +3172,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inspect
summary: Get the version of the service.
description: Get the version of the service.
summary: Get version.
description: >-
Get version.
Get the version of the service.
parameters: []
deprecated: false
jsonSchemaDialect: >-
@ -9749,9 +9789,16 @@ tags:
x-displayName: >-
Protocol for conversation management operations.
- name: Files
description: ''
description: >-
This API is used to upload documents that can be used with other Llama Stack
APIs.
x-displayName: Files
- name: Inference
description: >-
Llama Stack Inference API for generating completions, chat completions, and
embeddings.
This API provides the raw interface to the underlying models. Two kinds of models
are supported:
@ -9759,23 +9806,25 @@ tags:
- Embedding models: these models generate embeddings to be used for semantic
search.
x-displayName: >-
Llama Stack Inference API for generating completions, chat completions, and
embeddings.
x-displayName: Inference
- name: Inspect
description: ''
description: >-
APIs for inspecting the Llama Stack service, including health status, available
API routes with methods and implementing providers.
x-displayName: Inspect
- name: Models
description: ''
- name: Prompts
description: ''
x-displayName: >-
description: >-
Protocol for prompt management operations.
x-displayName: Prompts
- name: Providers
description: ''
x-displayName: >-
description: >-
Providers API for inspecting, listing, and modifying providers and their configurations.
x-displayName: Providers
- name: Safety
description: ''
description: OpenAI-compatible Moderations API.
x-displayName: Safety
- name: Scoring
description: ''
- name: ScoringFunctions
View file
@ -69,8 +69,8 @@
"tags": [
"Inference"
],
"summary": "List all chat completions.",
"description": "List all chat completions.",
"summary": "List chat completions.",
"description": "List chat completions.",
"parameters": [
{
"name": "after",
@ -146,8 +146,8 @@
"tags": [
"Inference"
],
"summary": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
"description": "Generate an OpenAI-compatible chat completion for the given messages using the specified model.",
"summary": "Create chat completions.",
"description": "Create chat completions.\nGenerate an OpenAI-compatible chat completion for the given messages using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -191,8 +191,8 @@
"tags": [
"Inference"
],
"summary": "Describe a chat completion by its ID.",
"description": "Describe a chat completion by its ID.",
"summary": "Get chat completion.",
"description": "Get chat completion.\nDescribe a chat completion by its ID.",
"parameters": [
{
"name": "completion_id",
@ -236,8 +236,8 @@
"tags": [
"Inference"
],
"summary": "Generate an OpenAI-compatible completion for the given prompt using the specified model.",
"description": "Generate an OpenAI-compatible completion for the given prompt using the specified model.",
"summary": "Create completion.",
"description": "Create completion.\nGenerate an OpenAI-compatible completion for the given prompt using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -758,8 +758,8 @@
"tags": [
"Inference"
],
"summary": "Generate OpenAI-compatible embeddings for the given input using the specified model.",
"description": "Generate OpenAI-compatible embeddings for the given input using the specified model.",
"summary": "Create embeddings.",
"description": "Create embeddings.\nGenerate OpenAI-compatible embeddings for the given input using the specified model.",
"parameters": [],
"requestBody": {
"content": {
@ -803,8 +803,8 @@
"tags": [
"Files"
],
"summary": "Returns a list of files that belong to the user's organization.",
"description": "Returns a list of files that belong to the user's organization.",
"summary": "List files.",
"description": "List files.\nReturns a list of files that belong to the user's organization.",
"parameters": [
{
"name": "after",
@ -873,8 +873,8 @@
"tags": [
"Files"
],
"summary": "Upload a file that can be used across various endpoints.",
"description": "Upload a file that can be used across various endpoints.\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.",
"summary": "Upload file.",
"description": "Upload file.\nUpload a file that can be used across various endpoints.\n\nThe file upload should be a multipart form request with:\n- file: The File object (not file name) to be uploaded.\n- purpose: The intended purpose of the uploaded file.\n- expires_after: Optional form values describing expiration for the file.",
"parameters": [],
"requestBody": {
"content": {
@ -934,8 +934,8 @@
"tags": [
"Files"
],
"summary": "Returns information about a specific file.",
"description": "Returns information about a specific file.",
"summary": "Retrieve file.",
"description": "Retrieve file.\nReturns information about a specific file.",
"parameters": [
{
"name": "file_id",
@ -977,8 +977,8 @@
"tags": [
"Files"
],
"summary": "Delete a file.",
"description": "Delete a file.",
"summary": "Delete file.",
"description": "Delete file.",
"parameters": [
{
"name": "file_id",
@ -1022,8 +1022,8 @@
"tags": [
"Files"
],
"summary": "Returns the contents of the specified file.",
"description": "Returns the contents of the specified file.",
"summary": "Retrieve file content.",
"description": "Retrieve file content.\nReturns the contents of the specified file.",
"parameters": [
{
"name": "file_id",
@ -1067,8 +1067,8 @@
"tags": [
"Inspect"
],
"summary": "Get the current health status of the service.",
"description": "Get the current health status of the service.",
"summary": "Get health status.",
"description": "Get health status.\nGet the current health status of the service.",
"parameters": [],
"deprecated": false
}
@ -1102,8 +1102,8 @@
"tags": [
"Inspect"
],
"summary": "List all available API routes with their methods and implementing providers.",
"description": "List all available API routes with their methods and implementing providers.",
"summary": "List routes.",
"description": "List routes.\nList all available API routes with their methods and implementing providers.",
"parameters": [],
"deprecated": false
}
@ -1170,8 +1170,8 @@
"tags": [
"Models"
],
"summary": "Register a model.",
"description": "Register a model.",
"summary": "Register model.",
"description": "Register model.\nRegister a model.",
"parameters": [],
"requestBody": {
"content": {
@ -1215,8 +1215,8 @@
"tags": [
"Models"
],
"summary": "Get a model by its identifier.",
"description": "Get a model by its identifier.",
"summary": "Get model.",
"description": "Get model.\nGet a model by its identifier.",
"parameters": [
{
"name": "model_id",
@ -1251,8 +1251,8 @@
"tags": [
"Models"
],
"summary": "Unregister a model.",
"description": "Unregister a model.",
"summary": "Unregister model.",
"description": "Unregister model.\nUnregister a model.",
"parameters": [
{
"name": "model_id",
@ -1296,8 +1296,8 @@
"tags": [
"Safety"
],
"summary": "Classifies if text and/or image inputs are potentially harmful.",
"description": "Classifies if text and/or image inputs are potentially harmful.",
"summary": "Create moderation.",
"description": "Create moderation.\nClassifies if text and/or image inputs are potentially harmful.",
"parameters": [],
"requestBody": {
"content": {
@ -1374,8 +1374,8 @@
"tags": [
"Prompts"
],
"summary": "Create a new prompt.",
"description": "Create a new prompt.",
"summary": "Create prompt.",
"description": "Create prompt.\nCreate a new prompt.",
"parameters": [],
"requestBody": {
"content": {
@ -1419,8 +1419,8 @@
"tags": [
"Prompts"
],
"summary": "Get a prompt by its identifier and optional version.",
"description": "Get a prompt by its identifier and optional version.",
"summary": "Get prompt.",
"description": "Get prompt.\nGet a prompt by its identifier and optional version.",
"parameters": [
{
"name": "prompt_id",
@ -1471,8 +1471,8 @@
"tags": [
"Prompts"
],
"summary": "Update an existing prompt (increments version).",
"description": "Update an existing prompt (increments version).",
"summary": "Update prompt.",
"description": "Update prompt.\nUpdate an existing prompt (increments version).",
"parameters": [
{
"name": "prompt_id",
@ -1517,8 +1517,8 @@
"tags": [
"Prompts"
],
"summary": "Delete a prompt.",
"description": "Delete a prompt.",
"summary": "Delete prompt.",
"description": "Delete prompt.\nDelete a prompt.",
"parameters": [
{
"name": "prompt_id",
@ -1562,8 +1562,8 @@
"tags": [
"Prompts"
],
"summary": "Set which version of a prompt should be the default in get_prompt (latest).",
"description": "Set which version of a prompt should be the default in get_prompt (latest).",
"summary": "Set prompt version.",
"description": "Set prompt version.\nSet which version of a prompt should be the default in get_prompt (latest).",
"parameters": [
{
"name": "prompt_id",
@ -1617,8 +1617,8 @@
"tags": [
"Prompts"
],
"summary": "List all versions of a specific prompt.",
"description": "List all versions of a specific prompt.",
"summary": "List prompt versions.",
"description": "List prompt versions.\nList all versions of a specific prompt.",
"parameters": [
{
"name": "prompt_id",
@ -1662,8 +1662,8 @@
"tags": [
"Providers"
],
"summary": "List all available providers.",
"description": "List all available providers.",
"summary": "List providers.",
"description": "List providers.\nList all available providers.",
"parameters": [],
"deprecated": false
}
@ -1697,8 +1697,8 @@
"tags": [
"Providers"
],
"summary": "Get detailed information about a specific provider.",
"description": "Get detailed information about a specific provider.",
"summary": "Get provider.",
"description": "Get provider.\nGet detailed information about a specific provider.",
"parameters": [
{
"name": "provider_id",
@ -1742,8 +1742,8 @@
"tags": [
"Agents"
],
"summary": "List all OpenAI responses.",
"description": "List all OpenAI responses.",
"summary": "List all responses.",
"description": "List all responses.",
"parameters": [
{
"name": "after",
@ -1817,8 +1817,8 @@
"tags": [
"Agents"
],
"summary": "Create a new OpenAI response.",
"description": "Create a new OpenAI response.",
"summary": "Create a model response.",
"description": "Create a model response.",
"parameters": [],
"requestBody": {
"content": {
@ -1882,8 +1882,8 @@
"tags": [
"Agents"
],
"summary": "Retrieve an OpenAI response by its ID.",
"description": "Retrieve an OpenAI response by its ID.",
"summary": "Get a model response.",
"description": "Get a model response.",
"parameters": [
{
"name": "response_id",
@ -1925,8 +1925,8 @@
"tags": [
"Agents"
],
"summary": "Delete an OpenAI response by its ID.",
"description": "Delete an OpenAI response by its ID.",
"summary": "Delete a response.",
"description": "Delete a response.",
"parameters": [
{
"name": "response_id",
@ -1970,8 +1970,8 @@
"tags": [
"Agents"
],
"summary": "List input items for a given OpenAI response.",
"description": "List input items for a given OpenAI response.",
"summary": "List input items.",
"description": "List input items.",
"parameters": [
{
"name": "response_id",
@ -2063,8 +2063,8 @@
"tags": [
"Safety"
],
"summary": "Run a shield.",
"description": "Run a shield.",
"summary": "Run shield.",
"description": "Run shield.\nRun a shield.",
"parameters": [],
"requestBody": {
"content": {
@ -4196,8 +4196,8 @@
"tags": [
"Inspect"
],
"summary": "Get the version of the service.",
"description": "Get the version of the service.",
"summary": "Get version.",
"description": "Get version.\nGet the version of the service.",
"parameters": [],
"deprecated": false
}
@ -18487,16 +18487,18 @@
},
{
"name": "Files",
"description": ""
"description": "This API is used to upload documents that can be used with other Llama Stack APIs.",
"x-displayName": "Files"
},
{
"name": "Inference",
"description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Llama Stack Inference API for generating completions, chat completions, and embeddings."
"description": "Llama Stack Inference API for generating completions, chat completions, and embeddings.\n\nThis API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
"x-displayName": "Inference"
},
{
"name": "Inspect",
"description": ""
"description": "APIs for inspecting the Llama Stack service, including health status, available API routes with methods and implementing providers.",
"x-displayName": "Inspect"
},
{
"name": "Models",
@ -18508,17 +18510,18 @@
},
{
"name": "Prompts",
"description": "",
"x-displayName": "Protocol for prompt management operations."
"description": "Protocol for prompt management operations.",
"x-displayName": "Prompts"
},
{
"name": "Providers",
"description": "",
"x-displayName": "Providers API for inspecting, listing, and modifying providers and their configurations."
"description": "Providers API for inspecting, listing, and modifying providers and their configurations.",
"x-displayName": "Providers"
},
{
"name": "Safety",
"description": ""
"description": "OpenAI-compatible Moderations API.",
"x-displayName": "Safety"
},
{
"name": "Scoring",
View file
@ -36,8 +36,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: List all chat completions.
description: List all chat completions.
summary: List chat completions.
description: List chat completions.
parameters:
- name: after
in: query
@ -90,10 +90,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate an OpenAI-compatible chat completion for the given messages using
the specified model.
summary: Create chat completions.
description: >-
Create chat completions.
Generate an OpenAI-compatible chat completion for the given messages using
the specified model.
parameters: []
@ -125,8 +125,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: Describe a chat completion by its ID.
description: Describe a chat completion by its ID.
summary: Get chat completion.
description: >-
Get chat completion.
Describe a chat completion by its ID.
parameters:
- name: completion_id
in: path
@ -156,10 +159,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate an OpenAI-compatible completion for the given prompt using the specified
model.
summary: Create completion.
description: >-
Create completion.
Generate an OpenAI-compatible completion for the given prompt using the specified
model.
parameters: []
@ -606,10 +609,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inference
summary: >-
Generate OpenAI-compatible embeddings for the given input using the specified
model.
summary: Create embeddings.
description: >-
Create embeddings.
Generate OpenAI-compatible embeddings for the given input using the specified
model.
parameters: []
@ -642,9 +645,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns a list of files that belong to the user's organization.
summary: List files.
description: >-
List files.
Returns a list of files that belong to the user's organization.
parameters:
- name: after
@ -702,11 +706,13 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Upload a file that can be used across various endpoints.
summary: Upload file.
description: >-
Upload file.
Upload a file that can be used across various endpoints.
The file upload should be a multipart form request with:
- file: The File object (not file name) to be uploaded.
@ -755,9 +761,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns information about a specific file.
summary: Retrieve file.
description: >-
Retrieve file.
Returns information about a specific file.
parameters:
- name: file_id
@ -789,8 +796,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: Delete a file.
description: Delete a file.
summary: Delete file.
description: Delete file.
parameters:
- name: file_id
in: path
@ -822,9 +829,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Files
summary: >-
Returns the contents of the specified file.
summary: Retrieve file content.
description: >-
Retrieve file content.
Returns the contents of the specified file.
parameters:
- name: file_id
@ -857,9 +865,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inspect
summary: >-
Get the current health status of the service.
summary: Get health status.
description: >-
Get health status.
Get the current health status of the service.
parameters: []
deprecated: false
@ -885,9 +894,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inspect
summary: >-
List all available API routes with their methods and implementing providers.
summary: List routes.
description: >-
List routes.
List all available API routes with their methods and implementing providers.
parameters: []
deprecated: false
@ -936,8 +946,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Models
summary: Register a model.
description: Register a model.
summary: Register model.
description: >-
Register model.
Register a model.
parameters: []
requestBody:
content:
@ -967,8 +980,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Models
summary: Get a model by its identifier.
description: Get a model by its identifier.
summary: Get model.
description: >-
Get model.
Get a model by its identifier.
parameters:
- name: model_id
in: path
@ -993,8 +1009,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Models
summary: Unregister a model.
description: Unregister a model.
summary: Unregister model.
description: >-
Unregister model.
Unregister a model.
parameters:
- name: model_id
in: path
@ -1025,9 +1044,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Safety
summary: >-
Classifies if text and/or image inputs are potentially harmful.
summary: Create moderation.
description: >-
Create moderation.
Classifies if text and/or image inputs are potentially harmful.
parameters: []
requestBody:
@ -1083,8 +1103,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: Create a new prompt.
description: Create a new prompt.
summary: Create prompt.
description: >-
Create prompt.
Create a new prompt.
parameters: []
requestBody:
content:
@ -1114,9 +1137,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: >-
Get a prompt by its identifier and optional version.
summary: Get prompt.
description: >-
Get prompt.
Get a prompt by its identifier and optional version.
parameters:
- name: prompt_id
@ -1154,9 +1178,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: >-
Update an existing prompt (increments version).
summary: Update prompt.
description: >-
Update prompt.
Update an existing prompt (increments version).
parameters:
- name: prompt_id
@ -1188,8 +1213,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: Delete a prompt.
description: Delete a prompt.
summary: Delete prompt.
description: >-
Delete prompt.
Delete a prompt.
parameters:
- name: prompt_id
in: path
@ -1220,9 +1248,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: >-
Set which version of a prompt should be the default in get_prompt (latest).
summary: Set prompt version.
description: >-
Set prompt version.
Set which version of a prompt should be the default in get_prompt (latest).
parameters:
- name: prompt_id
@ -1260,8 +1289,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Prompts
summary: List all versions of a specific prompt.
description: List all versions of a specific prompt.
summary: List prompt versions.
description: >-
List prompt versions.
List all versions of a specific prompt.
parameters:
- name: prompt_id
in: path
@ -1293,8 +1325,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Providers
summary: List all available providers.
description: List all available providers.
summary: List providers.
description: >-
List providers.
List all available providers.
parameters: []
deprecated: false
/v1/providers/{provider_id}:
@ -1319,9 +1354,10 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Providers
summary: >-
Get detailed information about a specific provider.
summary: Get provider.
description: >-
Get provider.
Get detailed information about a specific provider.
parameters:
- name: provider_id
@ -1352,8 +1388,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: List all OpenAI responses.
description: List all OpenAI responses.
summary: List all responses.
description: List all responses.
parameters:
- name: after
in: query
@ -1404,8 +1440,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Create a new OpenAI response.
description: Create a new OpenAI response.
summary: Create a model response.
description: Create a model response.
parameters: []
requestBody:
content:
@ -1447,8 +1483,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Retrieve an OpenAI response by its ID.
description: Retrieve an OpenAI response by its ID.
summary: Get a model response.
description: Get a model response.
parameters:
- name: response_id
in: path
@ -1478,8 +1514,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: Delete an OpenAI response by its ID.
description: Delete an OpenAI response by its ID.
summary: Delete a response.
description: Delete a response.
parameters:
- name: response_id
in: path
@ -1509,10 +1545,8 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Agents
summary: >-
List input items for a given OpenAI response.
description: >-
List input items for a given OpenAI response.
summary: List input items.
description: List input items.
parameters:
- name: response_id
in: path
@ -1581,8 +1615,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Safety
summary: Run a shield.
description: Run a shield.
summary: Run shield.
description: >-
Run shield.
Run a shield.
parameters: []
requestBody:
content:
@ -3138,8 +3175,11 @@ paths:
$ref: '#/components/responses/DefaultError'
tags:
- Inspect
summary: Get the version of the service.
description: Get the version of the service.
summary: Get version.
description: >-
Get version.
Get the version of the service.
parameters: []
deprecated: false
/v1beta/datasetio/append-rows/{dataset_id}:
@ -13795,9 +13835,16 @@ tags:
x-displayName: >-
Llama Stack Evaluation API for running evaluations on model and agent candidates.
- name: Files
description: ''
description: >-
This API is used to upload documents that can be used with other Llama Stack
APIs.
x-displayName: Files
- name: Inference
description: >-
Llama Stack Inference API for generating completions, chat completions, and
embeddings.
This API provides the raw interface to the underlying models. Two kinds of models
are supported:
@ -13805,25 +13852,27 @@ tags:
- Embedding models: these models generate embeddings to be used for semantic
search.
x-displayName: >-
Llama Stack Inference API for generating completions, chat completions, and
embeddings.
x-displayName: Inference
- name: Inspect
description: ''
description: >-
APIs for inspecting the Llama Stack service, including health status, available
API routes with methods and implementing providers.
x-displayName: Inspect
- name: Models
description: ''
- name: PostTraining (Coming Soon)
description: ''
- name: Prompts
description: ''
x-displayName: >-
description: >-
Protocol for prompt management operations.
x-displayName: Prompts
- name: Providers
description: ''
x-displayName: >-
description: >-
Providers API for inspecting, listing, and modifying providers and their configurations.
x-displayName: Providers
- name: Safety
description: ''
description: OpenAI-compatible Moderations API.
x-displayName: Safety
- name: Scoring
description: ''
- name: ScoringFunctions
View file
@ -88,7 +88,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
...
Build Successful!
You can find the newly-built template here: ~/.llama/distributions/starter/starter-run.yaml
You can run the new Llama Stack Distro via: uv run --with llama-stack llama stack run starter --image-type venv
You can run the new Llama Stack Distro via: uv run --with llama-stack llama stack run starter
```
3. **Set the ENV variables by exporting them to the terminal**:
@ -102,12 +102,11 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
3. **Run the Llama Stack**:
Run the stack using uv:
```bash
INFERENCE_MODEL=$INFERENCE_MODEL \
SAFETY_MODEL=$SAFETY_MODEL \
OLLAMA_URL=$OLLAMA_URL \
uv run --with llama-stack llama stack run starter \
--image-type venv \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env OLLAMA_URL=$OLLAMA_URL
--port $LLAMA_STACK_PORT
```
Note: Every time you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model.
View file
@ -797,7 +797,7 @@ class Agents(Protocol):
self,
response_id: str,
) -> OpenAIResponseObject:
"""Retrieve an OpenAI response by its ID.
"""Get a model response.
:param response_id: The ID of the OpenAI response to retrieve.
:returns: An OpenAIResponseObject.
@ -826,7 +826,7 @@ class Agents(Protocol):
),
] = None,
) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
"""Create a new OpenAI response.
"""Create a model response.
:param input: Input message(s) to create the response.
:param model: The underlying LLM used for completions.
@ -846,7 +846,7 @@ class Agents(Protocol):
model: str | None = None,
order: Order | None = Order.desc,
) -> ListOpenAIResponseObject:
"""List all OpenAI responses.
"""List all responses.
:param after: The ID of the last response to return.
:param limit: The number of responses to return.
@ -869,7 +869,7 @@ class Agents(Protocol):
limit: int | None = 20,
order: Order | None = Order.desc,
) -> ListOpenAIResponseInputItem:
"""List input items for a given OpenAI response.
"""List input items.
:param response_id: The ID of the response to retrieve input items for.
:param after: An item ID to list items after, used for pagination.
@ -884,7 +884,7 @@ class Agents(Protocol):
@webmethod(route="/openai/v1/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/responses/{response_id}", method="DELETE", level=LLAMA_STACK_API_V1)
async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
"""Delete an OpenAI response by its ID.
"""Delete a response.
:param response_id: The ID of the OpenAI response to delete.
:returns: An OpenAIDeleteResponseObject
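A minimal sketch of the renamed Responses operations over plain HTTP, assuming a local server on the default port 8321 with the `/v1` prefix; the model id, the `input_items` path, and the response's `id` field are assumptions not confirmed by this diff:

```python
import requests

BASE = "http://localhost:8321/v1"  # assumed default server address and prefix

# Create a model response (POST /responses); the model id is a placeholder
created = requests.post(
    f"{BASE}/responses",
    json={"model": "llama3.2:3b", "input": "Say hello in one sentence."},
).json()
response_id = created["id"]  # response field name assumed

print(requests.get(f"{BASE}/responses/{response_id}").json())              # Get a model response.
print(requests.get(f"{BASE}/responses/{response_id}/input_items").json())  # List input items (path assumed).
requests.delete(f"{BASE}/responses/{response_id}")                         # Delete a response.
```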
View file
@ -104,6 +104,11 @@ class OpenAIFileDeleteResponse(BaseModel):
@runtime_checkable
@trace_protocol
class Files(Protocol):
"""Files
This API is used to upload documents that can be used with other Llama Stack APIs.
"""
# OpenAI Files API Endpoints
@webmethod(route="/openai/v1/files", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/files", method="POST", level=LLAMA_STACK_API_V1)
@ -113,7 +118,8 @@ class Files(Protocol):
purpose: Annotated[OpenAIFilePurpose, Form()],
expires_after: Annotated[ExpiresAfter | None, Form()] = None,
) -> OpenAIFileObject:
"""
"""Upload file.
Upload a file that can be used across various endpoints.
The file upload should be a multipart form request with:
@ -137,7 +143,8 @@ class Files(Protocol):
order: Order | None = Order.desc,
purpose: OpenAIFilePurpose | None = None,
) -> ListOpenAIFileResponse:
"""
"""List files.
Returns a list of files that belong to the user's organization.
:param after: A cursor for use in pagination. `after` is an object ID that defines your place in the list. For instance, if you make a list request and receive 100 objects, ending with obj_foo, your subsequent call can include after=obj_foo in order to fetch the next page of the list.
@ -154,7 +161,8 @@ class Files(Protocol):
self,
file_id: str,
) -> OpenAIFileObject:
"""
"""Retrieve file.
Returns information about a specific file.
:param file_id: The ID of the file to use for this request.
@ -168,8 +176,7 @@ class Files(Protocol):
self,
file_id: str,
) -> OpenAIFileDeleteResponse:
"""
Delete a file.
"""Delete file.
:param file_id: The ID of the file to use for this request.
:returns: An OpenAIFileDeleteResponse indicating successful deletion.
@ -182,7 +189,8 @@ class Files(Protocol):
self,
file_id: str,
) -> Response:
"""
"""Retrieve file content.
Returns the contents of the specified file.
:param file_id: The ID of the file to use for this request.
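A minimal sketch of the multipart upload described in the docstring above, plus the retrieve/content/delete calls, assuming a local server on port 8321; the `assistants` purpose value, the `/content` path, and the `id` field are assumptions:

```python
import requests

BASE = "http://localhost:8321/v1"  # assumed default server address and prefix

# Upload file: a multipart form with `file` and `purpose`, as the docstring describes
with open("notes.txt", "rb") as f:
    uploaded = requests.post(
        f"{BASE}/files",
        files={"file": f},
        data={"purpose": "assistants"},  # purpose value is a placeholder
    ).json()

file_id = uploaded["id"]  # response field name assumed
print(requests.get(f"{BASE}/files/{file_id}").json())        # Retrieve file.
print(requests.get(f"{BASE}/files/{file_id}/content").text)  # Retrieve file content (path assumed).
requests.delete(f"{BASE}/files/{file_id}")                   # Delete file.
```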
View file
@ -1053,7 +1053,9 @@ class InferenceProvider(Protocol):
# for fill-in-the-middle type completion
suffix: str | None = None,
) -> OpenAICompletion:
"""Generate an OpenAI-compatible completion for the given prompt using the specified model.
"""Create completion.
Generate an OpenAI-compatible completion for the given prompt using the specified model.
:param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param prompt: The prompt to generate a completion for.
@ -1105,7 +1107,9 @@ class InferenceProvider(Protocol):
top_p: float | None = None,
user: str | None = None,
) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
"""Generate an OpenAI-compatible chat completion for the given messages using the specified model.
"""Create chat completions.
Generate an OpenAI-compatible chat completion for the given messages using the specified model.
:param model: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param messages: List of messages in the conversation.
@ -1144,7 +1148,9 @@ class InferenceProvider(Protocol):
dimensions: int | None = None,
user: str | None = None,
) -> OpenAIEmbeddingsResponse:
"""Generate OpenAI-compatible embeddings for the given input using the specified model.
"""Create embeddings.
Generate OpenAI-compatible embeddings for the given input using the specified model.
:param model: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
:param input: Input text to embed, encoded as a string or array of strings. To embed multiple inputs in a single request, pass an array of strings.
@ -1157,7 +1163,9 @@ class InferenceProvider(Protocol):
class Inference(InferenceProvider):
"""Llama Stack Inference API for generating completions, chat completions, and embeddings.
"""Inference
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
@ -1173,7 +1181,7 @@ class Inference(InferenceProvider):
model: str | None = None,
order: Order | None = Order.desc,
) -> ListOpenAIChatCompletionResponse:
"""List all chat completions.
"""List chat completions.
:param after: The ID of the last chat completion to return.
:param limit: The maximum number of chat completions to return.
@ -1188,7 +1196,9 @@ class Inference(InferenceProvider):
)
@webmethod(route="/chat/completions/{completion_id}", method="GET", level=LLAMA_STACK_API_V1)
async def get_chat_completion(self, completion_id: str) -> OpenAICompletionWithInputMessages:
"""Describe a chat completion by its ID.
"""Get chat completion.
Describe a chat completion by its ID.
:param completion_id: ID of the chat completion.
:returns: A OpenAICompletionWithInputMessages.
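A minimal sketch of the OpenAI-compatible inference calls renamed above, assuming a local server on port 8321 and placeholder model ids:

```python
import requests

BASE = "http://localhost:8321/v1"  # assumed default server address and prefix

# Create chat completions (OpenAI-compatible request body)
chat = requests.post(
    f"{BASE}/chat/completions",
    json={"model": "llama3.2:3b", "messages": [{"role": "user", "content": "Hi!"}]},
).json()
print(chat["choices"][0]["message"]["content"])

# Create embeddings with an embedding model (placeholder id)
emb = requests.post(
    f"{BASE}/embeddings",
    json={"model": "all-MiniLM-L6-v2", "input": "text to embed"},
).json()

# List chat completions, then get one by its id
print(requests.get(f"{BASE}/chat/completions").json())
print(requests.get(f"{BASE}/chat/completions/{chat['id']}").json())  # id field assumed
```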
View file
@ -58,9 +58,16 @@ class ListRoutesResponse(BaseModel):
@runtime_checkable
class Inspect(Protocol):
"""Inspect
APIs for inspecting the Llama Stack service, including health status, available API routes with methods and implementing providers.
"""
@webmethod(route="/inspect/routes", method="GET", level=LLAMA_STACK_API_V1)
async def list_routes(self) -> ListRoutesResponse:
"""List all available API routes with their methods and implementing providers.
"""List routes.
List all available API routes with their methods and implementing providers.
:returns: Response containing information about all available routes.
"""
@ -68,7 +75,9 @@ class Inspect(Protocol):
@webmethod(route="/health", method="GET", level=LLAMA_STACK_API_V1)
async def health(self) -> HealthInfo:
"""Get the current health status of the service.
"""Get health status.
Get the current health status of the service.
:returns: Health information indicating if the service is operational.
"""
@ -76,7 +85,9 @@ class Inspect(Protocol):
@webmethod(route="/version", method="GET", level=LLAMA_STACK_API_V1)
async def version(self) -> VersionInfo:
"""Get the version of the service.
"""Get version.
Get the version of the service.
:returns: Version information containing the service version number.
"""
View file
@ -124,7 +124,9 @@ class Models(Protocol):
self,
model_id: str,
) -> Model:
"""Get a model by its identifier.
"""Get model.
Get a model by its identifier.
:param model_id: The identifier of the model to get.
:returns: A Model.
@ -140,7 +142,9 @@ class Models(Protocol):
metadata: dict[str, Any] | None = None,
model_type: ModelType | None = None,
) -> Model:
"""Register a model.
"""Register model.
Register a model.
:param model_id: The identifier of the model to register.
:param provider_model_id: The identifier of the model in the provider.
@ -156,7 +160,9 @@ class Models(Protocol):
self,
model_id: str,
) -> None:
"""Unregister a model.
"""Unregister model.
Unregister a model.
:param model_id: The identifier of the model to unregister.
"""
View file
@ -94,7 +94,9 @@ class ListPromptsResponse(BaseModel):
@runtime_checkable
@trace_protocol
class Prompts(Protocol):
"""Protocol for prompt management operations."""
"""Prompts
Protocol for prompt management operations."""
@webmethod(route="/prompts", method="GET", level=LLAMA_STACK_API_V1)
async def list_prompts(self) -> ListPromptsResponse:
@ -109,7 +111,9 @@ class Prompts(Protocol):
self,
prompt_id: str,
) -> ListPromptsResponse:
"""List all versions of a specific prompt.
"""List prompt versions.
List all versions of a specific prompt.
:param prompt_id: The identifier of the prompt to list versions for.
:returns: A ListPromptsResponse containing all versions of the prompt.
@ -122,7 +126,9 @@ class Prompts(Protocol):
prompt_id: str,
version: int | None = None,
) -> Prompt:
"""Get a prompt by its identifier and optional version.
"""Get prompt.
Get a prompt by its identifier and optional version.
:param prompt_id: The identifier of the prompt to get.
:param version: The version of the prompt to get (defaults to latest).
@ -136,7 +142,9 @@ class Prompts(Protocol):
prompt: str,
variables: list[str] | None = None,
) -> Prompt:
"""Create a new prompt.
"""Create prompt.
Create a new prompt.
:param prompt: The prompt text content with variable placeholders.
:param variables: List of variable names that can be used in the prompt template.
@ -153,7 +161,9 @@ class Prompts(Protocol):
variables: list[str] | None = None,
set_as_default: bool = True,
) -> Prompt:
"""Update an existing prompt (increments version).
"""Update prompt.
Update an existing prompt (increments version).
:param prompt_id: The identifier of the prompt to update.
:param prompt: The updated prompt text content.
@ -169,7 +179,9 @@ class Prompts(Protocol):
self,
prompt_id: str,
) -> None:
"""Delete a prompt.
"""Delete prompt.
Delete a prompt.
:param prompt_id: The identifier of the prompt to delete.
"""
@ -181,7 +193,9 @@ class Prompts(Protocol):
prompt_id: str,
version: int,
) -> Prompt:
"""Set which version of a prompt should be the default in get_prompt (latest).
"""Set prompt version.
Set which version of a prompt should be the default in get_prompt (latest).
:param prompt_id: The identifier of the prompt.
:param version: The version to set as default.
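A minimal sketch of the Prompts operations, assuming a local server on port 8321; the create payload follows `create_prompt`'s signature, while the `prompt_id` response field and the `/versions` path are assumptions:

```python
import requests

BASE = "http://localhost:8321/v1"  # assumed default server address and prefix

# Create prompt: field names follow create_prompt's signature
prompt = requests.post(
    f"{BASE}/prompts",
    json={"prompt": "Summarize {{ text }} in one line.", "variables": ["text"]},
).json()
pid = prompt["prompt_id"]  # response field name assumed

print(requests.get(f"{BASE}/prompts/{pid}").json())           # Get prompt (latest version).
print(requests.get(f"{BASE}/prompts/{pid}/versions").json())  # List prompt versions (path assumed).
requests.delete(f"{BASE}/prompts/{pid}")                      # Delete prompt.
```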
View file
@ -42,13 +42,16 @@ class ListProvidersResponse(BaseModel):
@runtime_checkable
class Providers(Protocol):
"""
"""Providers
Providers API for inspecting, listing, and modifying providers and their configurations.
"""
@webmethod(route="/providers", method="GET", level=LLAMA_STACK_API_V1)
async def list_providers(self) -> ListProvidersResponse:
"""List all available providers.
"""List providers.
List all available providers.
:returns: A ListProvidersResponse containing information about all providers.
"""
@ -56,7 +59,9 @@ class Providers(Protocol):
@webmethod(route="/providers/{provider_id}", method="GET", level=LLAMA_STACK_API_V1)
async def inspect_provider(self, provider_id: str) -> ProviderInfo:
"""Get detailed information about a specific provider.
"""Get provider.
Get detailed information about a specific provider.
:param provider_id: The ID of the provider to inspect.
:returns: A ProviderInfo object containing the provider's details.
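Both routes are visible in the decorators above, so a quick smoke test is straightforward; the response field names below are assumptions:

```python
import httpx

base = "http://localhost:8321/v1"

providers = httpx.get(f"{base}/providers").json()          # ListProvidersResponse
first_id = providers["data"][0]["provider_id"]             # field names assumed
detail = httpx.get(f"{base}/providers/{first_id}").json()  # ProviderInfo
print(detail)
```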

View file

@ -96,6 +96,11 @@ class ShieldStore(Protocol):
@runtime_checkable
@trace_protocol
class Safety(Protocol):
"""Safety
OpenAI-compatible Moderations API.
"""
shield_store: ShieldStore
@webmethod(route="/safety/run-shield", method="POST", level=LLAMA_STACK_API_V1)
@ -105,7 +110,9 @@ class Safety(Protocol):
messages: list[Message],
params: dict[str, Any],
) -> RunShieldResponse:
"""Run a shield.
"""Run shield.
Run a shield.
:param shield_id: The identifier of the shield to run.
:param messages: The messages to run the shield on.
@ -117,7 +124,9 @@ class Safety(Protocol):
@webmethod(route="/openai/v1/moderations", method="POST", level=LLAMA_STACK_API_V1, deprecated=True)
@webmethod(route="/moderations", method="POST", level=LLAMA_STACK_API_V1)
async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
"""Classifies if text and/or image inputs are potentially harmful.
"""Create moderation.
Classifies if text and/or image inputs are potentially harmful.
:param input: Input (or inputs) to classify.
Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models.
:param model: The content moderation model you would like to use.
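Since `/moderations` is OpenAI-compatible, the stock `openai` client can call it directly. A minimal sketch, assuming a stack on the default port and a registered moderation model (the model name is a placeholder):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")
result = client.moderations.create(
    model="llama-guard",                       # placeholder model identifier
    input=["I want to build a birdhouse."],    # a single string also works
)
print(result.results[0].flagged)               # True if classified as harmful
```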

View file

@ -444,12 +444,24 @@ def _run_stack_build_command_from_build_config(
cprint("Build Successful!", color="green", file=sys.stderr)
cprint(f"You can find the newly-built distribution here: {run_config_file}", color="blue", file=sys.stderr)
cprint(
"You can run the new Llama Stack distro via: "
+ colored(f"llama stack run {run_config_file} --image-type {build_config.image_type}", "blue"),
color="green",
file=sys.stderr,
)
if build_config.image_type == LlamaStackImageType.VENV:
cprint(
"You can run the new Llama Stack distro (after activating "
+ colored(image_name, "cyan")
+ ") via: "
+ colored(f"llama stack run {run_config_file}", "blue"),
color="green",
file=sys.stderr,
)
elif build_config.image_type == LlamaStackImageType.CONTAINER:
cprint(
"You can run the container with: "
+ colored(
f"docker run -p 8321:8321 -v ~/.llama:/root/.llama localhost/{image_name} --port 8321", "blue"
),
color="green",
file=sys.stderr,
)
return distro_path
else:
return _generate_run_config(build_config, build_dir, image_name)

View file

@ -6,11 +6,18 @@
import argparse
import os
import ssl
import subprocess
from pathlib import Path
import uvicorn
import yaml
from llama_stack.cli.stack.utils import ImageType
from llama_stack.cli.subcommand import Subcommand
from llama_stack.core.datatypes import LoggingConfig, StackRunConfig
from llama_stack.core.stack import cast_image_name_to_string, replace_env_vars
from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
from llama_stack.log import get_logger
REPO_ROOT = Path(__file__).parent.parent.parent.parent
@ -48,18 +55,12 @@ class StackRun(Subcommand):
"--image-name",
type=str,
default=None,
help="Name of the image to run. Defaults to the current environment",
)
self.parser.add_argument(
"--env",
action="append",
help="Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.",
metavar="KEY=VALUE",
help="[DEPRECATED] This flag is no longer supported. Please activate your virtual environment before running.",
)
self.parser.add_argument(
"--image-type",
type=str,
help="Image Type used during the build. This can be only venv.",
help="[DEPRECATED] This flag is no longer supported. Please activate your virtual environment before running.",
choices=[e.value for e in ImageType if e.value != ImageType.CONTAINER.value],
)
self.parser.add_argument(
@ -68,48 +69,22 @@ class StackRun(Subcommand):
help="Start the UI server",
)
def _resolve_config_and_distro(self, args: argparse.Namespace) -> tuple[Path | None, str | None]:
"""Resolve config file path and distribution name from args.config"""
from llama_stack.core.utils.config_dirs import DISTRIBS_BASE_DIR
if not args.config:
return None, None
config_file = Path(args.config)
has_yaml_suffix = args.config.endswith(".yaml")
distro_name = None
if not config_file.exists() and not has_yaml_suffix:
# check if this is a distribution
config_file = Path(REPO_ROOT) / "llama_stack" / "distributions" / args.config / "run.yaml"
if config_file.exists():
distro_name = args.config
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to ~/.llama dir
config_file = Path(DISTRIBS_BASE_DIR / f"llamastack-{args.config}" / f"{args.config}-run.yaml")
if not config_file.exists():
self.parser.error(
f"File {str(config_file)} does not exist.\n\nPlease run `llama stack build` to generate (and optionally edit) a run.yaml file"
)
if not config_file.is_file():
self.parser.error(
f"Config file must be a valid file path, '{config_file}' is not a file: type={type(config_file)}"
)
return config_file, distro_name
def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
import yaml
from llama_stack.core.configure import parse_and_maybe_upgrade_config
from llama_stack.core.utils.exec import formulate_run_args, run_command
if args.image_type or args.image_name:
self.parser.error(
"The --image-type and --image-name flags are no longer supported.\n\n"
"Please activate your virtual environment manually before running `llama stack run`.\n\n"
"For example:\n"
" source /path/to/venv/bin/activate\n"
" llama stack run <config>\n"
)
if args.enable_ui:
self._start_ui_development_server(args.port)
image_type, image_name = args.image_type, args.image_name
if args.config:
try:
@ -121,10 +96,6 @@ class StackRun(Subcommand):
else:
config_file = None
# Check if config is required based on image type
if image_type == ImageType.VENV.value and not config_file:
self.parser.error("Config file is required for venv environment")
if config_file:
logger.info(f"Using run configuration: {config_file}")
@ -139,50 +110,67 @@ class StackRun(Subcommand):
os.makedirs(str(config.external_providers_dir), exist_ok=True)
except AttributeError as e:
self.parser.error(f"failed to parse config file '{config_file}':\n {e}")
self._uvicorn_run(config_file, args)
def _uvicorn_run(self, config_file: Path | None, args: argparse.Namespace) -> None:
if not config_file:
self.parser.error("Config file is required")
config_file = resolve_config_or_distro(str(config_file), Mode.RUN)
with open(config_file) as fp:
config_contents = yaml.safe_load(fp)
if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")):
logger_config = LoggingConfig(**cfg)
else:
logger_config = None
config = StackRunConfig(**cast_image_name_to_string(replace_env_vars(config_contents)))
port = args.port or config.server.port
host = config.server.host or ["::", "0.0.0.0"]
# Set the config file in environment so create_app can find it
os.environ["LLAMA_STACK_CONFIG"] = str(config_file)
uvicorn_config = {
"factory": True,
"host": host,
"port": port,
"lifespan": "on",
"log_level": logger.getEffectiveLevel(),
"log_config": logger_config,
}
keyfile = config.server.tls_keyfile
certfile = config.server.tls_certfile
if keyfile and certfile:
uvicorn_config["ssl_keyfile"] = config.server.tls_keyfile
uvicorn_config["ssl_certfile"] = config.server.tls_certfile
if config.server.tls_cafile:
uvicorn_config["ssl_ca_certs"] = config.server.tls_cafile
uvicorn_config["ssl_cert_reqs"] = ssl.CERT_REQUIRED
logger.info(
f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}\n CA: {config.server.tls_cafile}"
)
else:
config = None
logger.info(f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}")
# If neither image type nor image name is provided, assume the server should be run directly
# using the current environment packages.
if not image_type and not image_name:
logger.info("No image type or image name provided. Assuming environment packages.")
from llama_stack.core.server.server import main as server_main
logger.info(f"Listening on {host}:{port}")
# Build the server args from the current args passed to the CLI
server_args = argparse.Namespace()
for arg in vars(args):
# If this is a function, avoid passing it
# "args" contains:
# func=<bound method StackRun._run_stack_run_cmd of <llama_stack.cli.stack.run.StackRun object at 0x10484b010>>
if callable(getattr(args, arg)):
continue
if arg == "config":
server_args.config = str(config_file)
else:
setattr(server_args, arg, getattr(args, arg))
# Run the server
server_main(server_args)
else:
run_args = formulate_run_args(image_type, image_name)
run_args.extend([str(args.port)])
if config_file:
run_args.extend(["--config", str(config_file)])
if args.env:
for env_var in args.env:
if "=" not in env_var:
self.parser.error(f"Environment variable '{env_var}' must be in KEY=VALUE format")
return
key, value = env_var.split("=", 1) # split on first = only
if not key:
self.parser.error(f"Environment variable '{env_var}' has empty key")
return
run_args.extend(["--env", f"{key}={value}"])
run_command(run_args)
# We need to catch KeyboardInterrupt because uvicorn's signal handling
# re-raises SIGINT signals using signal.raise_signal(), which Python
# converts to KeyboardInterrupt. Without this catch, we'd get a confusing
# stack trace when using Ctrl+C or kill -2 (SIGINT).
# SIGTERM (kill -15) works fine without this because Python doesn't
# have a default handler for it.
#
# Another approach would be to ignore SIGINT entirely - let uvicorn handle it through its own
# signal handling but this is quite intrusive and not worth the effort.
try:
uvicorn.run("llama_stack.core.server.server:create_app", **uvicorn_config)
except (KeyboardInterrupt, SystemExit):
logger.info("Received interrupt signal, shutting down gracefully...")
def _start_ui_development_server(self, stack_server_port: int):
logger.info("Attempting to start UI development server...")

View file

@ -324,14 +324,14 @@ fi
RUN pip uninstall -y uv
EOF
# If a run config is provided, we use the --config flag
# If a run config is provided, we use the llama stack CLI
if [[ -n "$run_config" ]]; then
add_to_container << EOF
ENTRYPOINT ["python", "-m", "llama_stack.core.server.server", "$RUN_CONFIG_PATH"]
ENTRYPOINT ["llama", "stack", "run", "$RUN_CONFIG_PATH"]
EOF
elif [[ "$distro_or_config" != *.yaml ]]; then
add_to_container << EOF
ENTRYPOINT ["python", "-m", "llama_stack.core.server.server", "$distro_or_config"]
ENTRYPOINT ["llama", "stack", "run", "$distro_or_config"]
EOF
fi

View file

@ -32,7 +32,7 @@ from llama_stack.providers.utils.sqlstore.sqlstore import (
sqlstore_impl,
)
logger = get_logger(name=__name__, category="openai::conversations")
logger = get_logger(name=__name__, category="openai_conversations")
class ConversationServiceConfig(BaseModel):

View file

@ -243,6 +243,7 @@ def get_external_providers_from_module(
spec = module.get_provider_spec()
else:
# pass in a partially filled out provider spec to satisfy the registry -- knowing we will be overwriting it later upon build and run
# in the case we are building we CANNOT import this module of course because it has not been installed.
spec = ProviderSpec(
api=Api(provider_api),
provider_type=provider.provider_type,
@ -251,9 +252,20 @@ def get_external_providers_from_module(
config_class="",
)
provider_type = provider.provider_type
# in the case we are building we CANNOT import this module of course because it has not been installed.
# return a partially filled out spec that the build script will populate.
registry[Api(provider_api)][provider_type] = spec
if isinstance(spec, list):
# optionally allow people to pass inline and remote provider specs as a returned list.
# with the old method, users could pass in directories of specs using overlapping code
# we want to ensure we preserve that flexibility in this method.
logger.info(
f"Detected a list of external provider specs from {provider.module} adding all to the registry"
)
for provider_spec in spec:
if provider_spec.provider_type != provider.provider_type:
continue
logger.info(f"Adding {provider.provider_type} to registry")
registry[Api(provider_api)][provider.provider_type] = provider_spec
else:
registry[Api(provider_api)][provider_type] = spec
except ModuleNotFoundError as exc:
raise ValueError(
"get_provider_spec not found. If specifying an external provider via `module` in the Provider spec, the Provider must have the `provider.get_provider_spec` module available"

View file

@ -611,7 +611,7 @@ class InferenceRouter(Inference):
completion_text += "".join(choice_data["content_parts"])
# Add metrics to the chunk
if self.telemetry and chunk.usage:
if self.telemetry and hasattr(chunk, "usage") and chunk.usage:
metrics = self._construct_metrics(
prompt_tokens=chunk.usage.prompt_tokens,
completion_tokens=chunk.usage.completion_tokens,

View file

@ -33,7 +33,7 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
try:
models = await provider.list_models()
except Exception as e:
logger.warning(f"Model refresh failed for provider {provider_id}: {e}")
logger.debug(f"Model refresh failed for provider {provider_id}: {e}")
continue
self.listed_providers.add(provider_id)
@ -67,6 +67,19 @@ class ModelsRoutingTable(CommonRoutingTableImpl, Models):
raise ValueError(f"Provider {model.provider_id} not found in the routing table")
return self.impls_by_provider_id[model.provider_id]
async def has_model(self, model_id: str) -> bool:
"""
Check if a model exists in the routing table.
:param model_id: The model identifier to check
:return: True if the model exists, False otherwise
"""
try:
await lookup_model(self, model_id)
return True
except ModelNotFoundError:
return False
async def register_model(
self,
model_id: str,

View file

@ -245,3 +245,65 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
vector_store_id=vector_store_id,
file_id=file_id,
)
async def openai_create_vector_store_file_batch(
self,
vector_store_id: str,
file_ids: list[str],
attributes: dict[str, Any] | None = None,
chunking_strategy: Any | None = None,
):
await self.assert_action_allowed("update", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_create_vector_store_file_batch(
vector_store_id=vector_store_id,
file_ids=file_ids,
attributes=attributes,
chunking_strategy=chunking_strategy,
)
async def openai_retrieve_vector_store_file_batch(
self,
batch_id: str,
vector_store_id: str,
):
await self.assert_action_allowed("read", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_retrieve_vector_store_file_batch(
batch_id=batch_id,
vector_store_id=vector_store_id,
)
async def openai_list_files_in_vector_store_file_batch(
self,
batch_id: str,
vector_store_id: str,
after: str | None = None,
before: str | None = None,
filter: str | None = None,
limit: int | None = 20,
order: str | None = "desc",
):
await self.assert_action_allowed("read", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_list_files_in_vector_store_file_batch(
batch_id=batch_id,
vector_store_id=vector_store_id,
after=after,
before=before,
filter=filter,
limit=limit,
order=order,
)
async def openai_cancel_vector_store_file_batch(
self,
batch_id: str,
vector_store_id: str,
):
await self.assert_action_allowed("update", "vector_db", vector_store_id)
provider = await self.get_provider_impl(vector_store_id)
return await provider.openai_cancel_vector_store_file_batch(
batch_id=batch_id,
vector_store_id=vector_store_id,
)
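These routing-table methods mirror OpenAI's vector-store file-batch endpoints, so an OpenAI-compatible client can drive them end to end. A sketch; recent `openai` releases expose `vector_stores.file_batches` at the top level (older ones nest it under `client.beta`), and the ids are placeholders:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="none")

batch = client.vector_stores.file_batches.create(
    vector_store_id="vs_123",
    file_ids=["file-abc", "file-def"],
)
status = client.vector_stores.file_batches.retrieve(
    vector_store_id="vs_123", batch_id=batch.id,
)
files = client.vector_stores.file_batches.list_files(
    vector_store_id="vs_123", batch_id=batch.id, limit=20,
)
# Cancellation maps to the "update" permission checked above:
# client.vector_stores.file_batches.cancel(vector_store_id="vs_123", batch_id=batch.id)
```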

View file

@ -4,7 +4,6 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import argparse
import asyncio
import concurrent.futures
import functools
@ -12,7 +11,6 @@ import inspect
import json
import logging # allow-direct-logging
import os
import ssl
import sys
import traceback
import warnings
@ -35,7 +33,6 @@ from pydantic import BaseModel, ValidationError
from llama_stack.apis.common.errors import ConflictError, ResourceNotFoundError
from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.cli.utils import add_config_distro_args, get_config_from_args
from llama_stack.core.access_control.access_control import AccessDeniedError
from llama_stack.core.datatypes import (
AuthenticationRequiredError,
@ -55,7 +52,6 @@ from llama_stack.core.stack import (
Stack,
cast_image_name_to_string,
replace_env_vars,
validate_env_pair,
)
from llama_stack.core.utils.config import redact_sensitive_fields
from llama_stack.core.utils.config_resolution import Mode, resolve_config_or_distro
@ -333,23 +329,18 @@ class ClientVersionMiddleware:
return await self.app(scope, receive, send)
def create_app(
config_file: str | None = None,
env_vars: list[str] | None = None,
) -> StackApp:
def create_app() -> StackApp:
"""Create and configure the FastAPI application.
Args:
config_file: Path to config file. If None, uses LLAMA_STACK_CONFIG env var or default resolution.
env_vars: List of environment variables in KEY=value format.
disable_version_check: Whether to disable version checking. If None, uses LLAMA_STACK_DISABLE_VERSION_CHECK env var.
This factory function reads configuration from environment variables:
- LLAMA_STACK_CONFIG: Path to config file (required)
Returns:
Configured StackApp instance.
"""
config_file = config_file or os.getenv("LLAMA_STACK_CONFIG")
config_file = os.getenv("LLAMA_STACK_CONFIG")
if config_file is None:
raise ValueError("No config file provided and LLAMA_STACK_CONFIG env var is not set")
raise ValueError("LLAMA_STACK_CONFIG environment variable is required")
config_file = resolve_config_or_distro(config_file, Mode.RUN)
@ -361,16 +352,6 @@ def create_app(
logger_config = LoggingConfig(**cfg)
logger = get_logger(name=__name__, category="core::server", config=logger_config)
if env_vars:
for env_pair in env_vars:
try:
key, value = validate_env_pair(env_pair)
logger.info(f"Setting environment variable {key} => {value}")
os.environ[key] = value
except ValueError as e:
logger.error(f"Error: {str(e)}")
raise ValueError(f"Invalid environment variable format: {env_pair}") from e
config = replace_env_vars(config_contents)
config = StackRunConfig(**cast_image_name_to_string(config))
@ -494,101 +475,6 @@ def create_app(
return app
def main(args: argparse.Namespace | None = None):
"""Start the LlamaStack server."""
parser = argparse.ArgumentParser(description="Start the LlamaStack server.")
add_config_distro_args(parser)
parser.add_argument(
"--port",
type=int,
default=int(os.getenv("LLAMA_STACK_PORT", 8321)),
help="Port to listen on",
)
parser.add_argument(
"--env",
action="append",
help="Environment variables in KEY=value format. Can be specified multiple times.",
)
# Determine whether the server args are being passed by the "run" command, if this is the case
# the args will be passed as a Namespace object to the main function, otherwise they will be
# parsed from the command line
if args is None:
args = parser.parse_args()
config_or_distro = get_config_from_args(args)
try:
app = create_app(
config_file=config_or_distro,
env_vars=args.env,
)
except Exception as e:
logger.error(f"Error creating app: {str(e)}")
sys.exit(1)
config_file = resolve_config_or_distro(config_or_distro, Mode.RUN)
with open(config_file) as fp:
config_contents = yaml.safe_load(fp)
if isinstance(config_contents, dict) and (cfg := config_contents.get("logging_config")):
logger_config = LoggingConfig(**cfg)
else:
logger_config = None
config = StackRunConfig(**cast_image_name_to_string(replace_env_vars(config_contents)))
import uvicorn
# Configure SSL if certificates are provided
port = args.port or config.server.port
ssl_config = None
keyfile = config.server.tls_keyfile
certfile = config.server.tls_certfile
if keyfile and certfile:
ssl_config = {
"ssl_keyfile": keyfile,
"ssl_certfile": certfile,
}
if config.server.tls_cafile:
ssl_config["ssl_ca_certs"] = config.server.tls_cafile
ssl_config["ssl_cert_reqs"] = ssl.CERT_REQUIRED
logger.info(
f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}\n CA: {config.server.tls_cafile}"
)
else:
logger.info(f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}")
listen_host = config.server.host or ["::", "0.0.0.0"]
logger.info(f"Listening on {listen_host}:{port}")
uvicorn_config = {
"app": app,
"host": listen_host,
"port": port,
"lifespan": "on",
"log_level": logger.getEffectiveLevel(),
"log_config": logger_config,
}
if ssl_config:
uvicorn_config.update(ssl_config)
# We need to catch KeyboardInterrupt because uvicorn's signal handling
# re-raises SIGINT signals using signal.raise_signal(), which Python
# converts to KeyboardInterrupt. Without this catch, we'd get a confusing
# stack trace when using Ctrl+C or kill -2 (SIGINT).
# SIGTERM (kill -15) works fine without this because Python doesn't
# have a default handler for it.
#
# Another approach would be to ignore SIGINT entirely - let uvicorn handle it through its own
# signal handling but this is quite intrusive and not worth the effort.
try:
asyncio.run(uvicorn.Server(uvicorn.Config(**uvicorn_config)).serve())
except (KeyboardInterrupt, SystemExit):
logger.info("Received interrupt signal, shutting down gracefully...")
def _log_run_config(run_config: StackRunConfig):
"""Logs the run config with redacted fields and disabled providers removed."""
logger.info("Run configuration:")
@ -615,7 +501,3 @@ def remove_disabled_providers(obj):
return [item for item in (remove_disabled_providers(i) for i in obj) if item is not None]
else:
return obj
if __name__ == "__main__":
main()

View file

@ -274,22 +274,6 @@ def cast_image_name_to_string(config_dict: dict[str, Any]) -> dict[str, Any]:
return config_dict
def validate_env_pair(env_pair: str) -> tuple[str, str]:
"""Validate and split an environment variable key-value pair."""
try:
key, value = env_pair.split("=", 1)
key = key.strip()
if not key:
raise ValueError(f"Empty key in environment variable pair: {env_pair}")
if not all(c.isalnum() or c == "_" for c in key):
raise ValueError(f"Key must contain only alphanumeric characters and underscores: {key}")
return key, value
except ValueError as e:
raise ValueError(
f"Invalid environment variable format '{env_pair}': {str(e)}. Expected format: KEY=value"
) from e
def add_internal_implementations(impls: dict[Api, Any], run_config: StackRunConfig) -> None:
"""Add internal implementations (inspect and providers) to the implementations dictionary.

View file

@ -25,7 +25,7 @@ error_handler() {
trap 'error_handler ${LINENO}' ERR
if [ $# -lt 3 ]; then
echo "Usage: $0 <env_type> <env_path_or_name> <port> [--config <yaml_config>] [--env KEY=VALUE]..."
echo "Usage: $0 <env_type> <env_path_or_name> <port> [--config <yaml_config>]"
exit 1
fi
@ -43,7 +43,6 @@ SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
# Initialize variables
yaml_config=""
env_vars=""
other_args=""
# Process remaining arguments
@ -58,15 +57,6 @@ while [[ $# -gt 0 ]]; do
exit 1
fi
;;
--env)
if [[ -n "$2" ]]; then
env_vars="$env_vars --env $2"
shift 2
else
echo -e "${RED}Error: --env requires a KEY=VALUE argument${NC}" >&2
exit 1
fi
;;
*)
other_args="$other_args $1"
shift
@ -116,10 +106,9 @@ if [[ "$env_type" == "venv" ]]; then
yaml_config_arg=""
fi
$PYTHON_BINARY -m llama_stack.core.server.server \
llama stack run \
$yaml_config_arg \
--port "$port" \
$env_vars \
$other_args
elif [[ "$env_type" == "container" ]]; then
echo -e "${RED}Warning: Llama Stack no longer supports running Containers via the 'llama stack run' command.${NC}"

View file

@ -98,7 +98,10 @@ class DiskDistributionRegistry(DistributionRegistry):
existing_obj = await self.get(obj.type, obj.identifier)
# dont register if the object's providerid already exists
if existing_obj and existing_obj.provider_id == obj.provider_id:
return False
raise ValueError(
f"Provider '{obj.provider_id}' is already registered."
f"Unregister the existing provider first before registering it again."
)
await self.kvstore.set(
KEY_FORMAT.format(type=obj.type, identifier=obj.identifier),

View file

@ -117,11 +117,11 @@ docker run -it \
# NOTE: mount the llama-stack directory if testing local changes else not needed
-v $HOME/git/llama-stack:/app/llama-stack-source \
# localhost/distribution-dell:dev if building / testing locally
-e INFERENCE_MODEL=$INFERENCE_MODEL \
-e DEH_URL=$DEH_URL \
-e CHROMA_URL=$CHROMA_URL \
llamastack/distribution-{{ name }}\
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env CHROMA_URL=$CHROMA_URL
--port $LLAMA_STACK_PORT
```
@ -142,14 +142,14 @@ docker run \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v $HOME/.llama:/root/.llama \
-v ./llama_stack/distributions/tgi/run-with-safety.yaml:/root/my-run.yaml \
-e INFERENCE_MODEL=$INFERENCE_MODEL \
-e DEH_URL=$DEH_URL \
-e SAFETY_MODEL=$SAFETY_MODEL \
-e DEH_SAFETY_URL=$DEH_SAFETY_URL \
-e CHROMA_URL=$CHROMA_URL \
llamastack/distribution-{{ name }} \
--config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env DEH_SAFETY_URL=$DEH_SAFETY_URL \
--env CHROMA_URL=$CHROMA_URL
--port $LLAMA_STACK_PORT
```
### Via Conda
@ -158,21 +158,21 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
```bash
llama stack build --distro {{ name }} --image-type conda
llama stack run {{ name }}
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env CHROMA_URL=$CHROMA_URL
INFERENCE_MODEL=$INFERENCE_MODEL \
DEH_URL=$DEH_URL \
CHROMA_URL=$CHROMA_URL \
llama stack run {{ name }} \
--port $LLAMA_STACK_PORT
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
INFERENCE_MODEL=$INFERENCE_MODEL \
DEH_URL=$DEH_URL \
SAFETY_MODEL=$SAFETY_MODEL \
DEH_SAFETY_URL=$DEH_SAFETY_URL \
CHROMA_URL=$CHROMA_URL \
llama stack run ./run-with-safety.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env DEH_SAFETY_URL=$DEH_SAFETY_URL \
--env CHROMA_URL=$CHROMA_URL
--port $LLAMA_STACK_PORT
```

View file

@ -72,9 +72,9 @@ docker run \
--gpu all \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
llamastack/distribution-{{ name }} \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
--port $LLAMA_STACK_PORT
```
If you are using Llama Stack Safety / Shield APIs, use:
@ -86,10 +86,10 @@ docker run \
--gpu all \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-e SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
llamastack/distribution-{{ name }} \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
--port $LLAMA_STACK_PORT
```
### Via venv
@ -98,16 +98,16 @@ Make sure you have done `uv pip install llama-stack` and have the Llama Stack CL
```bash
llama stack build --distro {{ name }} --image-type venv
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
llama stack run distributions/{{ name }}/run.yaml \
--port 8321 \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
--port 8321
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
llama stack run distributions/{{ name }}/run-with-safety.yaml \
--port 8321 \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
--port 8321
```

View file

@ -118,10 +118,10 @@ docker run \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
-e NVIDIA_API_KEY=$NVIDIA_API_KEY \
llamastack/distribution-{{ name }} \
--config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
--port $LLAMA_STACK_PORT
```
### Via venv
@ -131,10 +131,10 @@ If you've set up your local development environment, you can also build the imag
```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
llama stack build --distro nvidia --image-type venv
NVIDIA_API_KEY=$NVIDIA_API_KEY \
INFERENCE_MODEL=$INFERENCE_MODEL \
llama stack run ./run.yaml \
--port 8321 \
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
--env INFERENCE_MODEL=$INFERENCE_MODEL
--port 8321
```
## Example Notebooks

View file

@ -3,3 +3,5 @@
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .watsonx import get_distribution_template # noqa: F401

View file

@ -3,44 +3,33 @@ distribution_spec:
description: Use watsonx for running LLM inference
providers:
inference:
- provider_id: watsonx
provider_type: remote::watsonx
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
- provider_type: remote::watsonx
- provider_type: inline::sentence-transformers
vector_io:
- provider_id: faiss
provider_type: inline::faiss
- provider_type: inline::faiss
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
- provider_type: inline::llama-guard
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
- provider_type: inline::meta-reference
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
- provider_type: inline::meta-reference
eval:
- provider_id: meta-reference
provider_type: inline::meta-reference
- provider_type: inline::meta-reference
datasetio:
- provider_id: huggingface
provider_type: remote::huggingface
- provider_id: localfs
provider_type: inline::localfs
- provider_type: remote::huggingface
- provider_type: inline::localfs
scoring:
- provider_id: basic
provider_type: inline::basic
- provider_id: llm-as-judge
provider_type: inline::llm-as-judge
- provider_id: braintrust
provider_type: inline::braintrust
- provider_type: inline::basic
- provider_type: inline::llm-as-judge
- provider_type: inline::braintrust
tool_runtime:
- provider_type: remote::brave-search
- provider_type: remote::tavily-search
- provider_type: inline::rag-runtime
- provider_type: remote::model-context-protocol
files:
- provider_type: inline::localfs
image_type: venv
additional_pip_packages:
- aiosqlite
- sqlalchemy[asyncio]
- aiosqlite
- aiosqlite

View file

@ -4,13 +4,13 @@ apis:
- agents
- datasetio
- eval
- files
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
- files
providers:
inference:
- provider_id: watsonx
@ -19,8 +19,6 @@ providers:
url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
api_key: ${env.WATSONX_API_KEY:=}
project_id: ${env.WATSONX_PROJECT_ID:=}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
vector_io:
- provider_id: faiss
provider_type: inline::faiss
@ -48,7 +46,7 @@ providers:
provider_type: inline::meta-reference
config:
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console,sqlite}
sinks: ${env.TELEMETRY_SINKS:=sqlite}
sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/watsonx}/trace_store.db
otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=}
eval:
@ -109,102 +107,7 @@ metadata_store:
inference_store:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/watsonx}/inference_store.db
models:
- metadata: {}
model_id: meta-llama/llama-3-3-70b-instruct
provider_id: watsonx
provider_model_id: meta-llama/llama-3-3-70b-instruct
model_type: llm
- metadata: {}
model_id: meta-llama/Llama-3.3-70B-Instruct
provider_id: watsonx
provider_model_id: meta-llama/llama-3-3-70b-instruct
model_type: llm
- metadata: {}
model_id: meta-llama/llama-2-13b-chat
provider_id: watsonx
provider_model_id: meta-llama/llama-2-13b-chat
model_type: llm
- metadata: {}
model_id: meta-llama/Llama-2-13b
provider_id: watsonx
provider_model_id: meta-llama/llama-2-13b-chat
model_type: llm
- metadata: {}
model_id: meta-llama/llama-3-1-70b-instruct
provider_id: watsonx
provider_model_id: meta-llama/llama-3-1-70b-instruct
model_type: llm
- metadata: {}
model_id: meta-llama/Llama-3.1-70B-Instruct
provider_id: watsonx
provider_model_id: meta-llama/llama-3-1-70b-instruct
model_type: llm
- metadata: {}
model_id: meta-llama/llama-3-1-8b-instruct
provider_id: watsonx
provider_model_id: meta-llama/llama-3-1-8b-instruct
model_type: llm
- metadata: {}
model_id: meta-llama/Llama-3.1-8B-Instruct
provider_id: watsonx
provider_model_id: meta-llama/llama-3-1-8b-instruct
model_type: llm
- metadata: {}
model_id: meta-llama/llama-3-2-11b-vision-instruct
provider_id: watsonx
provider_model_id: meta-llama/llama-3-2-11b-vision-instruct
model_type: llm
- metadata: {}
model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
provider_id: watsonx
provider_model_id: meta-llama/llama-3-2-11b-vision-instruct
model_type: llm
- metadata: {}
model_id: meta-llama/llama-3-2-1b-instruct
provider_id: watsonx
provider_model_id: meta-llama/llama-3-2-1b-instruct
model_type: llm
- metadata: {}
model_id: meta-llama/Llama-3.2-1B-Instruct
provider_id: watsonx
provider_model_id: meta-llama/llama-3-2-1b-instruct
model_type: llm
- metadata: {}
model_id: meta-llama/llama-3-2-3b-instruct
provider_id: watsonx
provider_model_id: meta-llama/llama-3-2-3b-instruct
model_type: llm
- metadata: {}
model_id: meta-llama/Llama-3.2-3B-Instruct
provider_id: watsonx
provider_model_id: meta-llama/llama-3-2-3b-instruct
model_type: llm
- metadata: {}
model_id: meta-llama/llama-3-2-90b-vision-instruct
provider_id: watsonx
provider_model_id: meta-llama/llama-3-2-90b-vision-instruct
model_type: llm
- metadata: {}
model_id: meta-llama/Llama-3.2-90B-Vision-Instruct
provider_id: watsonx
provider_model_id: meta-llama/llama-3-2-90b-vision-instruct
model_type: llm
- metadata: {}
model_id: meta-llama/llama-guard-3-11b-vision
provider_id: watsonx
provider_model_id: meta-llama/llama-guard-3-11b-vision
model_type: llm
- metadata: {}
model_id: meta-llama/Llama-Guard-3-11B-Vision
provider_id: watsonx
provider_model_id: meta-llama/llama-guard-3-11b-vision
model_type: llm
- metadata:
embedding_dimension: 384
model_id: all-MiniLM-L6-v2
provider_id: sentence-transformers
model_type: embedding
models: []
shields: []
vector_dbs: []
datasets: []

View file

@ -4,17 +4,11 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from llama_stack.apis.models import ModelType
from llama_stack.core.datatypes import BuildProvider, ModelInput, Provider, ToolGroupInput
from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings, get_model_registry
from llama_stack.core.datatypes import BuildProvider, Provider, ToolGroupInput
from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings
from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
from llama_stack.providers.inline.inference.sentence_transformers import (
SentenceTransformersInferenceConfig,
)
from llama_stack.providers.remote.inference.watsonx import WatsonXConfig
from llama_stack.providers.remote.inference.watsonx.models import MODEL_ENTRIES
def get_distribution_template(name: str = "watsonx") -> DistributionTemplate:
@ -52,15 +46,6 @@ def get_distribution_template(name: str = "watsonx") -> DistributionTemplate:
config=WatsonXConfig.sample_run_config(),
)
embedding_provider = Provider(
provider_id="sentence-transformers",
provider_type="inline::sentence-transformers",
config=SentenceTransformersInferenceConfig.sample_run_config(),
)
available_models = {
"watsonx": MODEL_ENTRIES,
}
default_tool_groups = [
ToolGroupInput(
toolgroup_id="builtin::websearch",
@ -72,36 +57,25 @@ def get_distribution_template(name: str = "watsonx") -> DistributionTemplate:
),
]
embedding_model = ModelInput(
model_id="all-MiniLM-L6-v2",
provider_id="sentence-transformers",
model_type=ModelType.embedding,
metadata={
"embedding_dimension": 384,
},
)
files_provider = Provider(
provider_id="meta-reference-files",
provider_type="inline::localfs",
config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"),
)
default_models, _ = get_model_registry(available_models)
return DistributionTemplate(
name=name,
distro_type="remote_hosted",
description="Use watsonx for running LLM inference",
container_image=None,
template_path=Path(__file__).parent / "doc_template.md",
template_path=None,
providers=providers,
available_models_by_provider=available_models,
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider, embedding_provider],
"inference": [inference_provider],
"files": [files_provider],
},
default_models=default_models + [embedding_model],
default_models=[],
default_tool_groups=default_tool_groups,
),
},

View file

@ -31,12 +31,17 @@ CATEGORIES = [
"client",
"telemetry",
"openai_responses",
"openai_conversations",
"testing",
"providers",
"models",
"files",
"vector_io",
"tool_runtime",
"cli",
"post_training",
"scoring",
"tests",
]
UNCATEGORIZED = "uncategorized"
@ -128,7 +133,10 @@ def strip_rich_markup(text):
class CustomRichHandler(RichHandler):
def __init__(self, *args, **kwargs):
kwargs["console"] = Console()
# Set a reasonable default width for console output, especially when redirected to files
console_width = int(os.environ.get("LLAMA_STACK_LOG_WIDTH", "120"))
# Don't force terminal codes to avoid ANSI escape codes in log files
kwargs["console"] = Console(width=console_width)
super().__init__(*args, **kwargs)
def emit(self, record):
@ -261,11 +269,12 @@ def get_logger(
if root_category in _category_levels:
log_level = _category_levels[root_category]
else:
log_level = _category_levels.get("root", DEFAULT_LOG_LEVEL)
if category != UNCATEGORIZED:
logging.warning(
f"Unknown logging category: {category}. Falling back to default 'root' level: {log_level}"
raise ValueError(
f"Unknown logging category: {category}. To resolve, choose a valid category from the CATEGORIES list "
f"or add it to the CATEGORIES list. Available categories: {CATEGORIES}"
)
log_level = _category_levels.get("root", DEFAULT_LOG_LEVEL)
logger.setLevel(log_level)
return logging.LoggerAdapter(logger, {"category": category})
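Two practical consequences of this hunk: the console width is now tunable for file-redirected logs, and unknown categories fail fast instead of warning. A small sketch under those assumptions:

```python
import os

# Keep long lines intact when logs are redirected to a file
# (the CI workflow above sets this to 200 for the same reason).
os.environ["LLAMA_STACK_LOG_WIDTH"] = "200"

from llama_stack.log import get_logger

log = get_logger(__name__, category="models")   # must be in CATEGORIES
# get_logger(__name__, category="modelz")       # would now raise ValueError
```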

View file

@ -11,19 +11,13 @@
# top-level folder for each specific model found within the models/ directory at
# the top-level of this source tree.
import json
import textwrap
from pathlib import Path
from pydantic import BaseModel, Field
from llama_stack.models.llama.datatypes import (
RawContent,
RawMediaItem,
RawMessage,
RawTextItem,
StopReason,
ToolCall,
ToolPromptFormat,
)
from llama_stack.models.llama.llama4.tokenizer import Tokenizer
@ -175,25 +169,6 @@ def llama3_1_builtin_code_interpreter_dialog(tool_prompt_format=ToolPromptFormat
return messages
def llama3_1_builtin_tool_call_with_image_dialog(
tool_prompt_format=ToolPromptFormat.json,
):
this_dir = Path(__file__).parent
with open(this_dir / "llama3/dog.jpg", "rb") as f:
img = f.read()
interface = LLama31Interface(tool_prompt_format)
messages = interface.system_messages(**system_message_builtin_tools_only())
messages += interface.user_message(content=[RawMediaItem(data=img), RawTextItem(text="What is this dog breed?")])
messages += interface.assistant_response_messages(
"Based on the description of the dog in the image, it appears to be a small breed dog, possibly a terrier mix",
StopReason.end_of_turn,
)
messages += interface.user_message("Search the web for some food recommendations for the indentified breed")
return messages
def llama3_1_custom_tool_call_dialog(tool_prompt_format=ToolPromptFormat.json):
interface = LLama31Interface(tool_prompt_format)
@ -202,35 +177,6 @@ def llama3_1_custom_tool_call_dialog(tool_prompt_format=ToolPromptFormat.json):
return messages
def llama3_1_e2e_tool_call_dialog(tool_prompt_format=ToolPromptFormat.json):
tool_response = json.dumps(["great song1", "awesome song2", "cool song3"])
interface = LLama31Interface(tool_prompt_format)
messages = interface.system_messages(**system_message_custom_tools_only())
messages += interface.user_message(content="Use tools to get latest trending songs")
messages.append(
RawMessage(
role="assistant",
content="",
stop_reason=StopReason.end_of_message,
tool_calls=[
ToolCall(
call_id="call_id",
tool_name="trending_songs",
arguments={"n": "10", "genre": "latest"},
)
],
),
)
messages.append(
RawMessage(
role="assistant",
content=tool_response,
)
)
return messages
def llama3_2_user_assistant_conversation():
return UseCase(
title="User and assistant conversation",

View file

@ -9,7 +9,7 @@ from pathlib import Path
from llama_stack.log import get_logger
logger = get_logger(__name__, "tokenizer_utils")
logger = get_logger(__name__, "models")
def load_bpe_file(model_path: Path) -> dict[bytes, int]:

View file

@ -22,6 +22,7 @@ async def get_provider_impl(config: MetaReferenceAgentsImplConfig, deps: dict[Ap
deps[Api.tool_runtime],
deps[Api.tool_groups],
policy,
Api.telemetry in deps,
)
await impl.initialize()
return impl

View file

@ -7,8 +7,6 @@
import copy
import json
import re
import secrets
import string
import uuid
import warnings
from collections.abc import AsyncGenerator
@ -84,11 +82,6 @@ from llama_stack.providers.utils.telemetry import tracing
from .persistence import AgentPersistence
from .safety import SafetyException, ShieldRunnerMixin
def make_random_string(length: int = 8):
return "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(length))
TOOLS_ATTACHMENT_KEY_REGEX = re.compile(r"__tools_attachment__=(\{.*?\})")
MEMORY_QUERY_TOOL = "knowledge_search"
WEB_SEARCH_TOOL = "web_search"
@ -110,6 +103,7 @@ class ChatAgent(ShieldRunnerMixin):
persistence_store: KVStore,
created_at: str,
policy: list[AccessRule],
telemetry_enabled: bool = False,
):
self.agent_id = agent_id
self.agent_config = agent_config
@ -120,6 +114,7 @@ class ChatAgent(ShieldRunnerMixin):
self.tool_runtime_api = tool_runtime_api
self.tool_groups_api = tool_groups_api
self.created_at = created_at
self.telemetry_enabled = telemetry_enabled
ShieldRunnerMixin.__init__(
self,
@ -188,28 +183,30 @@ class ChatAgent(ShieldRunnerMixin):
async def create_and_execute_turn(self, request: AgentTurnCreateRequest) -> AsyncGenerator:
turn_id = str(uuid.uuid4())
span = tracing.get_current_span()
if span:
span.set_attribute("session_id", request.session_id)
span.set_attribute("agent_id", self.agent_id)
span.set_attribute("request", request.model_dump_json())
span.set_attribute("turn_id", turn_id)
if self.agent_config.name:
span.set_attribute("agent_name", self.agent_config.name)
if self.telemetry_enabled:
span = tracing.get_current_span()
if span is not None:
span.set_attribute("session_id", request.session_id)
span.set_attribute("agent_id", self.agent_id)
span.set_attribute("request", request.model_dump_json())
span.set_attribute("turn_id", turn_id)
if self.agent_config.name:
span.set_attribute("agent_name", self.agent_config.name)
await self._initialize_tools(request.toolgroups)
async for chunk in self._run_turn(request, turn_id):
yield chunk
async def resume_turn(self, request: AgentTurnResumeRequest) -> AsyncGenerator:
span = tracing.get_current_span()
if span:
span.set_attribute("agent_id", self.agent_id)
span.set_attribute("session_id", request.session_id)
span.set_attribute("request", request.model_dump_json())
span.set_attribute("turn_id", request.turn_id)
if self.agent_config.name:
span.set_attribute("agent_name", self.agent_config.name)
if self.telemetry_enabled:
span = tracing.get_current_span()
if span is not None:
span.set_attribute("agent_id", self.agent_id)
span.set_attribute("session_id", request.session_id)
span.set_attribute("request", request.model_dump_json())
span.set_attribute("turn_id", request.turn_id)
if self.agent_config.name:
span.set_attribute("agent_name", self.agent_config.name)
await self._initialize_tools()
async for chunk in self._run_turn(request):
@ -395,9 +392,12 @@ class ChatAgent(ShieldRunnerMixin):
touchpoint: str,
) -> AsyncGenerator:
async with tracing.span("run_shields") as span:
span.set_attribute("input", [m.model_dump_json() for m in messages])
if self.telemetry_enabled and span is not None:
span.set_attribute("input", [m.model_dump_json() for m in messages])
if len(shields) == 0:
span.set_attribute("output", "no shields")
if len(shields) == 0:
span.set_attribute("output", "no shields")
return
step_id = str(uuid.uuid4())
@ -430,7 +430,8 @@ class ChatAgent(ShieldRunnerMixin):
)
)
)
span.set_attribute("output", e.violation.model_dump_json())
if self.telemetry_enabled and span is not None:
span.set_attribute("output", e.violation.model_dump_json())
yield CompletionMessage(
content=str(e),
@ -453,7 +454,8 @@ class ChatAgent(ShieldRunnerMixin):
)
)
)
span.set_attribute("output", "no violations")
if self.telemetry_enabled and span is not None:
span.set_attribute("output", "no violations")
async def _run(
self,
@ -518,8 +520,9 @@ class ChatAgent(ShieldRunnerMixin):
stop_reason: StopReason | None = None
async with tracing.span("inference") as span:
if self.agent_config.name:
span.set_attribute("agent_name", self.agent_config.name)
if self.telemetry_enabled and span is not None:
if self.agent_config.name:
span.set_attribute("agent_name", self.agent_config.name)
def _serialize_nested(value):
"""Recursively serialize nested Pydantic models to dicts."""
@ -637,18 +640,19 @@ class ChatAgent(ShieldRunnerMixin):
else:
raise ValueError(f"Unexpected delta type {type(delta)}")
span.set_attribute("stop_reason", stop_reason or StopReason.end_of_turn)
span.set_attribute(
"input",
json.dumps([json.loads(m.model_dump_json()) for m in input_messages]),
)
output_attr = json.dumps(
{
"content": content,
"tool_calls": [json.loads(t.model_dump_json()) for t in tool_calls],
}
)
span.set_attribute("output", output_attr)
if self.telemetry_enabled and span is not None:
span.set_attribute("stop_reason", stop_reason or StopReason.end_of_turn)
span.set_attribute(
"input",
json.dumps([json.loads(m.model_dump_json()) for m in input_messages]),
)
output_attr = json.dumps(
{
"content": content,
"tool_calls": [json.loads(t.model_dump_json()) for t in tool_calls],
}
)
span.set_attribute("output", output_attr)
n_iter += 1
await self.storage.set_num_infer_iters_in_turn(session_id, turn_id, n_iter)
@ -756,7 +760,9 @@ class ChatAgent(ShieldRunnerMixin):
{
"tool_name": tool_call.tool_name,
"input": message.model_dump_json(),
},
}
if self.telemetry_enabled
else {},
) as span:
tool_execution_start_time = datetime.now(UTC).isoformat()
tool_result = await self.execute_tool_call_maybe(
@ -771,7 +777,8 @@ class ChatAgent(ShieldRunnerMixin):
call_id=tool_call.call_id,
content=tool_result.content,
)
span.set_attribute("output", result_message.model_dump_json())
if self.telemetry_enabled and span is not None:
span.set_attribute("output", result_message.model_dump_json())
# Store tool execution step
tool_execution_step = ToolExecutionStep(

View file

@ -64,6 +64,7 @@ class MetaReferenceAgentsImpl(Agents):
tool_runtime_api: ToolRuntime,
tool_groups_api: ToolGroups,
policy: list[AccessRule],
telemetry_enabled: bool = False,
):
self.config = config
self.inference_api = inference_api
@ -71,6 +72,7 @@ class MetaReferenceAgentsImpl(Agents):
self.safety_api = safety_api
self.tool_runtime_api = tool_runtime_api
self.tool_groups_api = tool_groups_api
self.telemetry_enabled = telemetry_enabled
self.in_memory_store = InmemoryKVStoreImpl()
self.openai_responses_impl: OpenAIResponsesImpl | None = None
@ -135,6 +137,7 @@ class MetaReferenceAgentsImpl(Agents):
),
created_at=agent_info.created_at,
policy=self.policy,
telemetry_enabled=self.telemetry_enabled,
)
async def create_agent_session(

View file

@ -269,7 +269,7 @@ class OpenAIResponsesImpl:
response_tools=tools,
temperature=temperature,
response_format=response_format,
inputs=input,
inputs=all_input,
)
# Create orchestrator and delegate streaming logic

View file

@ -97,6 +97,8 @@ class StreamingResponseOrchestrator:
self.mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] = {}
# Track final messages after all tool executions
self.final_messages: list[OpenAIMessageParam] = []
# mapping for annotations
self.citation_files: dict[str, str] = {}
async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]:
# Initialize output messages
@ -126,6 +128,7 @@ class StreamingResponseOrchestrator:
# Text is the default response format for chat completion so don't need to pass it
# (some providers don't support non-empty response_format when tools are present)
response_format = None if self.ctx.response_format.type == "text" else self.ctx.response_format
logger.debug(f"calling openai_chat_completion with tools: {self.ctx.chat_tools}")
completion_result = await self.inference_api.openai_chat_completion(
model=self.ctx.model,
messages=messages,
@ -160,7 +163,7 @@ class StreamingResponseOrchestrator:
# Handle choices with no tool calls
for choice in current_response.choices:
if not (choice.message.tool_calls and self.ctx.response_tools):
output_messages.append(await convert_chat_choice_to_response_message(choice))
output_messages.append(await convert_chat_choice_to_response_message(choice, self.citation_files))
# Execute tool calls and coordinate results
async for stream_event in self._coordinate_tool_execution(
@ -172,6 +175,8 @@ class StreamingResponseOrchestrator:
):
yield stream_event
messages = next_turn_messages
if not function_tool_calls and not non_function_tool_calls:
break
@ -184,9 +189,7 @@ class StreamingResponseOrchestrator:
logger.info(f"Exiting inference loop since iteration count({n_iter}) exceeds {self.max_infer_iters=}")
break
messages = next_turn_messages
self.final_messages = messages.copy() + [current_response.choices[0].message]
self.final_messages = messages.copy()
# Create final response
final_response = OpenAIResponseObject(
@ -211,6 +214,8 @@ class StreamingResponseOrchestrator:
for choice in current_response.choices:
next_turn_messages.append(choice.message)
logger.debug(f"Choice message content: {choice.message.content}")
logger.debug(f"Choice message tool_calls: {choice.message.tool_calls}")
if choice.message.tool_calls and self.ctx.response_tools:
for tool_call in choice.message.tool_calls:
@ -227,9 +232,11 @@ class StreamingResponseOrchestrator:
non_function_tool_calls.append(tool_call)
else:
logger.info(f"Approval denied for {tool_call.id} on {tool_call.function.name}")
next_turn_messages.pop()
else:
logger.info(f"Requesting approval for {tool_call.id} on {tool_call.function.name}")
approvals.append(tool_call)
next_turn_messages.pop()
else:
non_function_tool_calls.append(tool_call)
@ -470,6 +477,8 @@ class StreamingResponseOrchestrator:
tool_call_log = result.final_output_message
tool_response_message = result.final_input_message
self.sequence_number = result.sequence_number
if result.citation_files:
self.citation_files.update(result.citation_files)
if tool_call_log:
output_messages.append(tool_call_log)

View file

@ -94,7 +94,10 @@ class ToolExecutor:
# Yield the final result
yield ToolExecutionResult(
sequence_number=sequence_number, final_output_message=output_message, final_input_message=input_message
sequence_number=sequence_number,
final_output_message=output_message,
final_input_message=input_message,
citation_files=result.metadata.get("citation_files") if result and result.metadata else None,
)
async def _execute_knowledge_search_via_vector_store(
@ -129,8 +132,6 @@ class ToolExecutor:
for results in all_results:
search_results.extend(results)
# Convert search results to tool result format matching memory.py
# Format the results as interleaved content similar to memory.py
content_items = []
content_items.append(
TextContentItem(
@ -138,27 +139,58 @@ class ToolExecutor:
)
)
unique_files = set()
for i, result_item in enumerate(search_results):
chunk_text = result_item.content[0].text if result_item.content else ""
metadata_text = f"document_id: {result_item.file_id}, score: {result_item.score}"
# Get file_id from attributes if result_item.file_id is empty
file_id = result_item.file_id or (
result_item.attributes.get("document_id") if result_item.attributes else None
)
metadata_text = f"document_id: {file_id}, score: {result_item.score}"
if result_item.attributes:
metadata_text += f", attributes: {result_item.attributes}"
text_content = f"[{i + 1}] {metadata_text}\n{chunk_text}\n"
text_content = f"[{i + 1}] {metadata_text} (cite as <|{file_id}|>)\n{chunk_text}\n"
content_items.append(TextContentItem(text=text_content))
unique_files.add(file_id)
content_items.append(TextContentItem(text="END of knowledge_search tool results.\n"))
citation_instruction = ""
if unique_files:
citation_instruction = (
" Cite sources immediately at the end of sentences before punctuation, using `<|file-id|>` format (e.g., 'This is a fact <|file-Cn3MSNn72ENTiiq11Qda4A|>.'). "
"Do not add extra punctuation. Use only the file IDs provided (do not invent new ones)."
)
content_items.append(
TextContentItem(
text=f'The above results were retrieved to help answer the user\'s query: "{query}". Use them as supporting information only in answering this query.\n',
text=f'The above results were retrieved to help answer the user\'s query: "{query}". Use them as supporting information only in answering this query.{citation_instruction}\n',
)
)
# handling missing attributes for old versions
citation_files = {}
for result in search_results:
file_id = result.file_id
if not file_id and result.attributes:
file_id = result.attributes.get("document_id")
filename = result.filename
if not filename and result.attributes:
filename = result.attributes.get("filename")
if not filename:
filename = "unknown"
citation_files[file_id] = filename
return ToolInvocationResult(
content=content_items,
metadata={
"document_ids": [r.file_id for r in search_results],
"chunks": [r.content[0].text if r.content else "" for r in search_results],
"scores": [r.score for r in search_results],
"citation_files": citation_files,
},
)

View file

@ -27,6 +27,7 @@ class ToolExecutionResult(BaseModel):
sequence_number: int
final_output_message: OpenAIResponseOutput | None = None
final_input_message: OpenAIMessageParam | None = None
citation_files: dict[str, str] | None = None
@dataclass

View file

@ -4,9 +4,11 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import re
import uuid
from llama_stack.apis.agents.openai_responses import (
OpenAIResponseAnnotationFileCitation,
OpenAIResponseInput,
OpenAIResponseInputFunctionToolCallOutput,
OpenAIResponseInputMessageContent,
@ -45,7 +47,9 @@ from llama_stack.apis.inference import (
)
async def convert_chat_choice_to_response_message(choice: OpenAIChoice) -> OpenAIResponseMessage:
async def convert_chat_choice_to_response_message(
choice: OpenAIChoice, citation_files: dict[str, str] | None = None
) -> OpenAIResponseMessage:
"""Convert an OpenAI Chat Completion choice into an OpenAI Response output message."""
output_content = ""
if isinstance(choice.message.content, str):
@ -57,9 +61,11 @@ async def convert_chat_choice_to_response_message(choice: OpenAIChoice) -> OpenA
f"Llama Stack OpenAI Responses does not yet support output content type: {type(choice.message.content)}"
)
annotations, clean_text = _extract_citations_from_text(output_content, citation_files or {})
return OpenAIResponseMessage(
id=f"msg_{uuid.uuid4()}",
content=[OpenAIResponseOutputMessageContentOutputText(text=output_content)],
content=[OpenAIResponseOutputMessageContentOutputText(text=clean_text, annotations=annotations)],
status="completed",
role="assistant",
)
@ -200,6 +206,53 @@ async def get_message_type_by_role(role: str):
return role_to_type.get(role)
def _extract_citations_from_text(
text: str, citation_files: dict[str, str]
) -> tuple[list[OpenAIResponseAnnotationFileCitation], str]:
"""Extract citation markers from text and create annotations
Args:
text: The text containing citation markers like <|file-Cn3MSNn72ENTiiq11Qda4A|>
citation_files: Dictionary mapping file_id to filename
Returns:
Tuple of (annotations_list, clean_text_without_markers)
"""
file_id_regex = re.compile(r"<\|(?P<file_id>file-[A-Za-z0-9_-]+)\|>")
annotations = []
parts = []
total_len = 0
last_end = 0
for m in file_id_regex.finditer(text):
# segment before the marker
prefix = text[last_end : m.start()]
# drop one space if it exists (since marker is at sentence end)
if prefix.endswith(" "):
prefix = prefix[:-1]
parts.append(prefix)
total_len += len(prefix)
fid = m.group(1)
if fid in citation_files:
annotations.append(
OpenAIResponseAnnotationFileCitation(
file_id=fid,
filename=citation_files[fid],
index=total_len, # index points to punctuation
)
)
last_end = m.end()
parts.append(text[last_end:])
cleaned_text = "".join(parts)
return annotations, cleaned_text
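A worked example of what this function does, under the marker format used by the tool executor above:

```python
text = "Llamas are herbivores <|file-Cn3MSNn72ENTiiq11Qda4A|>."
files = {"file-Cn3MSNn72ENTiiq11Qda4A": "llamas.md"}

annotations, clean = _extract_citations_from_text(text, files)

# clean == "Llamas are herbivores."       (marker and its leading space removed)
# annotations[0].file_id == "file-Cn3MSNn72ENTiiq11Qda4A"
# annotations[0].filename == "llamas.md"
# annotations[0].index == 21              (points at the trailing '.')
```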
def is_function_tool_call(
tool_call: OpenAIChatCompletionToolCall,
tools: list[OpenAIResponseInputTool],

View file

@ -8,8 +8,6 @@ import asyncio
import base64
import io
import mimetypes
import secrets
import string
from typing import Any
import httpx
@ -52,10 +50,6 @@ from .context_retriever import generate_rag_query
log = get_logger(name=__name__, category="tool_runtime")
def make_random_string(length: int = 8):
return "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(length))
async def raw_data_from_doc(doc: RAGDocument) -> tuple[bytes, str]:
"""Get raw binary data and mime type from a RAGDocument for file upload."""
if isinstance(doc.content, URL):
@ -331,5 +325,8 @@ class MemoryToolRuntimeImpl(ToolGroupsProtocolPrivate, ToolRuntime, RAGToolRunti
return ToolInvocationResult(
content=result.content or [],
metadata=result.metadata,
metadata={
**(result.metadata or {}),
"citation_files": getattr(result, "citation_files", None),
},
)

View file

@ -200,12 +200,10 @@ class FaissIndex(EmbeddingIndex):
class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPrivate):
def __init__(self, config: FaissVectorIOConfig, inference_api: Inference, files_api: Files | None) -> None:
super().__init__(files_api=files_api, kvstore=None)
self.config = config
self.inference_api = inference_api
self.files_api = files_api
self.cache: dict[str, VectorDBWithIndex] = {}
self.kvstore: KVStore | None = None
self.openai_vector_stores: dict[str, dict[str, Any]] = {}
async def initialize(self) -> None:
self.kvstore = await kvstore_impl(self.config.kvstore)
@ -227,8 +225,8 @@ class FaissVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorDBsProtocolPr
await self.initialize_openai_vector_stores()
async def shutdown(self) -> None:
# Cleanup if needed
pass
# Clean up mixin resources (file batch tasks)
await super().shutdown()
async def health(self) -> HealthResponse:
"""

Some files were not shown because too many files have changed in this diff.