Merged from main + fixed elasticsearch_url

This commit is contained in:
Enrico Zimuel 2025-11-19 13:15:09 +01:00
commit 7034637cac
No known key found for this signature in database
GPG key ID: 6CB203F6934A69F1
594 changed files with 79447 additions and 35172 deletions

2
.github/CODEOWNERS vendored
View file

@ -2,4 +2,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1 @franciscojavierarceo
* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @leseb @bbrowning @reluctantfuturist @mattf @slekkala1 @franciscojavierarceo

View file

@ -39,6 +39,32 @@ runs:
if: ${{ inputs.setup == 'vllm' && inputs.inference-mode == 'record' }}
uses: ./.github/actions/setup-vllm
- name: Start Postgres service
if: ${{ contains(inputs.setup, 'postgres') }}
shell: bash
run: |
sudo docker rm -f postgres-ci || true
sudo docker run -d --name postgres-ci \
-e POSTGRES_USER=llamastack \
-e POSTGRES_PASSWORD=llamastack \
-e POSTGRES_DB=llamastack \
-p 5432:5432 \
postgres:16
echo "Waiting for Postgres to become ready..."
for i in {1..30}; do
if sudo docker exec postgres-ci pg_isready -U llamastack -d llamastack >/dev/null 2>&1; then
echo "Postgres is ready"
break
fi
if [ "$i" -eq 30 ]; then
echo "Postgres failed to start in time"
sudo docker logs postgres-ci || true
exit 1
fi
sleep 2
done
- name: Build Llama Stack
shell: bash
run: |

View file

@ -66,12 +66,12 @@ jobs:
run-replay-mode-tests:
needs: generate-matrix
runs-on: ubuntu-latest
name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client-type, matrix.config.setup, matrix.python-version, matrix.client-version, matrix.config.suite) }}
name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, {4})', matrix.client, matrix.config.setup, matrix.python-version, matrix.client-version, matrix.config.suite) }}
strategy:
fail-fast: false
matrix:
client-type: [library, docker, server]
client: [library, docker, server]
# Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
@ -84,6 +84,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
- name: Setup test environment
if: ${{ matrix.config.allowed_clients == null || contains(matrix.config.allowed_clients, matrix.client) }}
uses: ./.github/actions/setup-test-environment
with:
python-version: ${{ matrix.python-version }}
@ -93,11 +94,16 @@ jobs:
inference-mode: 'replay'
- name: Run tests
if: ${{ matrix.config.allowed_clients == null || contains(matrix.config.allowed_clients, matrix.client) }}
uses: ./.github/actions/run-and-record-tests
env:
OPENAI_API_KEY: dummy
with:
stack-config: ${{ matrix.client-type == 'library' && 'ci-tests' || matrix.client-type == 'server' && 'server:ci-tests' || 'docker:ci-tests' }}
stack-config: >-
${{ matrix.config.stack_config
|| (matrix.client == 'library' && 'ci-tests')
|| (matrix.client == 'server' && 'server:ci-tests')
|| 'docker:ci-tests' }}
setup: ${{ matrix.config.setup }}
inference-mode: 'replay'
suite: ${{ matrix.config.suite }}

View file

@ -53,7 +53,7 @@ jobs:
working-directory: src/llama_stack_ui
- name: Install pre-commit
run: python -m pip install pre-commit
run: python -m pip install 'pre-commit>=4.4.0'
- name: Cache pre-commit
uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4

View file

@ -30,13 +30,16 @@ jobs:
activate-environment: true
version: 0.7.6
- name: Build Llama Stack package
run: |
uv build
- name: Build Llama Stack API package
working-directory: src/llama_stack_api
run: uv build
- name: Install Llama Stack package
- name: Build Llama Stack package
run: uv build
- name: Install Llama Stack package (with api stubs from local build)
run: |
uv pip install dist/*.whl
uv pip install --find-links src/llama_stack_api/dist dist/*.whl
- name: Verify Llama Stack package
run: |
@ -45,3 +48,4 @@ jobs:
command -v llama
llama stack list-apis
llama stack list-providers inference
llama stack list-deps starter

View file

@ -1,5 +1,5 @@
exclude: 'build/'
minimum_pre_commit_version: 4.4.0
default_language_version:
python: python3.12
node: "22"
@ -42,7 +42,6 @@ repos:
hooks:
- id: ruff
args: [ --fix ]
exclude: ^src/llama_stack/strong_typing/.*$
- id: ruff-format
- repo: https://github.com/adamchainz/blacken-docs
@ -106,16 +105,16 @@ repos:
language: python
pass_filenames: false
require_serial: true
files: ^src/llama_stack/providers/.*$
files: ^src/llama_stack/providers/.*$|^scripts/run_openapi_generator.sh$
- id: openapi-codegen
name: API Spec Codegen
additional_dependencies:
- uv==0.7.8
entry: sh -c './scripts/uv-run-with-index.sh run ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
entry: sh -c './scripts/uv-run-with-index.sh run scripts/run_openapi_generator.sh'
language: python
pass_filenames: false
require_serial: true
files: ^src/llama_stack/apis/|^docs/openapi_generator/
files: ^src/llama_stack_api/.*$
- id: check-workflows-use-hashes
name: Check GitHub Actions use SHA-pinned actions
entry: ./scripts/check-workflows-use-hashes.sh
@ -200,6 +199,27 @@ repos:
echo;
exit 1;
} || true
- id: check-api-independence
name: Ensure llama_stack_api does not import llama_stack
entry: bash
language: system
pass_filenames: false
require_serial: true
always_run: true
files: ^src/llama_stack_api/.*$
args:
- -c
- |
API_DIR="src/llama_stack_api"
grep -rn --include="*.py" -E '^[^#]*(import llama_stack\b|from llama_stack\b)' "$API_DIR" 2>/dev/null && {
echo "llama_stack_api must not import llama_stack";
exit 1;
}
[ -f "$API_DIR/pyproject.toml" ] && grep -n 'llama_stack[^_]' "$API_DIR/pyproject.toml" && {
echo "llama_stack_api must not depend on llama_stack in pyproject.toml";
exit 1;
}
exit 0
ci:
autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks

View file

@ -231,7 +231,7 @@ npm run serve
If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command:
```bash
uv run ./docs/openapi_generator/run_openapi_generator.sh
uv run ./scripts/run_openapi_generator.sh
```
The generated API schema will be available in `docs/static/`. Make sure to review the changes before committing.

View file

@ -10,83 +10,6 @@
[**Quick Start**](https://llamastack.github.io/docs/getting_started/quickstart) | [**Documentation**](https://llamastack.github.io/docs) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
### ✨🎉 Llama 4 Support 🎉✨
We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.
<details>
<summary>👋 Click here to see how to run Llama 4 models on Llama Stack </summary>
\
*Note: you need an 8xH100 GPU host to run these models.*
```bash
pip install -U llama_stack
MODEL="Llama-4-Scout-17B-16E-Instruct"
# get meta url from llama.com
huggingface-cli download meta-llama/$MODEL --local-dir ~/.llama/$MODEL
# install dependencies for the distribution
llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
# start a llama stack server
INFERENCE_MODEL=meta-llama/$MODEL llama stack run meta-reference-gpu
# install client to interact with the server
pip install llama-stack-client
```
### CLI
```bash
# Run a chat completion
MODEL="Llama-4-Scout-17B-16E-Instruct"
llama-stack-client --endpoint http://localhost:8321 \
inference chat-completion \
--model-id meta-llama/$MODEL \
--message "write a haiku for meta's llama 4 models"
OpenAIChatCompletion(
...
choices=[
OpenAIChatCompletionChoice(
finish_reason='stop',
index=0,
message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
role='assistant',
content='...**Silent minds awaken,** \n**Whispers of billions of words,** \n**Reasoning breaks the night.** \n\n— \n*This haiku blends the essence of LLaMA 4\'s capabilities with nature-inspired metaphor, evoking its vast training data and transformative potential.*',
...
),
...
)
],
...
)
```
### Python SDK
```python
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url=f"http://localhost:8321")
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
prompt = "Write a haiku about coding"
print(f"User> {prompt}")
response = client.chat.completions.create(
model=model_id,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt},
],
)
print(f"Assistant> {response.choices[0].message.content}")
```
As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
</details>
### 🚀 One-Line Installer 🚀
To try Llama Stack locally, run:

View file

@ -5,4 +5,7 @@ These are the source-of-truth configuration files used to generate the Stainless
A small side note: these files use the `.yml` suffix because Stainless typically uses that suffix for its configuration files.
These files go hand-in-hand. As of now, only the `openapi.yml` file is automatically generated using the `run_openapi_generator.sh` script.
These files go hand-in-hand. Both `openapi.yml` and `config.yml` are generated by `scripts/run_openapi_generator.sh`:
- `openapi.yml` comes from the FastAPI-based generator.
- `config.yml` is rendered from `scripts/openapi_generator/stainless_config/config_data.py` so the Stainless config stays in lock-step with the spec.

View file

@ -1,20 +1,16 @@
# yaml-language-server: $schema=https://app.stainlessapi.com/config-internal.schema.json
organization:
# Name of your organization or company, used to determine the name of the client
# and headings.
name: llama-stack-client
docs: https://llama-stack.readthedocs.io/en/latest/
contact: llamastack@meta.com
security:
- {}
- BearerAuth: []
- {}
- BearerAuth: []
security_schemes:
BearerAuth:
type: http
scheme: bearer
# `targets` define the output targets and their customization options, such as
# whether to emit the Node SDK and what its package name should be.
targets:
node:
package_name: llama-stack-client
@ -40,27 +36,19 @@ targets:
options:
enable_v2: true
back_compat_use_shared_package: false
# `client_settings` define settings for the API client, such as extra constructor
# arguments (used for authentication), retry behavior, idempotency, etc.
client_settings:
default_env_prefix: LLAMA_STACK_CLIENT
opts:
api_key:
type: string
read_env: LLAMA_STACK_CLIENT_API_KEY
auth: { security_scheme: BearerAuth }
auth:
security_scheme: BearerAuth
nullable: true
# `environments` are a map of the name of the environment (e.g. "sandbox",
# "production") to the corresponding url to use.
environments:
production: http://any-hosted-llama-stack.com
# `pagination` defines [pagination schemes] which provides a template to match
# endpoints and generate next-page and auto-pagination helpers in the SDKs.
pagination:
- name: datasets_iterrows
- name: datasets_iterrows
type: offset
request:
dataset_id:
@ -80,7 +68,7 @@ pagination:
type: integer
x-stainless-pagination-property:
purpose: offset_count_start_field
- name: openai_cursor_page
- name: openai_cursor_page
type: cursor
request:
limit:
@ -99,12 +87,72 @@ pagination:
type: string
x-stainless-pagination-property:
purpose: next_cursor_field
# `resources` define the structure and organization for your API, such as how
# methods and models are grouped together and accessed. See the [configuration
# guide] for more information.
#
# [configuration guide]:
# https://app.stainlessapi.com/docs/guides/configure#resources
settings:
license: MIT
unwrap_response_fields:
- data
file_header: 'Copyright (c) Meta Platforms, Inc. and affiliates.
All rights reserved.
This source code is licensed under the terms described in the LICENSE file in
the root directory of this source tree.
'
openapi:
transformations:
- command: mergeObject
reason: Better return_type using enum
args:
target:
- $.components.schemas
object:
ReturnType:
additionalProperties: false
properties:
type:
enum:
- string
- number
- boolean
- array
- object
- json
- union
- chat_completion_input
- completion_input
- agent_turn_input
required:
- type
type: object
- command: replaceProperties
reason: Replace return type properties with better model (see above)
args:
filter:
only:
- $.components.schemas.ScoringFn.properties.return_type
- $.components.schemas.RegisterScoringFunctionRequest.properties.return_type
value:
$ref: '#/components/schemas/ReturnType'
- command: oneOfToAnyOf
reason: Prism (mock server) doesn't like one of our requests as it technically
matches multiple variants
readme:
example_requests:
default:
type: request
endpoint: post /v1/chat/completions
params: {}
headline:
type: request
endpoint: get /v1/models
params: {}
pagination:
type: request
endpoint: post /v1/chat/completions
params: {}
resources:
$shared:
models:
@ -115,9 +163,6 @@ resources:
sampling_params: SamplingParams
scoring_result: ScoringResult
system_message: SystemMessage
query_result: RAGQueryResult
document: RAGDocument
query_config: RAGQueryConfig
toolgroups:
models:
tool_group: ToolGroup
@ -131,24 +176,17 @@ resources:
methods:
get: get /v1/tools/{tool_name}
list:
endpoint: get /v1/tools
paginated: false
endpoint: get /v1/tools
tool_runtime:
models:
tool_def: ToolDef
tool_invocation_result: ToolInvocationResult
methods:
list_tools:
endpoint: get /v1/tool-runtime/list-tools
paginated: false
endpoint: get /v1/tool-runtime/list-tools
invoke_tool: post /v1/tool-runtime/invoke
subresources:
rag_tool:
methods:
insert: post /v1/tool-runtime/rag-tool/insert
query: post /v1/tool-runtime/rag-tool/query
responses:
models:
response_object_stream: OpenAIResponseObjectStream
@ -156,10 +194,10 @@ resources:
methods:
create:
type: http
endpoint: post /v1/responses
streaming:
stream_event_model: responses.response_object_stream
param_discriminator: stream
endpoint: post /v1/responses
retrieve: get /v1/responses/{response_id}
list:
type: http
@ -172,8 +210,8 @@ resources:
methods:
list:
type: http
paginated: false
endpoint: get /v1/responses/{response_id}/input_items
prompts:
models:
prompt: Prompt
@ -181,8 +219,8 @@ resources:
methods:
create: post /v1/prompts
list:
endpoint: get /v1/prompts
paginated: false
endpoint: get /v1/prompts
retrieve: get /v1/prompts/{prompt_id}
update: post /v1/prompts/{prompt_id}
delete: delete /v1/prompts/{prompt_id}
@ -191,9 +229,8 @@ resources:
versions:
methods:
list:
endpoint: get /v1/prompts/{prompt_id}/versions
paginated: false
endpoint: get /v1/prompts/{prompt_id}/versions
conversations:
models:
conversation_object: Conversation
@ -220,7 +257,9 @@ resources:
create:
type: http
endpoint: post /v1/conversations/{conversation_id}/items
delete:
type: http
endpoint: delete /v1/conversations/{conversation_id}/items/{item_id}
inspect:
models:
healthInfo: HealthInfo
@ -230,13 +269,11 @@ resources:
methods:
health: get /v1/health
version: get /v1/version
embeddings:
models:
create_embeddings_response: OpenAIEmbeddingsResponse
methods:
create: post /v1/embeddings
chat:
models:
chat_completion_chunk: OpenAIChatCompletionChunk
@ -245,12 +282,13 @@ resources:
methods:
create:
type: http
endpoint: post /v1/chat/completions
streaming:
stream_event_model: chat.chat_completion_chunk
param_discriminator: stream
endpoint: post /v1/chat/completions
list:
type: http
paginated: false
endpoint: get /v1/chat/completions
retrieve:
type: http
@ -259,17 +297,15 @@ resources:
methods:
create:
type: http
endpoint: post /v1/completions
streaming:
param_discriminator: stream
endpoint: post /v1/completions
vector_io:
models:
queryChunksResponse: QueryChunksResponse
methods:
insert: post /v1/vector-io/insert
query: post /v1/vector-io/query
vector_stores:
models:
vector_store: VectorStoreObject
@ -278,8 +314,7 @@ resources:
vector_store_search_response: VectorStoreSearchResponsePage
methods:
create: post /v1/vector_stores
list:
endpoint: get /v1/vector_stores
list: get /v1/vector_stores
retrieve: get /v1/vector_stores/{vector_store_id}
update: post /v1/vector_stores/{vector_store_id}
delete: delete /v1/vector_stores/{vector_store_id}
@ -304,15 +339,14 @@ resources:
retrieve: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}
list_files: get /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/files
cancel: post /v1/vector_stores/{vector_store_id}/file_batches/{batch_id}/cancel
models:
models:
model: OpenAIModel
list_models_response: OpenAIListModelsResponse
methods:
list:
endpoint: get /v1/models
paginated: false
endpoint: get /v1/models
retrieve: get /v1/models/{model_id}
register: post /v1/models
unregister: delete /v1/models/{model_id}
@ -320,38 +354,33 @@ resources:
openai:
methods:
list:
endpoint: get /v1/models
paginated: false
endpoint: get /v1/models
providers:
models:
list_providers_response: ListProvidersResponse
methods:
list:
endpoint: get /v1/providers
paginated: false
endpoint: get /v1/providers
retrieve: get /v1/providers/{provider_id}
routes:
models:
list_routes_response: ListRoutesResponse
methods:
list:
endpoint: get /v1/inspect/routes
paginated: false
endpoint: get /v1/inspect/routes
moderations:
models:
create_response: ModerationObject
methods:
create: post /v1/moderations
safety:
models:
run_shield_response: RunShieldResponse
methods:
run_shield: post /v1/safety/run-shield
shields:
models:
shield: Shield
@ -359,45 +388,48 @@ resources:
methods:
retrieve: get /v1/shields/{identifier}
list:
endpoint: get /v1/shields
paginated: false
endpoint: get /v1/shields
register: post /v1/shields
delete: delete /v1/shields/{identifier}
scoring:
methods:
score: post /v1/scoring/score
score_batch: post /v1/scoring/score-batch
scoring_functions:
methods:
retrieve: get /v1/scoring-functions/{scoring_fn_id}
list:
endpoint: get /v1/scoring-functions
paginated: false
register: post /v1/scoring-functions
models:
scoring_fn: ScoringFn
scoring_fn_params: ScoringFnParams
list_scoring_functions_response: ListScoringFunctionsResponse
methods:
retrieve: get /v1/scoring-functions/{scoring_fn_id}
list:
paginated: false
endpoint: get /v1/scoring-functions
register: post /v1/scoring-functions
unregister: delete /v1/scoring-functions/{scoring_fn_id}
files:
models:
file: OpenAIFileObject
list_files_response: ListOpenAIFileResponse
delete_file_response: OpenAIFileDeleteResponse
methods:
create: post /v1/files
list: get /v1/files
retrieve: get /v1/files/{file_id}
delete: delete /v1/files/{file_id}
content: get /v1/files/{file_id}/content
models:
file: OpenAIFileObject
list_files_response: ListOpenAIFileResponse
delete_file_response: OpenAIFileDeleteResponse
batches:
methods:
create: post /v1/batches
list: get /v1/batches
retrieve: get /v1/batches/{batch_id}
cancel: post /v1/batches/{batch_id}/cancel
alpha:
subresources:
inference:
methods:
rerank: post /v1alpha/inference/rerank
post_training:
models:
algorithm_config: AlgorithmConfig
@ -413,38 +445,35 @@ resources:
cancel: post /v1alpha/post-training/job/cancel
status: get /v1alpha/post-training/job/status
list:
paginated: false
endpoint: get /v1alpha/post-training/jobs
paginated: false
benchmarks:
methods:
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}
list:
endpoint: get /v1alpha/eval/benchmarks
paginated: false
register: post /v1alpha/eval/benchmarks
models:
benchmark: Benchmark
list_benchmarks_response: ListBenchmarksResponse
methods:
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}
list:
paginated: false
endpoint: get /v1alpha/eval/benchmarks
register: post /v1alpha/eval/benchmarks
unregister: delete /v1alpha/eval/benchmarks/{benchmark_id}
eval:
models:
evaluate_response: EvaluateResponse
benchmark_config: BenchmarkConfig
job: Job
methods:
evaluate_rows: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
run_eval: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
evaluate_rows_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/evaluations
run_eval_alpha: post /v1alpha/eval/benchmarks/{benchmark_id}/jobs
subresources:
jobs:
methods:
cancel: delete /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
status: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}
retrieve: get /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result
models:
evaluate_response: EvaluateResponse
benchmark_config: BenchmarkConfig
job: Job
beta:
subresources:
datasets:
@ -454,68 +483,8 @@ resources:
register: post /v1beta/datasets
retrieve: get /v1beta/datasets/{dataset_id}
list:
endpoint: get /v1beta/datasets
paginated: false
endpoint: get /v1beta/datasets
unregister: delete /v1beta/datasets/{dataset_id}
iterrows: get /v1beta/datasetio/iterrows/{dataset_id}
appendrows: post /v1beta/datasetio/append-rows/{dataset_id}
settings:
license: MIT
unwrap_response_fields: [data]
openapi:
transformations:
- command: mergeObject
reason: Better return_type using enum
args:
target:
- "$.components.schemas"
object:
ReturnType:
additionalProperties: false
properties:
type:
enum:
- string
- number
- boolean
- array
- object
- json
- union
- chat_completion_input
- completion_input
- agent_turn_input
required:
- type
type: object
- command: replaceProperties
reason: Replace return type properties with better model (see above)
args:
filter:
only:
- "$.components.schemas.ScoringFn.properties.return_type"
- "$.components.schemas.RegisterScoringFunctionRequest.properties.return_type"
value:
$ref: "#/components/schemas/ReturnType"
- command: oneOfToAnyOf
reason: Prism (mock server) doesn't like one of our requests as it technically matches multiple variants
# `readme` is used to configure the code snippets that will be rendered in the
# README.md of various SDKs. In particular, you can change the `headline`
# snippet's endpoint and the arguments to call it with.
readme:
example_requests:
default:
type: request
endpoint: post /v1/chat/completions
params: &ref_0 {}
headline:
type: request
endpoint: post /v1/models
params: *ref_0
pagination:
type: request
endpoint: post /v1/chat/completions
params: {}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,62 @@
---
title: Deprecated APIs
description: Legacy APIs that are being phased out
sidebar_label: Deprecated
sidebar_position: 1
---
# Deprecated APIs
This section contains APIs that are being phased out in favor of newer, more standardized implementations. These APIs are maintained for backward compatibility but are not recommended for new projects.
:::warning Deprecation Notice
These APIs are deprecated and will be removed in future versions. Please migrate to the recommended alternatives listed below.
:::
## Migration Guide
When using deprecated APIs, please refer to the migration guides provided for each API to understand how to transition to the supported alternatives.
## Deprecated API List
### Legacy Inference APIs
Some older inference endpoints have been superseded by the standardized Inference API.
**Migration Path:** Use the [Inference API](../api/) instead.
### Legacy Vector Operations
Older vector database operations have been replaced by the Vector IO API.
**Migration Path:** Use the [Vector IO API](../api/) instead.
### Legacy File Operations
Older file management endpoints have been replaced by the Files API.
**Migration Path:** Use the [Files API](../api/) instead.
## Support Timeline
Deprecated APIs will be supported according to the following timeline:
- **Current Version**: Full support with deprecation warnings
- **Next Major Version**: Limited support with migration notices
- **Following Major Version**: Removal of deprecated APIs
## Getting Help
If you need assistance migrating from deprecated APIs:
1. Check the specific migration guides for each API
2. Review the [API Reference](../api/) for current alternatives
3. Consult the [Community Forums](https://github.com/llamastack/llama-stack/discussions) for migration support
4. Open an issue on GitHub for specific migration questions
## Contributing
If you find issues with deprecated APIs or have suggestions for improving the migration process, please contribute by:
1. Opening an issue describing the problem
2. Submitting a pull request with improvements
3. Updating migration documentation
For more information on contributing, see our [Contributing Guide](../contributing/).

View file

@ -0,0 +1,128 @@
---
title: Experimental APIs
description: APIs in development with limited support
sidebar_label: Experimental
sidebar_position: 1
---
# Experimental APIs
This section contains APIs that are currently in development and may have limited support or stability. These APIs are available for testing and feedback but should not be used in production environments.
:::warning Experimental Notice
These APIs are experimental and may change without notice. Use with caution and provide feedback to help improve them.
:::
## Current Experimental APIs
### Batch Inference API
Run inference on a dataset of inputs in batch mode for improved efficiency.
**Status:** In Development
**Provider Support:** Limited
**Use Case:** Large-scale inference operations
**Features:**
- Batch processing of multiple inputs
- Optimized resource utilization
- Progress tracking and monitoring
### Batch Agents API
Run agentic workflows on a dataset of inputs in batch mode.
**Status:** In Development
**Provider Support:** Limited
**Use Case:** Large-scale agent operations
**Features:**
- Batch agent execution
- Parallel processing capabilities
- Result aggregation and analysis
### Synthetic Data Generation API
Generate synthetic data for model development and testing.
**Status:** Early Development
**Provider Support:** Very Limited
**Use Case:** Training data augmentation
**Features:**
- Automated data generation
- Quality control mechanisms
- Customizable generation parameters
### Batches API (OpenAI-compatible)
OpenAI-compatible batch management for inference operations.
**Status:** In Development
**Provider Support:** Limited
**Use Case:** OpenAI batch processing compatibility
**Features:**
- OpenAI batch API compatibility
- Job scheduling and management
- Status tracking and monitoring
## Getting Started with Experimental APIs
### Prerequisites
- Llama Stack server running with experimental features enabled
- Appropriate provider configurations
- Understanding of API limitations
### Configuration
Experimental APIs may require special configuration flags or provider settings. Check the specific API documentation for setup requirements.
### Usage Guidelines
1. **Testing Only**: Use experimental APIs for testing and development only
2. **Monitor Changes**: Watch for updates and breaking changes
3. **Provide Feedback**: Report issues and suggest improvements
4. **Backup Data**: Always back up important data when using experimental features
## Feedback and Contribution
We encourage feedback on experimental APIs to help improve them:
### Reporting Issues
- Use GitHub issues with the "experimental" label
- Include detailed error messages and reproduction steps
- Specify the API version and provider being used
### Feature Requests
- Submit feature requests through GitHub discussions
- Provide use cases and expected behavior
- Consider contributing implementations
### Testing
- Test experimental APIs in your environment
- Report performance issues and optimization opportunities
- Share success stories and use cases
## Migration to Stable APIs
As experimental APIs mature, they will be moved to the stable API section. When this happens:
1. **Announcement**: We'll announce the promotion in release notes
2. **Migration Guide**: Detailed migration instructions will be provided
3. **Deprecation Timeline**: Experimental versions will be deprecated with notice
4. **Support**: Full support will be available for stable versions
## Provider Support
Experimental APIs may have limited provider support. Check the specific API documentation for:
- Supported providers
- Configuration requirements
- Known limitations
- Performance characteristics
## Roadmap
Experimental APIs are part of our ongoing development roadmap:
- **Q1 2024**: Batch Inference API stabilization
- **Q2 2024**: Batch Agents API improvements
- **Q3 2024**: Synthetic Data Generation API expansion
- **Q4 2024**: Batches API full OpenAI compatibility
For the latest updates, follow our [GitHub releases](https://github.com/llamastack/llama-stack/releases) and [roadmap discussions](https://github.com/llamastack/llama-stack/discussions).

View file

@ -0,0 +1,287 @@
---
title: OpenAI API Compatibility
description: OpenAI-compatible APIs and features in Llama Stack
sidebar_label: OpenAI Compatibility
sidebar_position: 1
---
# OpenAI API Compatibility
Llama Stack provides comprehensive OpenAI API compatibility, allowing you to use existing OpenAI API clients and tools with Llama Stack providers. This compatibility layer ensures seamless migration and interoperability.
## Overview
OpenAI API compatibility in Llama Stack includes:
- **OpenAI-compatible endpoints** for all major APIs
- **Request/response format compatibility** with OpenAI standards
- **Authentication and authorization** using OpenAI-style API keys
- **Error handling** with OpenAI-compatible error codes and messages
- **Rate limiting** and usage tracking compatible with OpenAI patterns
## Supported OpenAI APIs
### Chat Completions API
OpenAI-compatible chat completions for conversational AI applications.
**Endpoint:** `/v1/chat/completions`
**Compatibility:** Full OpenAI API compatibility
**Providers:** All inference providers
**Features:**
- Message-based conversations
- System prompts and user messages
- Function calling support
- Streaming responses
- Temperature and other parameter controls
### Completions API
OpenAI-compatible text completions for general text generation.
**Endpoint:** `/v1/completions`
**Compatibility:** Full OpenAI API compatibility
**Providers:** All inference providers
**Features:**
- Text completion generation
- Prompt engineering support
- Customizable parameters
- Batch processing capabilities
### Embeddings API
OpenAI-compatible embeddings for vector operations.
**Endpoint:** `/v1/embeddings`
**Compatibility:** Full OpenAI API compatibility
**Providers:** All embedding providers
**Features:**
- Text embedding generation
- Multiple embedding models
- Batch embedding processing
- Vector similarity operations
### Files API
OpenAI-compatible file management for document processing.
**Endpoint:** `/v1/files`
**Compatibility:** Full OpenAI API compatibility
**Providers:** Local Filesystem, S3
**Features:**
- File upload and management
- Document processing
- File metadata tracking
- Secure file access
### Vector Store Files API
OpenAI-compatible vector store file operations for RAG applications.
**Endpoint:** `/v1/vector_stores/{vector_store_id}/files`
**Compatibility:** Full OpenAI API compatibility
**Providers:** FAISS, SQLite-vec, Milvus, ChromaDB, Qdrant, Weaviate, Postgres (PGVector)
**Features:**
- Automatic document processing
- Vector store integration
- File chunking and indexing
- Search and retrieval operations
### Batches API
OpenAI-compatible batch processing for large-scale operations.
**Endpoint:** `/v1/batches`
**Compatibility:** OpenAI API compatibility (experimental)
**Providers:** Limited support
**Features:**
- Batch job creation and management
- Progress tracking
- Result retrieval
- Error handling
## Migration from OpenAI
### Step 1: Update API Endpoint
Change your API endpoint from OpenAI to your Llama Stack server:
```python
# Before (OpenAI)
import openai
client = openai.OpenAI(api_key="your-openai-key")
# After (Llama Stack)
import openai
client = openai.OpenAI(
api_key="your-llama-stack-key",
base_url="http://localhost:8000/v1" # Your Llama Stack server
)
```
### Step 2: Configure Providers
Set up your preferred providers in the Llama Stack configuration:
```yaml
# stack-config.yaml
inference:
providers:
- name: "meta-reference"
type: "inline"
model: "llama-3.1-8b"
```
### Step 3: Test Compatibility
Verify that your existing code works with Llama Stack:
```python
# Test chat completions
response = client.chat.completions.create(
model="llama-3.1-8b",
messages=[
{"role": "user", "content": "Hello, world!"}
]
)
print(response.choices[0].message.content)
```
## Provider-Specific Features
### Meta Reference Provider
- Full OpenAI API compatibility
- Local model execution
- Custom model support
### Remote Providers
- OpenAI API compatibility
- Cloud-based execution
- Scalable infrastructure
### Vector Store Providers
- OpenAI vector store API compatibility
- Automatic document processing
- Advanced search capabilities
## Authentication
Llama Stack supports OpenAI-style authentication:
### API Key Authentication
```python
client = openai.OpenAI(
api_key="your-api-key",
base_url="http://localhost:8000/v1"
)
```
### Environment Variables
```bash
export OPENAI_API_KEY="your-api-key"
export OPENAI_BASE_URL="http://localhost:8000/v1"
```
## Error Handling
Llama Stack provides OpenAI-compatible error responses:
```python
try:
    response = client.chat.completions.create(...)
except openai.RateLimitError as e:
    print(f"Rate Limit Error: {e}")
except openai.APIConnectionError as e:
    print(f"Connection Error: {e}")
except openai.APIError as e:
    # Catch-all goes last: RateLimitError and APIConnectionError are subclasses of APIError
    print(f"API Error: {e}")
```
## Rate Limiting
OpenAI-compatible rate limiting is supported:
- **Requests per minute** limits
- **Tokens per minute** limits
- **Concurrent request** limits
- **Usage tracking** and monitoring
## Monitoring and Observability
Track your API usage with OpenAI-compatible monitoring:
- **Request/response logging**
- **Usage metrics** and analytics
- **Performance monitoring**
- **Error tracking** and alerting
## Best Practices
### 1. Provider Selection
Choose providers based on your requirements:
- **Local development**: Meta Reference, Ollama
- **Production**: Cloud providers (Fireworks, Together, NVIDIA)
- **Specialized use cases**: Custom providers
### 2. Model Configuration
Configure models for optimal performance:
- **Model selection** based on task requirements
- **Parameter tuning** for specific use cases
- **Resource allocation** for performance
### 3. Error Handling
Implement robust error handling:
- **Retry logic** for transient failures
- **Fallback providers** for high availability
- **Monitoring** and alerting for issues
### 4. Security
Follow security best practices:
- **API key management** and rotation
- **Access control** and authorization
- **Data privacy** and compliance
## Implementation Examples
For detailed code examples and implementation guides, see our [OpenAI Implementation Guide](../providers/openai.mdx).
## Known Limitations
### Responses API Limitations
The Responses API is still in active development. For detailed information about current limitations and implementation status, see our [OpenAI Responses API Limitations](../providers/openai_responses_limitations.mdx).
## Troubleshooting
### Common Issues
**Connection Errors**
- Verify server is running
- Check network connectivity
- Validate API endpoint URL
**Authentication Errors**
- Verify API key is correct
- Check key permissions
- Ensure proper authentication headers
**Model Errors**
- Verify model is available
- Check provider configuration
- Validate model parameters
### Getting Help
For OpenAI compatibility issues:
1. **Check Documentation**: Review provider-specific documentation
2. **Community Support**: Ask questions in GitHub discussions
3. **Issue Reporting**: Open GitHub issues for bugs
4. **Professional Support**: Contact support for enterprise issues
## Roadmap
Upcoming OpenAI compatibility features:
- **Enhanced batch processing** support
- **Advanced function calling** capabilities
- **Improved error handling** and diagnostics
- **Performance optimizations** for large-scale deployments
For the latest updates, follow our [GitHub releases](https://github.com/llamastack/llama-stack/releases) and [roadmap discussions](https://github.com/llamastack/llama-stack/discussions).

144
docs/docs/api/index.mdx Normal file
View file

@ -0,0 +1,144 @@
---
title: API Reference
description: Complete reference for Llama Stack APIs
sidebar_label: Overview
sidebar_position: 1
---
# API Reference
Llama Stack provides a comprehensive set of APIs for building generative AI applications. All APIs follow OpenAI-compatible standards and can be used interchangeably across different providers.
## Core APIs
### Inference API
Run inference with Large Language Models (LLMs) and embedding models.
**Supported Providers:**
- Meta Reference (Single Node)
- Ollama (Single Node)
- Fireworks (Hosted)
- Together (Hosted)
- NVIDIA NIM (Hosted and Single Node)
- vLLM (Hosted and Single Node)
- TGI (Hosted and Single Node)
- AWS Bedrock (Hosted)
- Cerebras (Hosted)
- Groq (Hosted)
- SambaNova (Hosted)
- PyTorch ExecuTorch (On-device iOS, Android)
- OpenAI (Hosted)
- Anthropic (Hosted)
- Gemini (Hosted)
- WatsonX (Hosted)
### Agents API
Run multi-step agentic workflows with LLMs, including tool usage, memory (RAG), and complex reasoning.
**Supported Providers:**
- Meta Reference (Single Node)
- Fireworks (Hosted)
- Together (Hosted)
- PyTorch ExecuTorch (On-device iOS)
### Vector IO API
Perform operations on vector stores, including adding documents, searching, and deleting documents.
**Supported Providers:**
- FAISS (Single Node)
- SQLite-Vec (Single Node)
- Chroma (Hosted and Single Node)
- Milvus (Hosted and Single Node)
- Postgres (PGVector) (Hosted and Single Node)
- Weaviate (Hosted)
- Qdrant (Hosted and Single Node)
### Files API (OpenAI-compatible)
Manage file uploads, storage, and retrieval with OpenAI-compatible endpoints.
**Supported Providers:**
- Local Filesystem (Single Node)
- S3 (Hosted)
### Vector Store Files API (OpenAI-compatible)
Integrate file operations with vector stores for automatic document processing and search.
**Supported Providers:**
- FAISS (Single Node)
- SQLite-vec (Single Node)
- Milvus (Hosted and Single Node)
- ChromaDB (Hosted and Single Node)
- Qdrant (Hosted and Single Node)
- Weaviate (Hosted)
- Postgres (PGVector) (Hosted and Single Node)
### Safety API
Apply safety policies to outputs at a systems level, not just model level.
**Supported Providers:**
- Llama Guard (Depends on Inference Provider)
- Prompt Guard (Single Node)
- Code Scanner (Single Node)
- AWS Bedrock (Hosted)
### Post Training API
Fine-tune models for specific use cases and domains.
**Supported Providers:**
- Meta Reference (Single Node)
- HuggingFace (Single Node)
- TorchTune (Single Node)
- NVIDIA NEMO (Hosted)
### Eval API
Generate outputs and perform scoring to evaluate system performance.
**Supported Providers:**
- Meta Reference (Single Node)
- NVIDIA NEMO (Hosted)
### Telemetry API
Collect telemetry data from the system for monitoring and observability.
**Supported Providers:**
- Meta Reference (Single Node)
### Tool Runtime API
Interact with various tools and protocols to extend LLM capabilities.
**Supported Providers:**
- Brave Search (Hosted)
- RAG Runtime (Single Node)
## API Compatibility
All Llama Stack APIs are designed to be OpenAI-compatible, allowing you to:
- Use existing OpenAI API clients and tools
- Migrate from OpenAI to other providers seamlessly
- Maintain consistent API contracts across different environments
## Getting Started
To get started with Llama Stack APIs:
1. **Choose a Distribution**: Select a pre-configured distribution that matches your environment
2. **Configure Providers**: Set up the providers you want to use for each API
3. **Start the Server**: Launch the Llama Stack server with your configuration
4. **Use the APIs**: Make requests to the API endpoints using your preferred client
For detailed setup instructions, see our [Getting Started Guide](../getting_started/quickstart).
## Provider Details
For complete provider compatibility and setup instructions, see our [Providers Documentation](../providers/).
## API Stability
Llama Stack APIs are organized by stability level:
- **[Stable APIs](./index.mdx)** - Production-ready APIs with full support
- **[Experimental APIs](../api-experimental/)** - APIs in development with limited support
- **[Deprecated APIs](../api-deprecated/)** - Legacy APIs being phased out
## OpenAI Integration
For specific OpenAI API compatibility features, see our [OpenAI Compatibility Guide](../api-openai/).

View file

@ -0,0 +1,87 @@
---
title: Admin UI & Chat Playground
description: Web-based admin interface and chat playground for Llama Stack
sidebar_label: Playground
sidebar_position: 10
---
# Admin UI & Chat Playground
The Llama Stack UI provides a comprehensive web-based admin interface for managing your Llama Stack server, with an integrated chat playground for interactive testing. This admin interface is the primary way to monitor, manage, and debug your Llama Stack applications.
## Quick Start
Launch the admin UI with:
```bash
npx llama-stack-ui
```
Then visit `http://localhost:8322` to access the interface.
## Admin Interface Features
The Llama Stack UI is organized into the following main sections:
### 🎯 Create
**Chat Playground** - Interactive testing environment
- Real-time chat interface for testing agents and models
- Multi-turn conversations with tool calling support
- Agent SDK integration (will be migrated to Responses API)
- Custom system prompts and model parameter adjustment
### 📊 Manage
**Logs & Resource Management** - Monitor and manage your stack
- **Responses Logs**: View and analyze agent responses and interactions
- **Chat Completions Logs**: Monitor chat completion requests and responses
- **Vector Stores**: Create, manage, and monitor vector databases for RAG workflows
- **Prompts**: Full CRUD operations for prompt templates and management
- **Files**: Forthcoming file management capabilities
## Key Capabilities for Application Development
### Real-time Monitoring
- **Response Tracking**: Monitor all agent responses and tool calls
- **Completion Analysis**: View chat completion performance and patterns
- **Vector Store Activity**: Track RAG operations and document processing
- **Prompt Usage**: Analyze prompt template performance
### Resource Management
- **Vector Store CRUD**: Create, update, and delete vector databases
- **Prompt Library**: Organize and version control your prompts
- **File Operations**: Manage documents and assets (forthcoming)
### Interactive Testing
- **Chat Playground**: Test conversational flows before production deployment
- **Agent Prototyping**: Validate agent behaviors and tool integrations
## Development Workflow Integration
The admin UI supports your development lifecycle:
1. **Development**: Use chat playground to prototype and test features
2. **Monitoring**: Track system performance through logs and metrics
3. **Management**: Organize prompts, vector stores, and other resources
4. **Debugging**: Analyze logs to identify and resolve issues
## Architecture Notes
- **Current**: Chat playground uses Agents SDK
- **Future**: Migration to Responses API for improved performance and consistency
- **Admin Focus**: Primary emphasis on monitoring, logging, and resource management
## Getting Started
1. **Launch the UI**: Run `npx llama-stack-ui`
2. **Explore Logs**: Start with Responses and Chat Completions logs to understand your system activity
3. **Test in Playground**: Use the chat interface to validate your agent configurations
4. **Manage Resources**: Create vector stores and organize prompts through the UI
For detailed setup and configuration, see the [Llama Stack UI documentation](/docs/distributions/llama_stack_ui).
## Next Steps
- Set up your [first agent](/docs/building_applications/agent)
- Implement [RAG functionality](/docs/building_applications/rag)
- Add [evaluation metrics](/docs/building_applications/evals)
- Configure [safety measures](/docs/building_applications/safety)

View file

@ -104,23 +104,19 @@ client.toolgroups.register(
)
```
Note that most of the more useful MCP servers need you to authenticate with them. Many of them use OAuth2.0 for authentication. You can provide authorization headers to send to the MCP server using the "Provider Data" abstraction provided by Llama Stack. When making an agent call,
Note that most of the more useful MCP servers need you to authenticate with them. Many of them use OAuth2.0 for authentication. You can provide the authorization token when creating the Agent:
```python
agent = Agent(
...,
tools=["mcp::deepwiki"],
extra_headers={
"X-LlamaStack-Provider-Data": json.dumps(
tools=[
{
"mcp_headers": {
"http://mcp.deepwiki.com/sse": {
"Authorization": "Bearer <your_access_token>",
},
},
"type": "mcp",
"server_url": "https://mcp.deepwiki.com/sse",
"server_label": "mcp::deepwiki",
"authorization": "<your_access_token>", # OAuth token (without "Bearer " prefix)
}
),
},
],
)
agent.create_turn(...)
```

View file

@ -58,7 +58,7 @@ External APIs must expose a `available_providers()` function in their module tha
```python
# llama_stack_api_weather/api.py
from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
from llama_stack_api import Api, InlineProviderSpec, ProviderSpec
def available_providers() -> list[ProviderSpec]:
@ -79,7 +79,7 @@ A Protocol class like so:
# llama_stack_api_weather/api.py
from typing import Protocol
from llama_stack.schema_utils import webmethod
from llama_stack_api import webmethod
class WeatherAPI(Protocol):
@ -151,13 +151,12 @@ __all__ = ["WeatherAPI", "available_providers"]
# llama-stack-api-weather/src/llama_stack_api_weather/weather.py
from typing import Protocol
from llama_stack.providers.datatypes import (
from llama_stack_api import (
Api,
ProviderSpec,
RemoteProviderSpec,
webmethod,
)
from llama_stack.schema_utils import webmethod
def available_providers() -> list[ProviderSpec]:
return [

View file

@ -7,7 +7,7 @@ sidebar_position: 1
# APIs
A Llama Stack API is described as a collection of REST endpoints. We currently support the following APIs:
A Llama Stack API is described as a collection of REST endpoints following OpenAI API standards. We currently support the following APIs:
- **Inference**: run inference with a LLM
- **Safety**: apply safety policies to the output at a Systems (not only model) level
@ -16,11 +16,26 @@ A Llama Stack API is described as a collection of REST endpoints. We currently s
- **Scoring**: evaluate outputs of the system
- **Eval**: generate outputs (via Inference or Agents) and perform scoring
- **VectorIO**: perform operations on vector stores, such as adding documents, searching, and deleting documents
- **Files**: manage file uploads, storage, and retrieval
- **Telemetry**: collect telemetry data from the system
- **Post Training**: fine-tune a model
- **Tool Runtime**: interact with various tools and protocols
- **Responses**: generate responses from an LLM using this OpenAI compatible API.
- **Responses**: generate responses from an LLM
We are working on adding a few more APIs to complete the application lifecycle. These will include:
- **Batch Inference**: run inference on a dataset of inputs
- **Batch Agents**: run agents on a dataset of inputs
- **Batches**: OpenAI-compatible batch management for inference
## OpenAI API Compatibility
We are working on adding OpenAI API compatibility to Llama Stack. This will allow you to use Llama Stack with OpenAI API clients and tools.
### File Operations and Vector Store Integration
The Files API and Vector Store APIs work together through file operations, enabling automatic document processing and search. This integration implements the [OpenAI Vector Store Files API specification](https://platform.openai.com/docs/api-reference/vector-stores-files) and allows you to:
- Upload documents through the Files API
- Automatically process and chunk documents into searchable vectors
- Store processed content in vector databases based on the availability of [our providers](../../providers/index.mdx)
- Search through documents using natural language queries
For detailed information about this integration, see [File Operations and Vector Store Integration](../file_operations_vector_stores.md).

View file

@ -0,0 +1,420 @@
# File Operations and Vector Store Integration
## Overview
Llama Stack provides seamless integration between the Files API and Vector Store APIs, enabling you to upload documents and automatically process them into searchable vector embeddings. This integration implements file operations following the [OpenAI Vector Store Files API specification](https://platform.openai.com/docs/api-reference/vector-stores-files).
## Enhanced Capabilities Beyond OpenAI
While Llama Stack maintains full compatibility with OpenAI's Vector Store API, it provides several additional capabilities that enhance functionality and flexibility:
### **Embedding Model Specification**
Unlike OpenAI's vector stores which use a fixed embedding model, Llama Stack allows you to specify which embedding model to use when creating a vector store:
```python
# Create vector store with specific embedding model
vector_store = client.vector_stores.create(
name="my_documents",
embedding_model="all-MiniLM-L6-v2", # Specify your preferred model
embedding_dimension=384,
)
```
### **Advanced Search Modes**
Llama Stack supports multiple search modes beyond basic vector similarity:
- **Vector Search**: Pure semantic similarity search using embeddings
- **Keyword Search**: Traditional keyword-based search for exact matches
- **Hybrid Search**: Combines both vector and keyword search for optimal results
```python
# Different search modes
results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="machine learning algorithms",
search_mode="hybrid", # or "vector", "keyword"
max_num_results=5,
)
```
### **Flexible Ranking Options**
For hybrid search, Llama Stack offers configurable ranking strategies:
- **RRF (Reciprocal Rank Fusion)**: Combines rankings with configurable impact factor
- **Weighted Ranker**: Linear combination of vector and keyword scores with adjustable weights
```python
# Custom ranking configuration
results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="neural networks",
search_mode="hybrid",
ranking_options={
"ranker": {"type": "weighted", "alpha": 0.7} # 70% vector, 30% keyword
},
)
```
### **Provider Selection**
Choose from multiple vector store providers based on your specific needs:
- **Inline Providers**: FAISS (fast in-memory), SQLite-vec (disk-based), Milvus (high-performance)
- **Remote Providers**: ChromaDB, Qdrant, Weaviate, Postgres (PGVector), Milvus
```python
# Specify provider when creating vector store
vector_store = client.vector_stores.create(
name="my_documents", provider_id="sqlite-vec" # Choose your preferred provider
)
```
## How It Works
The file operations work through several key components:
1. **File Upload**: Documents are uploaded through the Files API
2. **Automatic Processing**: Files are automatically chunked and converted to embeddings
3. **Vector Storage**: Chunks are stored in vector databases with metadata
4. **Search & Retrieval**: Users can search through processed documents using natural language
## Supported Vector Store Providers
The following vector store providers support file operations:
### Inline Providers (Single Node)
- **FAISS**: Fast in-memory vector similarity search
- **SQLite-vec**: Disk-based storage with hybrid search capabilities
### Remote Providers (Hosted)
- **ChromaDB**: Vector database with metadata filtering
- **Weaviate**: Vector database with GraphQL interface
- **Postgres (PGVector)**: Vector extensions for PostgreSQL
### Both Inline & Remote Providers
- **Milvus**: High-performance vector database with advanced indexing
- **Qdrant**: Vector similarity search with payload filtering
## File Processing Pipeline
### 1. File Upload
```python
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url="http://localhost:8000")
# Upload a document
with open("document.pdf", "rb") as f:
    file_info = await client.files.create(file=f, purpose="assistants")
```
### 2. Attach to Vector Store
```python
# Create a vector store
vector_store = client.vector_stores.create(name="my_documents")
# Attach the file to the vector store
file_attach_response = await client.vector_stores.files.create(
vector_store_id=vector_store.id, file_id=file_info.id
)
```
### 3. Automatic Processing
The system automatically:
- Detects the file type and extracts text content
- Splits content into chunks (default: 800 tokens with 400 token overlap)
- Generates embeddings for each chunk
- Stores chunks with metadata in the vector store
- Updates file status to "completed"
### 4. Search and Retrieval
```python
# Search through processed documents
search_results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="What is the main topic discussed?",
max_num_results=5,
)
# Process results
for result in search_results.data:
print(f"Score: {result.score}")
for content in result.content:
print(f"Content: {content.text}")
```
## Supported File Types
The file processing pipeline supports various document formats:
- **Text Files**: `.txt`, `.md`, `.rst`
- **Documents**: `.pdf`, `.docx`, `.doc`
- **Code**: `.py`, `.js`, `.java`, `.cpp`, etc.
- **Data**: `.json`, `.csv`, `.xml`
- **Web Content**: HTML files
## Chunking Strategies
### Default Strategy
The default chunking strategy uses:
- **Max Chunk Size**: 800 tokens
- **Overlap**: 400 tokens
- **Method**: Semantic boundary detection
### Custom Chunking
You can customize chunking when attaching files:
```python
from llama_stack.apis.vector_io import VectorStoreChunkingStrategy
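# Define a custom static chunking strategy (smaller chunks, less overlap than the default)
chunking_strategy = {
    "type": "static",
    "static": {"max_chunk_size_tokens": 400, "chunk_overlap_tokens": 100},
}
# Attach file with custom chunking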
file_attach_response = await client.vector_stores.files.create(
vector_store_id=vector_store.id,
file_id=file_info.id,
chunking_strategy=chunking_strategy,
)
```
**Note**: While Llama Stack is OpenAI-compatible, it also supports additional options beyond the standard OpenAI API. When creating vector stores, you can specify custom embedding models and embedding dimensions that will be used when processing chunks from attached files.
## File Management
### List Files in Vector Store
```python
# List all files in a vector store
files = await client.vector_stores.files.list(vector_store_id=vector_store.id)
for file in files:
print(f"File: {file.filename}, Status: {file.status}")
```
### File Status Tracking
Files go through several statuses:
- **in_progress**: File is being processed
- **completed**: File successfully processed and searchable
- **failed**: Processing failed (check `last_error` for details)
- **cancelled**: Processing was cancelled
### Retrieve File Content
```python
# Get chunked content from vector store
content_response = await client.vector_stores.files.retrieve_content(
vector_store_id=vector_store.id, file_id=file_info.id
)
for chunk in content_response.content:
print(f"Chunk {chunk.metadata.get('chunk_index', 0)}: {chunk.text}")
```
## Vector Store Management
### List Vector Stores
Retrieve a paginated list of all vector stores:
```python
# List all vector stores with default pagination
vector_stores = await client.vector_stores.list()
# Custom pagination and ordering
vector_stores = await client.vector_stores.list(
limit=10,
order="asc", # or "desc"
after="vs_12345678", # cursor-based pagination
)
for store in vector_stores.data:
print(f"Store: {store.name}, Files: {store.file_counts.total}")
print(f"Created: {store.created_at}, Status: {store.status}")
```
### Retrieve Vector Store Details
Get detailed information about a specific vector store:
```python
# Get vector store details
store_details = await client.vector_stores.retrieve(vector_store_id="vs_12345678")
print(f"Name: {store_details.name}")
print(f"Status: {store_details.status}")
print(f"File Counts: {store_details.file_counts}")
print(f"Usage: {store_details.usage_bytes} bytes")
print(f"Created: {store_details.created_at}")
print(f"Metadata: {store_details.metadata}")
```
### Update Vector Store
Modify vector store properties such as name, metadata, or expiration settings:
```python
# Update vector store name and metadata
updated_store = await client.vector_stores.update(
vector_store_id="vs_12345678",
name="Updated Document Collection",
metadata={
"description": "Updated collection for research",
"category": "research",
"version": "2.0",
},
)
# Set expiration policy
expired_store = await client.vector_stores.update(
vector_store_id="vs_12345678",
expires_after={"anchor": "last_active_at", "days": 30},
)
print(f"Updated store: {updated_store.name}")
print(f"Last active: {updated_store.last_active_at}")
```
### Delete Vector Store
Remove a vector store and all its associated data:
```python
# Delete a vector store
delete_response = await client.vector_stores.delete(vector_store_id="vs_12345678")
if delete_response.deleted:
print(f"Vector store {delete_response.id} successfully deleted")
else:
print("Failed to delete vector store")
```
**Important Notes:**
- Deleting a vector store removes all files, chunks, and embeddings
- This operation cannot be undone
- The underlying vector database is also cleaned up
- Consider backing up important data before deletion
## Search Capabilities
### Vector Search
Pure similarity search using embeddings:
```python
results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="machine learning algorithms",
max_num_results=10,
)
```
### Filtered Search
Combine vector search with metadata filtering:
```python
results = await client.vector_stores.search(
vector_store_id=vector_store.id,
query="machine learning algorithms",
filters={"file_type": "pdf", "upload_date": "2024-01-01"},
max_num_results=10,
)
```
### Hybrid Search
[SQLite-vec](../providers/vector_io/inline_sqlite-vec.mdx), [pgvector](../providers/vector_io/remote_pgvector.mdx), and [Milvus](../providers/vector_io/inline_milvus.mdx) support combining vector and keyword search.
## Performance Considerations
> **Note**: For detailed performance optimization strategies, see [Performance Considerations](../providers/files/openai_file_operations_support.md#performance-considerations) in the provider documentation.
**Key Points:**
- **Chunk Size**: 400-600 tokens for precision, 800-1200 for context
- **Storage**: Choose provider based on your performance needs
- **Search**: Optimize for your specific use case
## Error Handling
> **Note**: For comprehensive troubleshooting and error handling, see [Troubleshooting](../providers/files/openai_file_operations_support.md#troubleshooting) in the provider documentation.
**Common Issues:**
- File processing failures (format, size limits)
- Search performance optimization
- Storage and memory issues
## Best Practices
> **Note**: For detailed best practices and recommendations, see [Best Practices](../providers/files/openai_file_operations_support.md#best-practices) in the provider documentation.
**Key Recommendations:**
- File organization and naming conventions
- Chunking strategy optimization
- Metadata and monitoring practices
- Regular cleanup and maintenance
## Integration Examples
### RAG Application
```python
# Build a RAG system with file uploads
async def build_rag_system():
# Create vector store
vector_store = client.vector_stores.create(name="knowledge_base")
# Upload and process documents
documents = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
for doc in documents:
with open(doc, "rb") as f:
file_info = await client.files.create(file=f, purpose="assistants")
await client.vector_stores.files.create(
vector_store_id=vector_store.id, file_id=file_info.id
)
return vector_store
# Query the RAG system
async def query_rag(vector_store_id, question):
results = await client.vector_stores.search(
vector_store_id=vector_store_id, query=question, max_num_results=5
)
return results
```
### Document Analysis
```python
# Analyze document content through vector search
async def analyze_document(vector_store_id, file_id):
# Get document content
content = await client.vector_stores.files.retrieve_content(
vector_store_id=vector_store_id, file_id=file_id
)
# Search for specific topics
topics = ["introduction", "methodology", "conclusion"]
analysis = {}
for topic in topics:
results = await client.vector_stores.search(
vector_store_id=vector_store_id, query=topic, max_num_results=3
)
analysis[topic] = results.data
return analysis
```
## Next Steps
- Explore the [Files API documentation](../providers/files/files.mdx) for detailed API reference
- Check [Vector Store Providers](../providers/vector_io/index.mdx) for specific implementation details
- Review [Getting Started](../getting_started/quickstart.mdx) for quick setup instructions

View file

@ -65,7 +65,7 @@ external_providers_dir: /workspace/providers.d
Inside `providers.d/custom_ollama/provider.py`, define `get_provider_spec()` so the CLI can discover dependencies:
```python
from llama_stack.providers.datatypes import ProviderSpec
from llama_stack_api.providers.datatypes import ProviderSpec
def get_provider_spec() -> ProviderSpec:

View file

@ -221,7 +221,15 @@ models:
```
A Model is an instance of a "Resource" (see [Concepts](../concepts/)) and is associated with a specific inference provider (in this case, the provider with identifier `ollama`). This is an instance of a "pre-registered" model. While we always encourage the clients to register models before using them, some Stack servers may come up with a list of "already known and available" models.
What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. Contrast it with `model_id` which is the identifier for the same model for Llama Stack's purposes. For example, you may want to name "llama3.2:vision-11b" as "image_captioning_model" when you use it in your Stack interactions. When omitted, the server will set `provider_model_id` to be the same as `model_id`.
What's with the `provider_model_id` field? This is an identifier for the model inside the provider's model catalog. The `model_id` field is provided for configuration purposes but is not used as part of the model identifier.
**Important:** Models are identified as `provider_id/provider_model_id` in the system and when making API calls. When `provider_model_id` is omitted, the server will set it to be the same as `model_id`.
Examples:
- Config: `model_id: llama3.2`, `provider_id: ollama`, `provider_model_id: null`
→ Access as: `ollama/llama3.2`
- Config: `model_id: my-llama`, `provider_id: vllm-inference`, `provider_model_id: llama-3-2-3b`
→ Access as: `vllm-inference/llama-3-2-3b` (the `model_id` is not used in the identifier)
If you need to conditionally register a model in the configuration, such as only when specific environment variable(s) are set, this can be accomplished by utilizing a special `__disabled__` string as the default value of an environment variable substitution, as shown below:

View file

@ -19,3 +19,4 @@ This section provides an overview of the distributions available in Llama Stack.
- **[Starting Llama Stack Server](./starting_llama_stack_server.mdx)** - How to run distributions
- **[Importing as Library](./importing_as_library.mdx)** - Use distributions in your code
- **[Configuration Reference](./configuration.mdx)** - Configuration file format details
- **[Llama Stack UI](./llama_stack_ui.mdx)** - Web-based user interface for interacting with Llama Stack servers

View file

@ -0,0 +1,109 @@
---
title: Llama Stack UI
description: Web-based user interface for interacting with Llama Stack servers
sidebar_label: Llama Stack UI
sidebar_position: 8
---
# Llama Stack UI
The Llama Stack UI is a web-based interface for interacting with Llama Stack servers. Built with Next.js and React, it provides a visual way to work with agents, manage resources, and view logs.
## Features
- **Logs & Monitoring**: View chat completions, agent responses, and vector store activity
- **Vector Stores**: Create and manage vector databases for RAG (Retrieval-Augmented Generation) workflows
- **Prompt Management**: Create and manage reusable prompts
## Prerequisites
You need a running Llama Stack server. The UI is a client that connects to the Llama Stack backend.
If you don't have a Llama Stack server running yet, see the [Starting Llama Stack Server](./starting_llama_stack_server.mdx) guide.
## Running the UI
### Option 1: Using npx (Recommended for Quick Start)
The fastest way to get started is using `npx`:
```bash
npx llama-stack-ui
```
This will start the UI server on `http://localhost:8322` (default port).
### Option 2: Using Docker
Run the UI in a container:
```bash
docker run -p 8322:8322 llamastack/ui
```
Access the UI at `http://localhost:8322`.
## Environment Variables
The UI can be configured using the following environment variables:
| Variable | Description | Default |
|----------|-------------|---------|
| `LLAMA_STACK_BACKEND_URL` | URL of your Llama Stack server | `http://localhost:8321` |
| `LLAMA_STACK_UI_PORT` | Port for the UI server | `8322` |
If the Llama Stack server is running with authentication enabled, you can configure the UI to use it by setting the following environment variables:
| Variable | Description | Default |
|----------|-------------|---------|
| `NEXTAUTH_URL` | NextAuth URL for authentication | `http://localhost:8322` |
| `GITHUB_CLIENT_ID` | GitHub OAuth client ID (optional, for authentication) | - |
| `GITHUB_CLIENT_SECRET` | GitHub OAuth client secret (optional, for authentication) | - |
### Setting Environment Variables
#### For npx:
```bash
LLAMA_STACK_BACKEND_URL=http://localhost:8321 \
LLAMA_STACK_UI_PORT=8080 \
npx llama-stack-ui
```
#### For Docker:
```bash
docker run -p 8080:8080 \
-e LLAMA_STACK_BACKEND_URL=http://localhost:8321 \
-e LLAMA_STACK_UI_PORT=8080 \
llamastack/ui
```
## Using the UI
### Managing Resources
- **Vector Stores**: Create vector databases for RAG workflows, view stored documents and embeddings
- **Prompts**: Create and manage reusable prompt templates
- **Chat Completions**: View history of chat interactions
- **Responses**: Browse detailed agent responses and tool calls
## Development
If you want to run the UI from source for development:
```bash
# From the project root
cd src/llama_stack_ui
# Install dependencies
npm install
# Set environment variables
export LLAMA_STACK_BACKEND_URL=http://localhost:8321
# Start the development server
npm run dev
```
The development server will start on `http://localhost:8322` with hot reloading enabled.

View file

@ -0,0 +1,143 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# OCI Distribution
The `llamastack/distribution-oci` distribution consists of the following provider configurations.
| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` |
| files | `inline::localfs` |
| inference | `remote::oci` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
### Environment Variables
The following environment variables can be configured:
- `OCI_AUTH_TYPE`: OCI authentication type (instance_principal or config_file) (default: `instance_principal`)
- `OCI_REGION`: OCI region (e.g., us-ashburn-1, us-chicago-1, us-phoenix-1, eu-frankfurt-1) (default: ``)
- `OCI_COMPARTMENT_OCID`: OCI compartment ID for the Generative AI service (default: ``)
- `OCI_CONFIG_FILE_PATH`: OCI config file path (required if OCI_AUTH_TYPE is config_file) (default: `~/.oci/config`)
- `OCI_CLI_PROFILE`: OCI CLI profile name to use from config file (default: `DEFAULT`)
## Prerequisites
### Oracle Cloud Infrastructure Setup
Before using the OCI Generative AI distribution, ensure you have:
1. **Oracle Cloud Infrastructure Account**: Sign up at [Oracle Cloud Infrastructure](https://cloud.oracle.com/)
2. **Generative AI Service Access**: Enable the Generative AI service in your OCI tenancy
3. **Compartment**: Create or identify a compartment where you'll deploy Generative AI models
4. **Authentication**: Configure authentication using either:
- **Instance Principal** (recommended for cloud-hosted deployments)
- **API Key** (for on-premises or development environments)
### Authentication Methods
#### Instance Principal Authentication (Recommended)
Instance Principal authentication allows OCI resources to authenticate using the identity of the compute instance they're running on. This is the most secure method for production deployments.
Requirements:
- Instance must be running in an Oracle Cloud Infrastructure compartment
- Instance must have appropriate IAM policies to access Generative AI services
#### API Key Authentication
For development or on-premises deployments, follow [this doc](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/apisigningkey.htm) to learn how to create your API signing key for your config file.
### Required IAM Policies
Ensure your OCI user or instance has the following policy statements:
```
Allow group <group_name> to use generative-ai-inference-endpoints in compartment <compartment_name>
Allow group <group_name> to manage generative-ai-inference-endpoints in compartment <compartment_name>
```
## Supported Services
### Inference: OCI Generative AI
Oracle Cloud Infrastructure Generative AI provides access to high-performance AI models through OCI's Platform-as-a-Service offering. The service supports:
- **Chat Completions**: Conversational AI with context awareness
- **Text Generation**: Complete prompts and generate text content
#### Available Models
OCI Generative AI provides access to models from Meta, Cohere, OpenAI, Grok, and other providers.
### Safety: Llama Guard
For content safety and moderation, this distribution uses Meta's LlamaGuard model through the OCI Generative AI service to provide:
- Content filtering and moderation
- Policy compliance checking
- Harmful content detection
### Vector Storage: Multiple Options
The distribution supports several vector storage providers:
- **FAISS**: Local in-memory vector search
- **ChromaDB**: Distributed vector database
- **PGVector**: PostgreSQL with vector extensions
### Additional Services
- **Dataset I/O**: Local filesystem and Hugging Face integration
- **Tool Runtime**: Web search (Brave, Tavily) and RAG capabilities
- **Evaluation**: Meta reference evaluation framework
## Running Llama Stack with OCI
You can run the OCI distribution via Docker or local virtual environment.
### Via venv
If you've set up your local development environment, you can run the distribution directly from your local virtual environment.
```bash
OCI_AUTH_TYPE=$OCI_AUTH_TYPE OCI_REGION=$OCI_REGION OCI_COMPARTMENT_OCID=$OCI_COMPARTMENT_OCID llama stack run --port 8321 oci
```
### Configuration Examples
#### Using Instance Principal (Recommended for Production)
```bash
export OCI_AUTH_TYPE=instance_principal
export OCI_REGION=us-chicago-1
export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..<your-compartment-id>
```
#### Using API Key Authentication (Development)
```bash
export OCI_AUTH_TYPE=config_file
export OCI_CONFIG_FILE_PATH=~/.oci/config
export OCI_CLI_PROFILE=DEFAULT
export OCI_REGION=us-chicago-1
export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..<your-compartment-id>
```
## Regional Endpoints
OCI Generative AI is available in multiple regions. The service automatically routes to the appropriate regional endpoint based on your configuration. For a full list of regional model availability, visit:
https://docs.oracle.com/en-us/iaas/Content/generative-ai/overview.htm#regions
## Troubleshooting
### Common Issues
1. **Authentication Errors**: Verify your OCI credentials and IAM policies
2. **Model Not Found**: Ensure the model OCID is correct and the model is available in your region
3. **Permission Denied**: Check compartment permissions and Generative AI service access
4. **Region Unavailable**: Verify the specified region supports Generative AI services
### Getting Help
For additional support:
- [OCI Generative AI Documentation](https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm)
- [Llama Stack Issues](https://github.com/meta-llama/llama-stack/issues)

View file

@ -144,7 +144,7 @@ source .venv/bin/activate
```bash
uv venv client --python 3.12
source client/bin/activate
pip install llama-stack-client
uv pip install llama-stack-client
```
</TabItem>
</Tabs>

View file

@ -1,7 +1,8 @@
---
description: "Agents
description: |
Agents
APIs for creating and interacting with agentic systems."
APIs for creating and interacting with agentic systems.
sidebar_label: Agents
title: Agents
---

View file

@ -14,7 +14,7 @@ Meta's reference implementation of an agent system that can use tools, access ve
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `persistence` | `<class 'inline.agents.meta_reference.config.AgentPersistenceConfig'>` | No | | |
| `persistence` | `AgentPersistenceConfig` | No | | |
## Sample Configuration

View file

@ -1,5 +1,6 @@
---
description: "The Batches API enables efficient processing of multiple requests in a single operation,
description: |
The Batches API enables efficient processing of multiple requests in a single operation,
particularly useful for processing large datasets, batch evaluation workflows, and
cost-effective inference at scale.
@ -8,7 +9,7 @@ description: "The Batches API enables efficient processing of multiple requests
This API provides the following extensions:
- idempotent batch creation
Note: This API is currently under active development and may undergo changes."
Note: This API is currently under active development and may undergo changes.
sidebar_label: Batches
title: Batches
---

View file

@ -14,9 +14,9 @@ Reference implementation of batches API with KVStore persistence.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Configuration for the key-value store backend. |
| `max_concurrent_batches` | `<class 'int'>` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
| `max_concurrent_requests_per_batch` | `<class 'int'>` | No | 10 | Maximum number of concurrent requests to process per batch. |
| `kvstore` | `KVStoreReference` | No | | Configuration for the key-value store backend. |
| `max_concurrent_batches` | `int` | No | 1 | Maximum number of concurrent batches to process simultaneously. |
| `max_concurrent_requests_per_batch` | `int` | No | 10 | Maximum number of concurrent requests to process per batch. |
## Sample Configuration

View file

@ -14,7 +14,7 @@ Local filesystem-based dataset I/O provider for reading and writing datasets to
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
| `kvstore` | `KVStoreReference` | No | | |
## Sample Configuration

View file

@ -14,7 +14,7 @@ HuggingFace datasets provider for accessing and managing datasets from the Huggi
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
| `kvstore` | `KVStoreReference` | No | | |
## Sample Configuration

View file

@ -17,7 +17,7 @@ NVIDIA's dataset I/O provider for accessing datasets from NVIDIA's data platform
| `api_key` | `str \| None` | No | | The NVIDIA API key. |
| `dataset_namespace` | `str \| None` | No | default | The NVIDIA dataset namespace. |
| `project_id` | `str \| None` | No | test-project | The NVIDIA project ID. |
| `datasets_url` | `<class 'str'>` | No | http://nemo.test | Base URL for the NeMo Dataset API |
| `datasets_url` | `str` | No | http://nemo.test | Base URL for the NeMo Dataset API |
## Sample Configuration

View file

@ -1,7 +1,8 @@
---
description: "Evaluations
description: |
Evaluations
Llama Stack Evaluation API for running evaluations on model and agent candidates."
Llama Stack Evaluation API for running evaluations on model and agent candidates.
sidebar_label: Eval
title: Eval
---

View file

@ -14,7 +14,7 @@ Meta's reference implementation of evaluation tasks with support for multiple la
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
| `kvstore` | `KVStoreReference` | No | | |
## Sample Configuration

View file

@ -14,7 +14,7 @@ NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `evaluator_url` | `<class 'str'>` | No | http://0.0.0.0:7331 | The url for accessing the evaluator service |
| `evaluator_url` | `str` | No | http://0.0.0.0:7331 | The url for accessing the evaluator service |
## Sample Configuration

View file

@ -80,7 +80,7 @@ container_image: custom-vector-store:latest # optional
All providers must contain a `get_provider_spec` function in their `provider` module. This is a standardized structure that Llama Stack expects and is necessary for getting things such as the config class. The `get_provider_spec` method returns a structure identical to the `adapter`. An example function may look like:
```python
from llama_stack.providers.datatypes import (
from llama_stack_api.providers.datatypes import (
ProviderSpec,
Api,
RemoteProviderSpec,

View file

@ -0,0 +1,290 @@
---
sidebar_label: Files
title: Files
---
## Overview
The Files API provides file management capabilities for Llama Stack. It allows you to upload, store, retrieve, and manage files that can be used across various endpoints in your application.
## Features
- **File Upload**: Upload files with metadata and purpose classification
- **File Management**: List, retrieve, and delete files
- **Content Retrieval**: Access raw file content for processing
- **API Compatibility**: Full compatibility with OpenAI Files API endpoints
- **Flexible Storage**: Support for local filesystem and cloud storage backends
## API Endpoints
### Upload File
**POST** `/v1/openai/v1/files`
Upload a file that can be used across various endpoints.
**Request Body:**
- `file`: The file object to be uploaded (multipart form data)
- `purpose`: The intended purpose of the uploaded file
**Supported Purposes:**
- `batch`: Files for batch operations
**Response:**
```json
{
"id": "file-abc123",
"object": "file",
"bytes": 140,
"created_at": 1613779121,
"filename": "mydata.jsonl",
"purpose": "batch"
}
```
**Example:**
```python
import requests
with open("data.jsonl", "rb") as f:
files = {"file": f}
data = {"purpose": "batch"}
response = requests.post(
"http://localhost:8000/v1/openai/v1/files", files=files, data=data
)
file_info = response.json()
```
### List Files
**GET** `/v1/openai/v1/files`
Returns a list of files that belong to the user's organization.
**Query Parameters:**
- `after` (optional): A cursor for pagination
- `limit` (optional): Limit on number of objects (1-10,000, default: 10,000)
- `order` (optional): Sort order by created_at timestamp (`asc` or `desc`, default: `desc`)
- `purpose` (optional): Filter files by purpose
**Response:**
```json
{
"object": "list",
"data": [
{
"id": "file-abc123",
"object": "file",
"bytes": 140,
"created_at": 1613779121,
"filename": "mydata.jsonl",
"purpose": "fine-tune"
}
],
"has_more": false
}
```
**Example:**
```python
import requests
# List all files
response = requests.get("http://localhost:8000/v1/openai/v1/files")
files = response.json()
# List files with pagination
response = requests.get(
"http://localhost:8000/v1/openai/v1/files",
params={"limit": 10, "after": "file-abc123"},
)
files = response.json()
# Filter by purpose
response = requests.get(
"http://localhost:8000/v1/openai/v1/files", params={"purpose": "fine-tune"}
)
files = response.json()
```
### Retrieve File
**GET** `/v1/openai/v1/files/{file_id}`
Returns information about a specific file.
**Path Parameters:**
- `file_id`: The ID of the file to retrieve
**Response:**
```json
{
"id": "file-abc123",
"object": "file",
"bytes": 140,
"created_at": 1613779121,
"filename": "mydata.jsonl",
"purpose": "fine-tune"
}
```
**Example:**
```python
import requests
file_id = "file-abc123"
response = requests.get(f"http://localhost:8000/v1/openai/v1/files/{file_id}")
file_info = response.json()
```
### Delete File
**DELETE** `/v1/openai/v1/files/{file_id}`
Delete a file.
**Path Parameters:**
- `file_id`: The ID of the file to delete
**Response:**
```json
{
"id": "file-abc123",
"object": "file",
"deleted": true
}
```
**Example:**
```python
import requests
file_id = "file-abc123"
response = requests.delete(f"http://localhost:8000/v1/openai/v1/files/{file_id}")
result = response.json()
```
### Retrieve File Content
**GET** `/v1/openai/v1/files/{file_id}/content`
Returns the raw file content as a binary response.
**Path Parameters:**
- `file_id`: The ID of the file to retrieve content from
**Response:**
Binary file content with appropriate headers:
- `Content-Type`: `application/octet-stream`
- `Content-Disposition`: `attachment; filename="filename"`
**Example:**
```python
import requests
file_id = "file-abc123"
response = requests.get(f"http://localhost:8000/v1/openai/v1/files/{file_id}/content")
# Save content to file
with open("downloaded_file.jsonl", "wb") as f:
f.write(response.content)
# Or process content directly
content = response.content
```
## Vector Store Integration
The Files API integrates with Vector Stores to enable document processing and search. For detailed information about this integration, see [File Operations and Vector Store Integration](../concepts/file_operations_vector_stores.md).
### Vector Store File Operations
**List Vector Store Files:**
- **GET** `/v1/openai/v1/vector_stores/{vector_store_id}/files`
**Retrieve Vector Store File Content:**
- **GET** `/v1/openai/v1/vector_stores/{vector_store_id}/files/{file_id}/content`
**Attach File to Vector Store:**
- **POST** `/v1/openai/v1/vector_stores/{vector_store_id}/files`
## Error Handling
The Files API returns standard HTTP status codes and error responses:
- `400 Bad Request`: Invalid request parameters
- `404 Not Found`: File not found
- `429 Too Many Requests`: Rate limit exceeded
- `500 Internal Server Error`: Server error
**Error Response Format:**
```json
{
"error": {
"message": "Error description",
"type": "invalid_request_error",
"code": "file_not_found"
}
}
```
## Rate Limits
The Files API implements rate limiting to ensure fair usage:
- File uploads: 100 files per minute
- File retrievals: 1000 requests per minute
- File deletions: 100 requests per minute
## Best Practices
1. **File Organization**: Use descriptive filenames and appropriate purpose classifications
2. **Batch Operations**: For multiple files, consider using batch endpoints when available
3. **Error Handling**: Always check response status codes and handle errors gracefully
4. **Content Types**: Ensure files are uploaded with appropriate content types
5. **Cleanup**: Regularly delete unused files to manage storage costs
## Integration Examples
### With Python Client
```python
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url="http://localhost:8000")
# Upload a file
with open("data.jsonl", "rb") as f:
file_info = await client.files.upload(file=f, purpose="fine-tune")
# List files
files = await client.files.list(purpose="fine-tune")
# Retrieve file content
content = await client.files.retrieve_content(file_info.id)
```
### With cURL
```bash
# Upload file
curl -X POST http://localhost:8000/v1/openai/v1/files \
-F "file=@data.jsonl" \
-F "purpose=fine-tune"
# List files
curl http://localhost:8000/v1/openai/v1/files
# Download file content
curl http://localhost:8000/v1/openai/v1/files/file-abc123/content \
-o downloaded_file.jsonl
```
## Provider Support
The Files API supports multiple storage backends:
- **Local Filesystem**: Store files on local disk (inline provider)
- **S3**: Store files in AWS S3 or S3-compatible services (remote provider)
- **Custom Backends**: Extensible architecture for custom storage providers
See the [Files Providers](index.md) documentation for detailed configuration options.

View file

@ -1,7 +1,8 @@
---
description: "Files
description: |
Files
This API is used to upload documents that can be used with other Llama Stack APIs."
This API is used to upload documents that can be used with other Llama Stack APIs.
sidebar_label: Files
title: Files
---

View file

@ -14,9 +14,9 @@ Local filesystem-based file storage provider for managing files and documents lo
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `storage_dir` | `<class 'str'>` | No | | Directory to store uploaded files |
| `metadata_store` | `<class 'llama_stack.core.storage.datatypes.SqlStoreReference'>` | No | | SQL store configuration for file metadata |
| `ttl_secs` | `<class 'int'>` | No | 31536000 | |
| `storage_dir` | `str` | No | | Directory to store uploaded files |
| `metadata_store` | `SqlStoreReference` | No | | SQL store configuration for file metadata |
| `ttl_secs` | `int` | No | 31536000 | |
## Sample Configuration

View file

@ -0,0 +1,80 @@
# File Operations Quick Reference
## Overview
As of release 0.2.14, Llama Stack provides comprehensive file operations and Vector Store API integration, following the [OpenAI Vector Store Files API specification](https://platform.openai.com/docs/api-reference/vector-stores-files).
> **Note**: For detailed overview and implementation details, see [Overview](../openai_file_operations_support.md#overview) in the full documentation.
## Supported Providers
> **Note**: For complete provider details and features, see [Supported Providers](../openai_file_operations_support.md#supported-providers) in the full documentation.
**Inline Providers**: FAISS, SQLite-vec, Milvus
**Remote Providers**: ChromaDB, Qdrant, Weaviate, PGVector
## Quick Start
### 1. Upload File
```python
file_info = await client.files.upload(
file=open("document.pdf", "rb"), purpose="assistants"
)
```
### 2. Create Vector Store
```python
vector_store = client.vector_stores.create(name="my_docs")
```
### 3. Attach File
```python
await client.vector_stores.files.create(
vector_store_id=vector_store.id, file_id=file_info.id
)
```
### 4. Search
```python
results = await client.vector_stores.search(
vector_store_id=vector_store.id, query="What is the main topic?", max_num_results=5
)
```
## File Processing & Search
**Processing**: 800 tokens default chunk size, 400 token overlap
**Formats**: PDF, DOCX, TXT, Code files, etc.
**Search**: Vector similarity, Hybrid (SQLite-vec), Filtered with metadata
## Configuration
> **Note**: For detailed configuration examples and options, see [Configuration Examples](../openai_file_operations_support.md#configuration-examples) in the full documentation.
**Basic Setup**: Configure vector_io and files providers in your run.yaml
## Common Use Cases
- **RAG Systems**: Document Q&A with file uploads
- **Knowledge Bases**: Searchable document collections
- **Content Analysis**: Document similarity and clustering
- **Research Tools**: Literature review and analysis
## Performance Tips
> **Note**: For detailed performance optimization strategies, see [Performance Considerations](../openai_file_operations_support.md#performance-considerations) in the full documentation.
**Quick Tips**: Choose provider based on your needs (speed vs. storage vs. scalability)
## Troubleshooting
> **Note**: For comprehensive troubleshooting, see [Troubleshooting](../openai_file_operations_support.md#troubleshooting) in the full documentation.
**Quick Fixes**: Check file format compatibility, optimize chunk sizes, monitor storage
## Resources
- [Full Documentation](openai_file_operations_support.md)
- [Integration Guide](../concepts/file_operations_vector_stores.md)
- [Files API](files_api.md)
- [Provider Details](../vector_io/index.md)

View file

@ -0,0 +1,291 @@
# File Operations Support in Vector Store Providers
## Overview
This document provides a comprehensive overview of file operations and Vector Store API support across all available vector store providers in Llama Stack. As of release 0.2.24, the following providers support full file operations integration.
## Supported Providers
### ✅ Full File Operations Support
The following providers support complete file operations integration, including file upload, automatic processing, and search:
#### Inline Providers (Single Node)
| Provider | File Operations | Key Features |
|----------|----------------|--------------|
| **FAISS** | ✅ Full Support | Fast in-memory search, GPU acceleration |
| **SQLite-vec** | ✅ Full Support | Hybrid search, disk-based storage |
| **Milvus** | ✅ Full Support | High-performance, scalable indexing |
#### Remote Providers (Hosted)
| Provider | File Operations | Key Features |
|----------|----------------|--------------|
| **ChromaDB** | ✅ Full Support | Metadata filtering, persistent storage |
| **Qdrant** | ✅ Full Support | Payload filtering, advanced search |
| **Weaviate** | ✅ Full Support | GraphQL interface, schema management |
| **Postgres (PGVector)** | ✅ Full Support | SQL integration, ACID compliance |
### 🔄 Partial Support
Some providers may support basic vector operations but lack full file operations integration:
| Provider | Status | Notes |
|----------|--------|-------|
| **Meta Reference** | 🔄 Basic | Core vector operations only |
## File Operations Features
All supported providers offer the following file operations capabilities:
### Core Functionality
- **File Upload & Processing**: Automatic document ingestion and chunking
- **Vector Storage**: Embedding generation and storage
- **Search & Retrieval**: Semantic search with metadata filtering
- **File Management**: List, retrieve, and manage files in vector stores
### Advanced Features
- **Automatic Chunking**: Configurable chunk sizes and overlap
- **Metadata Preservation**: File attributes and chunk metadata
- **Status Tracking**: Monitor file processing progress
- **Error Handling**: Comprehensive error reporting and recovery
## Implementation Details
### File Processing Pipeline
1. **Upload**: File uploaded via Files API
2. **Extraction**: Text content extracted from various formats
3. **Chunking**: Content split into optimal chunks (default: 800 tokens)
4. **Embedding**: Chunks converted to vector embeddings
5. **Storage**: Vectors stored with metadata in vector database
6. **Indexing**: Search index updated for fast retrieval
### Supported File Formats
- **Documents**: PDF, DOCX, DOC
- **Text**: TXT, MD, RST
- **Code**: Python, JavaScript, Java, C++, etc.
- **Data**: JSON, CSV, XML
- **Web**: HTML files
### Chunking Strategies
- **Default**: 800 tokens with 400 token overlap
- **Custom**: Configurable chunk sizes and overlap
- **Static**: Fixed-size chunks with overlap
## Provider-Specific Features
### FAISS
- **Storage**: In-memory with optional persistence
- **Performance**: Optimized for speed and GPU acceleration
- **Use Case**: High-performance, memory-constrained environments
### SQLite-vec
- **Storage**: Disk-based with SQLite backend
- **Search**: Hybrid vector + keyword search
- **Use Case**: Large document collections, frequent updates
### Milvus
- **Storage**: Scalable distributed storage
- **Indexing**: Multiple index types (IVF, HNSW)
- **Use Case**: Production deployments, large-scale applications
### ChromaDB
- **Storage**: Persistent storage with metadata
- **Filtering**: Advanced metadata filtering
- **Use Case**: Applications requiring rich metadata
### Qdrant
- **Storage**: High-performance vector database
- **Filtering**: Payload-based filtering
- **Use Case**: Real-time applications, complex queries
### Weaviate
- **Storage**: GraphQL-native vector database
- **Schema**: Flexible schema management
- **Use Case**: Applications requiring complex data relationships
### Postgres (PGVector)
- **Storage**: SQL database with vector extensions
- **Integration**: ACID compliance, existing SQL workflows
- **Use Case**: Applications requiring transactional guarantees
## Configuration Examples
### Basic Configuration
```yaml
vector_io:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
db_path: ~/.llama/faiss_store.db
```
### With FileResponse Support
```yaml
vector_io:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
db_path: ~/.llama/faiss_store.db
files:
- provider_id: local-files
provider_type: inline::localfs
config:
storage_dir: ~/.llama/files
metadata_store:
type: sqlite
db_path: ~/.llama/files_metadata.db
```
## Usage Examples
### Python Client
```python
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url="http://localhost:8000")
# Create vector store
vector_store = client.vector_stores.create(name="documents")
# Upload and process file
with open("document.pdf", "rb") as f:
file_info = await client.files.upload(file=f, purpose="assistants")
# Attach to vector store
await client.vector_stores.files.create(
vector_store_id=vector_store.id, file_id=file_info.id
)
# Search
results = await client.vector_stores.search(
vector_store_id=vector_store.id, query="What is the main topic?", max_num_results=5
)
```
### cURL Commands
```bash
# Upload file
curl -X POST http://localhost:8000/v1/openai/v1/files \
-F "file=@document.pdf" \
-F "purpose=assistants"
# Create vector store
curl -X POST http://localhost:8000/v1/openai/v1/vector_stores \
-H "Content-Type: application/json" \
-d '{"name": "documents"}'
# Attach file to vector store
curl -X POST http://localhost:8000/v1/openai/v1/vector_stores/{store_id}/files \
-H "Content-Type: application/json" \
-d '{"file_id": "file-abc123"}'
# Search vector store
curl -X POST http://localhost:8000/v1/openai/v1/vector_stores/{store_id}/search \
-H "Content-Type: application/json" \
-d '{"query": "What is the main topic?", "max_num_results": 5}'
```
## Performance Considerations
### Chunk Size Optimization
- **Small chunks (400-600 tokens)**: Better precision, more results
- **Large chunks (800-1200 tokens)**: Better context, fewer results
- **Overlap (50%)**: Maintains context between chunks
### Storage Efficiency
- **FAISS**: Fastest, but memory-limited
- **SQLite-vec**: Good balance of performance and storage
- **Milvus**: Scalable, production-ready
- **Remote providers**: Managed, but network-dependent
### Search Performance
- **Vector search**: Fastest for semantic queries
- **Hybrid search**: Best accuracy (SQLite-vec only)
- **Filtered search**: Fast with metadata constraints
## Troubleshooting
### Common Issues
1. **File Processing Failures**
- Check file format compatibility
- Verify file size limits
- Review error messages in file status
2. **Search Performance**
- Optimize chunk sizes for your use case
- Use filters to narrow search scope
- Monitor vector store metrics
3. **Storage Issues**
- Check available disk space
- Verify database permissions
- Monitor memory usage (for in-memory providers)
### Monitoring
```python
# Check file processing status
file_status = await client.vector_stores.files.retrieve(
vector_store_id=vector_store.id, file_id=file_info.id
)
if file_status.status == "failed":
print(f"Error: {file_status.last_error.message}")
# Monitor vector store health
health = await client.vector_stores.health(vector_store_id=vector_store.id)
print(f"Status: {health.status}")
```
## Best Practices
1. **File Organization**: Use descriptive names and organize by purpose
2. **Chunking Strategy**: Test different sizes for your specific use case
3. **Metadata**: Add relevant attributes for better filtering
4. **Monitoring**: Track processing status and search performance
5. **Cleanup**: Regularly remove unused files to manage storage
## Future Enhancements
Planned improvements for file operations support:
- **Batch Processing**: Process multiple files simultaneously
- **Advanced Chunking**: More sophisticated chunking algorithms
- **Custom Embeddings**: Support for custom embedding models
- **Real-time Updates**: Live file processing and indexing
- **Multi-format Support**: Enhanced file format support
## Support and Resources
- **Documentation**: [File Operations and Vector Store Integration](../../concepts/file_operations_vector_stores.mdx)
- **API Reference**: [Files API](files_api.md)
- **Provider Docs**: [Vector Store Providers](../vector_io/index.md)
- **Examples**: [Getting Started](../getting_started/index.md)
- **Community**: [GitHub Discussions](https://github.com/meta-llama/llama-stack/discussions)

View file

@ -14,8 +14,8 @@ OpenAI Files API provider for managing files through OpenAI's native file storag
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `api_key` | `<class 'str'>` | No | | OpenAI API key for authentication |
| `metadata_store` | `<class 'llama_stack.core.storage.datatypes.SqlStoreReference'>` | No | | SQL store configuration for file metadata |
| `api_key` | `str` | No | | OpenAI API key for authentication |
| `metadata_store` | `SqlStoreReference` | No | | SQL store configuration for file metadata |
## Sample Configuration

View file

@ -14,13 +14,13 @@ AWS S3-based file storage provider for scalable cloud file management with metad
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `bucket_name` | `<class 'str'>` | No | | S3 bucket name to store files |
| `region` | `<class 'str'>` | No | us-east-1 | AWS region where the bucket is located |
| `bucket_name` | `str` | No | | S3 bucket name to store files |
| `region` | `str` | No | us-east-1 | AWS region where the bucket is located |
| `aws_access_key_id` | `str \| None` | No | | AWS access key ID (optional if using IAM roles) |
| `aws_secret_access_key` | `str \| None` | No | | AWS secret access key (optional if using IAM roles) |
| `endpoint_url` | `str \| None` | No | | Custom S3 endpoint URL (for MinIO, LocalStack, etc.) |
| `auto_create_bucket` | `<class 'bool'>` | No | False | Automatically create the S3 bucket if it doesn't exist |
| `metadata_store` | `<class 'llama_stack.core.storage.datatypes.SqlStoreReference'>` | No | | SQL store configuration for file metadata |
| `auto_create_bucket` | `bool` | No | False | Automatically create the S3 bucket if it doesn't exist |
| `metadata_store` | `SqlStoreReference` | No | | SQL store configuration for file metadata |
## Sample Configuration

View file

@ -22,6 +22,7 @@ Importantly, Llama Stack always strives to provide at least one fully inline pro
## Provider Categories
- **[External Providers](external/index.mdx)** - Guide for building and using external providers
- **[OpenAI Compatibility](../api-openai/index.mdx)** - OpenAI API compatibility layer
- **[Inference](inference/index.mdx)** - LLM and embedding model providers
- **[Agents](agents/index.mdx)** - Agentic system providers
- **[DatasetIO](datasetio/index.mdx)** - Dataset and data loader providers
@ -30,6 +31,16 @@ Importantly, Llama Stack always strives to provide at least one fully inline pro
- **[Tool Runtime](tool_runtime/index.mdx)** - Tool and protocol providers
- **[Files](files/index.mdx)** - File system and storage providers
## Other information about Providers
- **[OpenAI Compatibility](./openai.mdx)** - OpenAI API compatibility layer
## API Documentation
For comprehensive API documentation and reference:
- **[API Reference](../api/index.mdx)** - Complete API documentation
- **[Experimental APIs](../api-experimental/index.mdx)** - APIs in development
- **[Deprecated APIs](../api-deprecated/index.mdx)** - Legacy APIs being phased out
- **[OpenAI Compatibility](../api-openai/index.mdx)** - OpenAI API compatibility guide
## Additional Provider Information
- **[OpenAI Implementation Guide](./openai.mdx)** - Code examples and implementation details for OpenAI APIs
- **[OpenAI-Compatible Responses Limitations](./openai_responses_limitations.mdx)** - Known limitations of the Responses API in Llama Stack

View file

@ -1,12 +1,13 @@
---
description: "Inference
description: |
Inference
Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Three kinds of models are supported:
- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search.
- Rerank models: these models reorder the documents based on their relevance to a query."
- Rerank models: these models reorder the documents based on their relevance to a query.
sidebar_label: Inference
title: Inference
---

View file

@ -16,12 +16,12 @@ Meta's reference implementation of inference with support for various model form
|-------|------|----------|---------|-------------|
| `model` | `str \| None` | No | | |
| `torch_seed` | `int \| None` | No | | |
| `max_seq_len` | `<class 'int'>` | No | 4096 | |
| `max_batch_size` | `<class 'int'>` | No | 1 | |
| `max_seq_len` | `int` | No | 4096 | |
| `max_batch_size` | `int` | No | 1 | |
| `model_parallel_size` | `int \| None` | No | | |
| `create_distributed_process_group` | `<class 'bool'>` | No | True | |
| `create_distributed_process_group` | `bool` | No | True | |
| `checkpoint_dir` | `str \| None` | No | | |
| `quantization` | `Bf16QuantizationConfig \| Fp8QuantizationConfig \| Int4QuantizationConfig, annotation=NoneType, required=True, discriminator='type'` | No | | |
| `quantization` | `Bf16QuantizationConfig \| Fp8QuantizationConfig \| Int4QuantizationConfig \| None` | No | | |
## Sample Configuration

View file

@ -14,9 +14,9 @@ Anthropic inference provider for accessing Claude models and Anthropic's AI serv
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
## Sample Configuration

View file

@ -21,10 +21,10 @@ https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `api_base` | `<class 'pydantic.networks.HttpUrl'>` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
| `api_base` | `HttpUrl` | No | | Azure API base URL (e.g., https://your-resource-name.openai.azure.com) |
| `api_version` | `str \| None` | No | | Azure API version (e.g., 2024-12-01-preview) |
| `api_type` | `str \| None` | No | azure | Azure API type (e.g., azure) |

View file

@ -14,10 +14,10 @@ AWS Bedrock inference provider using OpenAI compatible endpoint.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `region_name` | `<class 'str'>` | No | us-east-2 | AWS Region for the Bedrock Runtime endpoint |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
| `region_name` | `str` | No | us-east-2 | AWS Region for the Bedrock Runtime endpoint |
## Sample Configuration

View file

@ -14,10 +14,10 @@ Cerebras inference provider for running models on Cerebras Cloud platform.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `base_url` | `<class 'str'>` | No | https://api.cerebras.ai | Base URL for the Cerebras API |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
| `base_url` | `str` | No | https://api.cerebras.ai | Base URL for the Cerebras API |
## Sample Configuration

View file

@ -14,9 +14,9 @@ Databricks inference provider for running models on Databricks' unified analytic
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_token` | `pydantic.types.SecretStr \| None` | No | | The Databricks API token |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_token` | `SecretStr \| None` | No | | The Databricks API token |
| `url` | `str \| None` | No | | The URL for the Databricks model serving endpoint |
## Sample Configuration

View file

@ -14,10 +14,10 @@ Fireworks AI inference provider for Llama models and other AI models on the Fire
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks server |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `str` | No | https://api.fireworks.ai/inference/v1 | The URL for the Fireworks server |
## Sample Configuration

View file

@ -14,9 +14,9 @@ Google Gemini inference provider for accessing Gemini models and Google's AI ser
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
## Sample Configuration

View file

@ -14,10 +14,10 @@ Groq inference provider for ultra-fast inference using Groq's LPU technology.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://api.groq.com | The URL for the Groq AI server |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `str` | No | https://api.groq.com | The URL for the Groq AI server |
## Sample Configuration

View file

@ -14,8 +14,8 @@ HuggingFace Inference Endpoints provider for dedicated model serving.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `endpoint_name` | `<class 'str'>` | No | | The name of the Hugging Face Inference Endpoint in the format of '&#123;namespace&#125;/&#123;endpoint_name&#125;' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. |
| `api_token` | `pydantic.types.SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |
| `endpoint_name` | `str` | No | | The name of the Hugging Face Inference Endpoint in the format of '&#123;namespace&#125;/&#123;endpoint_name&#125;' (e.g. 'my-cool-org/meta-llama-3-1-8b-instruct-rce'). Namespace is optional and will default to the user account if not provided. |
| `api_token` | `SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |
## Sample Configuration

View file

@ -14,8 +14,8 @@ HuggingFace Inference API serverless provider for on-demand model inference.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `huggingface_repo` | `<class 'str'>` | No | | The model ID of the model on the Hugging Face Hub (e.g. 'meta-llama/Meta-Llama-3.1-70B-Instruct') |
| `api_token` | `pydantic.types.SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |
| `huggingface_repo` | `str` | No | | The model ID of the model on the Hugging Face Hub (e.g. 'meta-llama/Meta-Llama-3.1-70B-Instruct') |
| `api_token` | `SecretStr \| None` | No | | Your Hugging Face user access token (will default to locally saved token if not provided) |
## Sample Configuration

View file

@ -14,10 +14,10 @@ Llama OpenAI-compatible provider for using Llama models with OpenAI API format.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `openai_compat_api_base` | `<class 'str'>` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
| `openai_compat_api_base` | `str` | No | https://api.llama.com/compat/v1/ | The URL for the Llama API server |
## Sample Configuration

View file

@ -14,13 +14,13 @@ NVIDIA inference provider for accessing NVIDIA NIM models and AI services.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://integrate.api.nvidia.com | A base url for accessing the NVIDIA NIM |
| `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |
| `append_api_version` | `<class 'bool'>` | No | True | When set to false, the API version will not be appended to the base_url. By default, it is true. |
| `rerank_model_to_url` | `dict[str, str` | No | `{'nv-rerank-qa-mistral-4b:1': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking', 'nvidia/nv-rerankqa-mistral-4b-v3': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/nv-rerankqa-mistral-4b-v3/reranking', 'nvidia/llama-3.2-nv-rerankqa-1b-v2': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking'}` | Mapping of rerank model identifiers to their API endpoints. |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `str` | No | https://integrate.api.nvidia.com | Base URL for accessing NVIDIA NIM |
| `timeout` | `int` | No | 60 | Timeout for the HTTP requests |
| `append_api_version` | `bool` | No | True | When set to false, the API version will not be appended to the base_url. By default, it is true. |
| `rerank_model_to_url` | `dict[str, str]` | No | `{'nv-rerank-qa-mistral-4b:1': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking', 'nvidia/nv-rerankqa-mistral-4b-v3': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/nv-rerankqa-mistral-4b-v3/reranking', 'nvidia/llama-3.2-nv-rerankqa-1b-v2': 'https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking'}` | Mapping of rerank model identifiers to their API endpoints. |
## Sample Configuration

View file

@ -0,0 +1,41 @@
---
description: |
  Oracle Cloud Infrastructure (OCI) Generative AI inference provider for accessing OCI's Generative AI Platform-as-a-Service models.
  Provider documentation
  https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm
sidebar_label: Remote - Oci
title: remote::oci
---
# remote::oci
## Description
Oracle Cloud Infrastructure (OCI) Generative AI inference provider for accessing OCI's Generative AI Platform-as-a-Service models.
Provider documentation
https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
| `oci_auth_type` | `str` | No | instance_principal | OCI authentication type (must be one of: instance_principal, config_file) |
| `oci_region` | `str` | No | us-ashburn-1 | OCI region (e.g., us-ashburn-1) |
| `oci_compartment_id` | `str` | No | | OCI compartment ID for the Generative AI service |
| `oci_config_file_path` | `str` | No | ~/.oci/config | OCI config file path (required if oci_auth_type is config_file) |
| `oci_config_profile` | `str` | No | DEFAULT | OCI config profile (required if oci_auth_type is config_file) |
## Sample Configuration
```yaml
oci_auth_type: ${env.OCI_AUTH_TYPE:=instance_principal}
oci_config_file_path: ${env.OCI_CONFIG_FILE_PATH:=~/.oci/config}
oci_config_profile: ${env.OCI_CLI_PROFILE:=DEFAULT}
oci_region: ${env.OCI_REGION:=us-ashburn-1}
oci_compartment_id: ${env.OCI_COMPARTMENT_OCID:=}
```

View file

@ -14,9 +14,9 @@ Ollama inference provider for running local models through the Ollama runtime.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | http://localhost:11434 | |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `url` | `str` | No | http://localhost:11434 | |
## Sample Configuration

View file

@ -14,10 +14,10 @@ OpenAI inference provider for accessing GPT models and other OpenAI services.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `base_url` | `<class 'str'>` | No | https://api.openai.com/v1 | Base URL for OpenAI API |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
| `base_url` | `str` | No | https://api.openai.com/v1 | Base URL for OpenAI API |
## Sample Configuration

View file

@ -14,10 +14,10 @@ Passthrough inference provider for connecting to any external inference service
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | | The URL for the passthrough endpoint |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `str` | No | | The URL for the passthrough endpoint |
## Sample Configuration

View file

@ -14,9 +14,9 @@ RunPod inference provider for running models on RunPod's cloud GPU platform.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_token` | `pydantic.types.SecretStr \| None` | No | | The API token |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_token` | `SecretStr \| None` | No | | The API token |
| `url` | `str \| None` | No | | The URL for the Runpod model serving endpoint |
## Sample Configuration

View file

@ -14,10 +14,10 @@ SambaNova inference provider for running models on SambaNova's dataflow architec
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `str` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
## Sample Configuration

View file

@ -14,9 +14,9 @@ Text Generation Inference (TGI) provider for HuggingFace model serving.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `url` | `<class 'str'>` | No | | The URL for the TGI serving endpoint |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `url` | `str` | No | | The URL for the TGI serving endpoint |
## Sample Configuration

View file

@ -14,10 +14,10 @@ Together AI inference provider for open-source models and collaborative AI devel
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://api.together.xyz/v1 | The URL for the Together AI server |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `str` | No | https://api.together.xyz/v1 | The URL for the Together AI server |
## Sample Configuration

View file

@ -53,10 +53,10 @@ Available Models:
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `project` | `<class 'str'>` | No | | Google Cloud project ID for Vertex AI |
| `location` | `<class 'str'>` | No | us-central1 | Google Cloud location for Vertex AI |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `project` | `str` | No | | Google Cloud project ID for Vertex AI |
| `location` | `str` | No | us-central1 | Google Cloud location for Vertex AI |
## Sample Configuration

View file

@ -14,11 +14,11 @@ Remote vLLM inference provider for connecting to vLLM servers.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_token` | `pydantic.types.SecretStr \| None` | No | | The API token |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_token` | `SecretStr \| None` | No | | The API token |
| `url` | `str \| None` | No | | The URL for the vLLM model serving endpoint |
| `max_tokens` | `<class 'int'>` | No | 4096 | Maximum number of tokens to generate. |
| `max_tokens` | `int` | No | 4096 | Maximum number of tokens to generate. |
| `tls_verify` | `bool \| str` | No | True | Whether to verify TLS certificates. Can be a boolean or a path to a CA certificate file. |
## Sample Configuration

View file

@ -14,12 +14,12 @@ IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `<class 'str'>` | No | https://us-south.ml.cloud.ibm.com | A base url for accessing the watsonx.ai |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `api_key` | `SecretStr \| None` | No | | Authentication credential for the provider |
| `url` | `str` | No | https://us-south.ml.cloud.ibm.com | Base URL for accessing watsonx.ai |
| `project_id` | `str \| None` | No | | The watsonx.ai project ID |
| `timeout` | `<class 'int'>` | No | 60 | Timeout for the HTTP requests |
| `timeout` | `int` | No | 60 | Timeout for the HTTP requests |
## Sample Configuration

View file

@ -1,9 +1,14 @@
---
title: OpenAI Compatibility
description: OpenAI API Compatibility
sidebar_label: OpenAI Compatibility
sidebar_position: 1
title: OpenAI Implementation Guide
description: Code examples and implementation details for OpenAI API compatibility
sidebar_label: OpenAI Implementation
sidebar_position: 2
---
# OpenAI Implementation Guide
This guide provides detailed code examples and implementation details for using OpenAI-compatible APIs with Llama Stack. For a comprehensive overview of OpenAI compatibility features, see our [OpenAI API Compatibility Guide](../api-openai/index.mdx).
## OpenAI API Compatibility
### Server path
@ -195,3 +200,9 @@ Lines of code unfurl
Logic whispers in the dark
Art in hidden form
```
## Additional Resources
- **[OpenAI API Compatibility Guide](../api-openai/index.mdx)** - Comprehensive overview of OpenAI compatibility features
- **[OpenAI Responses API Limitations](./openai_responses_limitations.mdx)** - Detailed limitations and known issues
- **[Provider Documentation](../index.mdx)** - Complete provider ecosystem overview

View file

@ -14,23 +14,23 @@ HuggingFace-based post-training provider for fine-tuning models using the Huggin
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `device` | `<class 'str'>` | No | cuda | |
| `distributed_backend` | `Literal['fsdp', 'deepspeed'` | No | | |
| `checkpoint_format` | `Literal['full_state', 'huggingface'` | No | huggingface | |
| `chat_template` | `<class 'str'>` | No | `&lt;|user|&gt;`<br/>`{input}`<br/>`&lt;|assistant|&gt;`<br/>`{output}` | |
| `model_specific_config` | `<class 'dict'>` | No | `{'trust_remote_code': True, 'attn_implementation': 'sdpa'}` | |
| `max_seq_length` | `<class 'int'>` | No | 2048 | |
| `gradient_checkpointing` | `<class 'bool'>` | No | False | |
| `save_total_limit` | `<class 'int'>` | No | 3 | |
| `logging_steps` | `<class 'int'>` | No | 10 | |
| `warmup_ratio` | `<class 'float'>` | No | 0.1 | |
| `weight_decay` | `<class 'float'>` | No | 0.01 | |
| `dataloader_num_workers` | `<class 'int'>` | No | 4 | |
| `dataloader_pin_memory` | `<class 'bool'>` | No | True | |
| `dpo_beta` | `<class 'float'>` | No | 0.1 | |
| `use_reference_model` | `<class 'bool'>` | No | True | |
| `dpo_loss_type` | `Literal['sigmoid', 'hinge', 'ipo', 'kto_pair'` | No | sigmoid | |
| `dpo_output_dir` | `<class 'str'>` | No | | |
| `device` | `str` | No | cuda | |
| `distributed_backend` | `Literal[fsdp, deepspeed] \| None` | No | | |
| `checkpoint_format` | `Literal[full_state, huggingface] \| None` | No | huggingface | |
| `chat_template` | `str` | No | `&lt;|user|&gt;`<br/>`{input}`<br/>`&lt;|assistant|&gt;`<br/>`{output}` | |
| `model_specific_config` | `dict` | No | `{'trust_remote_code': True, 'attn_implementation': 'sdpa'}` | |
| `max_seq_length` | `int` | No | 2048 | |
| `gradient_checkpointing` | `bool` | No | False | |
| `save_total_limit` | `int` | No | 3 | |
| `logging_steps` | `int` | No | 10 | |
| `warmup_ratio` | `float` | No | 0.1 | |
| `weight_decay` | `float` | No | 0.01 | |
| `dataloader_num_workers` | `int` | No | 4 | |
| `dataloader_pin_memory` | `bool` | No | True | |
| `dpo_beta` | `float` | No | 0.1 | |
| `use_reference_model` | `bool` | No | True | |
| `dpo_loss_type` | `Literal[sigmoid, hinge, ipo, kto_pair]` | No | sigmoid | |
| `dpo_output_dir` | `str` | No | | |
## Sample Configuration

View file

@ -15,7 +15,7 @@ TorchTune-based post-training provider for fine-tuning and optimizing models usi
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `torch_seed` | `int \| None` | No | | |
| `checkpoint_format` | `Literal['meta', 'huggingface'` | No | meta | |
| `checkpoint_format` | `Literal[meta, huggingface] \| None` | No | meta | |
## Sample Configuration

View file

@ -15,7 +15,7 @@ TorchTune-based post-training provider for fine-tuning and optimizing models usi
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `torch_seed` | `int \| None` | No | | |
| `checkpoint_format` | `Literal['meta', 'huggingface'` | No | meta | |
| `checkpoint_format` | `Literal[meta, huggingface] \| None` | No | meta | |
## Sample Configuration

View file

@ -18,9 +18,9 @@ NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform.
| `dataset_namespace` | `str \| None` | No | default | The NVIDIA dataset namespace. |
| `project_id` | `str \| None` | No | test-example-model@v1 | The NVIDIA project ID. |
| `customizer_url` | `str \| None` | No | | Base URL for the NeMo Customizer API |
| `timeout` | `<class 'int'>` | No | 300 | Timeout for the NVIDIA Post Training API |
| `max_retries` | `<class 'int'>` | No | 3 | Maximum number of retries for the NVIDIA Post Training API |
| `output_model_dir` | `<class 'str'>` | No | test-example-model@v1 | Directory to save the output model |
| `timeout` | `int` | No | 300 | Timeout for the NVIDIA Post Training API |
| `max_retries` | `int` | No | 3 | Maximum number of retries for the NVIDIA Post Training API |
| `output_model_dir` | `str` | No | test-example-model@v1 | Directory to save the output model |
## Sample Configuration

View file

@ -1,7 +1,8 @@
---
description: "Safety
description: |
Safety
OpenAI-compatible Moderations API."
OpenAI-compatible Moderations API.
sidebar_label: Safety
title: Safety
---

View file

@ -14,7 +14,7 @@ Llama Guard safety provider for content moderation and safety filtering using Me
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `excluded_categories` | `list[str` | No | [] | |
| `excluded_categories` | `list[str]` | No | [] | |
## Sample Configuration

View file

@ -14,7 +14,7 @@ Prompt Guard safety provider for detecting and filtering unsafe prompts and cont
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `guard_type` | `<class 'str'>` | No | injection | |
| `guard_type` | `str` | No | injection | |
## Sample Configuration

View file

@ -14,8 +14,8 @@ AWS Bedrock safety provider for content moderation using AWS's safety services.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `allowed_models` | `list[str \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `<class 'bool'>` | No | False | Whether to refresh models periodically from the provider |
| `allowed_models` | `list[str] \| None` | No | | List of models that should be registered with the model registry. If None, all models are allowed. |
| `refresh_models` | `bool` | No | False | Whether to refresh models periodically from the provider |
| `aws_access_key_id` | `str \| None` | No | | The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID |
| `aws_secret_access_key` | `str \| None` | No | | The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY |
| `aws_session_token` | `str \| None` | No | | The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN |

View file

@ -14,7 +14,7 @@ NVIDIA's safety provider for content moderation and safety filtering.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `guardrails_service_url` | `<class 'str'>` | No | http://0.0.0.0:7331 | The url for accessing the Guardrails service |
| `guardrails_service_url` | `str` | No | http://0.0.0.0:7331 | The url for accessing the Guardrails service |
| `config_id` | `str \| None` | No | self-check | Guardrails configuration ID to use from the Guardrails configuration store |
## Sample Configuration

View file

@ -14,8 +14,8 @@ SambaNova's safety provider for content moderation and safety filtering.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `url` | `<class 'str'>` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
| `api_key` | `pydantic.types.SecretStr \| None` | No | | The SambaNova cloud API Key |
| `url` | `str` | No | https://api.sambanova.ai/v1 | The URL for the SambaNova AI server |
| `api_key` | `SecretStr \| None` | No | | The SambaNova cloud API Key |
## Sample Configuration

View file

@ -15,7 +15,7 @@ Bing Search tool for web search capabilities using Microsoft's search engine.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `api_key` | `str \| None` | No | | |
| `top_k` | `<class 'int'>` | No | 3 | |
| `top_k` | `int` | No | 3 | |
## Sample Configuration

View file

@ -15,7 +15,7 @@ Brave Search tool for web search capabilities with privacy-focused results.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `api_key` | `str \| None` | No | | The Brave Search API Key |
| `max_results` | `<class 'int'>` | No | 3 | The maximum number of results to return |
| `max_results` | `int` | No | 3 | The maximum number of results to return |
## Sample Configuration

View file

@ -15,7 +15,7 @@ Tavily Search tool for AI-optimized web search with structured results.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `api_key` | `str \| None` | No | | The Tavily Search API Key |
| `max_results` | `<class 'int'>` | No | 3 | The maximum number of results to return |
| `max_results` | `int` | No | 3 | The maximum number of results to return |
## Sample Configuration

View file

@ -78,8 +78,8 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | |
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend |
| `db_path` | `str` | No | | |
| `persistence` | `KVStoreReference` | No | | Config for KV store backend |
## Sample Configuration

View file

@ -95,7 +95,7 @@ more details about Faiss in general.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
| `persistence` | `KVStoreReference` | No | | |
## Sample Configuration

View file

@ -14,7 +14,7 @@ Meta's reference implementation of a vector database.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
| `persistence` | `KVStoreReference` | No | | |
## Sample Configuration

View file

@ -16,9 +16,9 @@ Please refer to the remote provider documentation.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | |
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
| `db_path` | `str` | No | | |
| `persistence` | `KVStoreReference` | No | | Config for KV store backend (SQLite only for now) |
| `consistency_level` | `str` | No | Strong | The consistency level of the Milvus server |
## Sample Configuration

View file

@ -97,8 +97,8 @@ See the [Qdrant documentation](https://qdrant.tech/documentation/) for more deta
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `path` | `<class 'str'>` | No | | |
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
| `path` | `str` | No | | |
| `persistence` | `KVStoreReference` | No | | |
## Sample Configuration

View file

@ -153,7 +153,7 @@ description: |
Example using RAGQueryConfig with different search modes:
```python
from llama_stack.apis.tools import RAGQueryConfig, RRFRanker, WeightedRanker
from llama_stack_api import RAGQueryConfig, RRFRanker, WeightedRanker
# Vector search
config = RAGQueryConfig(mode="vector", max_chunks=5)
@ -358,7 +358,7 @@ Two ranker types are supported:
Example using RAGQueryConfig with different search modes:
```python
from llama_stack.apis.tools import RAGQueryConfig, RRFRanker, WeightedRanker
from llama_stack_api import RAGQueryConfig, RRFRanker, WeightedRanker
# Vector search
config = RAGQueryConfig(mode="vector", max_chunks=5)
@ -407,8 +407,8 @@ See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) f
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file |
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
| `db_path` | `str` | No | | Path to the SQLite database file |
| `persistence` | `KVStoreReference` | No | | Config for KV store backend (SQLite only for now) |
## Sample Configuration

View file

@ -16,8 +16,8 @@ Please refer to the sqlite-vec provider documentation.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | | Path to the SQLite database file |
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend (SQLite only for now) |
| `db_path` | `str` | No | | Path to the SQLite database file |
| `persistence` | `KVStoreReference` | No | | Config for KV store backend (SQLite only for now) |
## Sample Configuration

View file

@ -78,7 +78,7 @@ See [Chroma's documentation](https://docs.trychroma.com/docs/overview/introducti
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `url` | `str \| None` | No | | |
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend |
| `persistence` | `KVStoreReference` | No | | Config for KV store backend |
## Sample Configuration

View file

@ -405,10 +405,10 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `uri` | `<class 'str'>` | No | | The URI of the Milvus server |
| `uri` | `str` | No | | The URI of the Milvus server |
| `token` | `str \| None` | No | | The token of the Milvus server |
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | Config for KV store backend |
| `consistency_level` | `str` | No | Strong | The consistency level of the Milvus server |
| `persistence` | `KVStoreReference` | No | | Config for KV store backend |
| `config` | `dict` | No | `{}` | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |
:::note

View file

@ -218,7 +218,7 @@ See [PGVector's documentation](https://github.com/pgvector/pgvector) for more de
| `db` | `str \| None` | No | postgres | |
| `user` | `str \| None` | No | postgres | |
| `password` | `str \| None` | No | mysecretpassword | |
| `persistence` | `llama_stack.core.storage.datatypes.KVStoreReference \| None` | No | | Config for KV store backend (SQLite only for now) |
| `persistence` | `KVStoreReference \| None` | No | | Config for KV store backend (SQLite only for now) |
## Sample Configuration

View file

@ -19,14 +19,14 @@ Please refer to the inline provider documentation.
| `location` | `str \| None` | No | | |
| `url` | `str \| None` | No | | |
| `port` | `int \| None` | No | 6333 | |
| `grpc_port` | `<class 'int'>` | No | 6334 | |
| `prefer_grpc` | `<class 'bool'>` | No | False | |
| `grpc_port` | `int` | No | 6334 | |
| `prefer_grpc` | `bool` | No | False | |
| `https` | `bool \| None` | No | | |
| `api_key` | `str \| None` | No | | |
| `prefix` | `str \| None` | No | | |
| `timeout` | `int \| None` | No | | |
| `host` | `str \| None` | No | | |
| `persistence` | `<class 'llama_stack.core.storage.datatypes.KVStoreReference'>` | No | | |
| `persistence` | `KVStoreReference` | No | | |
## Sample Configuration

View file

@ -75,7 +75,7 @@ See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more
|-------|------|----------|---------|-------------|
| `weaviate_api_key` | `str \| None` | No | | The API key for the Weaviate instance |
| `weaviate_cluster_url` | `str \| None` | No | localhost:8080 | The URL of the Weaviate cluster |
| `persistence` | `llama_stack.core.storage.datatypes.KVStoreReference \| None` | No | | Config for KV store backend (SQLite only for now) |
| `persistence` | `KVStoreReference \| None` | No | | Config for KV store backend (SQLite only for now) |
## Sample Configuration

View file

@ -1 +0,0 @@
The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/core/server/endpoints.py` using the `generate.py` utility.

View file

@ -1,134 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms found in the
# LICENSE file in the root directory of this source tree.
from datetime import datetime
from pathlib import Path
import sys
import fire
import ruamel.yaml as yaml
from llama_stack.apis.version import LLAMA_STACK_API_V1 # noqa: E402
from llama_stack.core.stack import LlamaStack # noqa: E402
from .pyopenapi.options import Options # noqa: E402
from .pyopenapi.specification import Info, Server # noqa: E402
from .pyopenapi.utility import Specification, validate_api # noqa: E402
def str_presenter(dumper, data):
    """Represent a string as a YAML scalar, folding long or multi-line text.

    Strings that start with ``/{LLAMA_STACK_API_V1}`` or with
    ``#/components/schemas/`` are always emitted with the default scalar
    style. Any other string that contains a newline or is longer than 40
    characters is emitted with the folded (``>``) style; everything else
    uses the default style.

    Args:
        dumper: Object providing ``represent_scalar`` (the YAML representer).
        data: The string value to represent.

    Returns:
        Whatever ``dumper.represent_scalar`` returns for the chosen style.
    """
    keep_default_prefixes = (f"/{LLAMA_STACK_API_V1}", "#/components/schemas/")
    needs_folding = "\n" in data or len(data) > 40
    if data.startswith(keep_default_prefixes) or not needs_folding:
        scalar_style = None
    else:
        scalar_style = ">"
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=scalar_style)
def generate_spec(
    output_dir: Path,
    stability_filter: str | None = None,
    main_spec: bool = False,
    combined_spec: bool = False,
):
    """Generate one OpenAPI YAML spec, optionally filtered by API stability.

    The file is written to ``output_dir`` as
    ``{filename_prefix}llama-stack-spec.yaml``, where the prefix depends on
    the arguments (see below).

    Args:
        output_dir: Directory the YAML file is written into.
        stability_filter: Stability level forwarded to the generator through
            ``Options(stability_filter=...)``. The values handled explicitly
            here are ``"stable"``, ``"experimental"`` and ``"deprecated"``;
            any other non-empty value gets a title suffix derived from the
            value itself. ``None`` produces an unfiltered, unprefixed spec.
        main_spec: When true together with ``stability_filter == "stable"``,
            the title suffix and the filename prefix are both left empty.
        combined_spec: When true, ``stability_filter`` is overridden with
            ``"stainless"`` and the file is written with the ``stainless-``
            prefix.
    """
    if combined_spec:
        # Special case for combined stable + experimental APIs
        title_suffix = " - Stable & Experimental APIs"
        filename_prefix = "stainless-"
        description_suffix = "\n\n**🔗 COMBINED**: This specification includes both stable production-ready APIs and experimental pre-release APIs. Use stable APIs for production deployments and experimental APIs for testing new features."
        # Use the special "stainless" filter to include stable + experimental APIs
        stability_filter = "stainless"
    elif stability_filter:
        title_suffix = {
            "stable": " - Stable APIs" if not main_spec else "",
            "experimental": " - Experimental APIs",
            "deprecated": " - Deprecated APIs"
        }.get(stability_filter, f" - {stability_filter.title()} APIs")

        # Use main spec filename for stable when main_spec=True
        if main_spec and stability_filter == "stable":
            filename_prefix = ""
        else:
            filename_prefix = f"{stability_filter}-"

        description_suffix = {
            "stable": "\n\n**✅ STABLE**: Production-ready APIs with backward compatibility guarantees.",
            "experimental": "\n\n**🧪 EXPERIMENTAL**: Pre-release APIs (v1alpha, v1beta) that may change before becoming stable.",
            "deprecated": "\n\n**⚠️ DEPRECATED**: Legacy APIs that may be removed in future versions. Use for migration reference only."
        }.get(stability_filter, "")
    else:
        title_suffix = ""
        filename_prefix = ""
        description_suffix = ""

    spec = Specification(
        LlamaStack,
        Options(
            server=Server(url="http://any-hosted-llama-stack.com"),
            info=Info(
                title=f"Llama Stack Specification{title_suffix}",
                version=LLAMA_STACK_API_V1,
                description=f"""This is the specification of the Llama Stack that provides
a set of endpoints and their corresponding interfaces that are tailored to
best leverage Llama Models.{description_suffix}""",
            ),
            include_standard_error_responses=True,
            stability_filter=stability_filter,  # Pass the filter to the generator
        ),
    )

    # Build the document and configure the emitter BEFORE opening the output
    # file: opening with "w" truncates immediately, so a failure while
    # generating the spec would otherwise leave an empty or partial file
    # in place of a previously generated one.
    spec_json = spec.get_json()

    y = yaml.YAML()
    y.default_flow_style = False
    y.block_seq_indent = 2
    y.map_indent = 2
    y.sequence_indent = 4
    y.sequence_dash_offset = 2
    y.width = 80
    y.allow_unicode = True
    y.representer.add_representer(str, str_presenter)

    yaml_filename = f"{filename_prefix}llama-stack-spec.yaml"
    with open(output_dir / yaml_filename, "w", encoding="utf-8") as fp:
        y.dump(spec_json, fp)
def main(output_dir: str):
    """Validate the API protocols, then generate every spec variant.

    Four specifications are written into ``output_dir`` through
    ``generate_spec``: the main (stable) spec, the experimental spec, the
    deprecated spec, and the combined stable + experimental spec.

    Args:
        output_dir: Path of an existing directory to write the specs into.

    Raises:
        ValueError: If ``output_dir`` does not exist.
        SystemExit: With status 1 when ``validate_api()`` reports errors.
    """
    output_dir = Path(output_dir)
    if not output_dir.exists():
        raise ValueError(f"Directory {output_dir} does not exist")

    # Validate API protocols before generating spec
    return_type_errors = validate_api()
    if return_type_errors:
        # Header and details both go to stderr so they stay together when
        # stdout is redirected or captured separately.
        print("\nAPI Method Return Type Validation Errors:\n", file=sys.stderr)
        for error in return_type_errors:
            print(error, file=sys.stderr)
        sys.exit(1)

    now = str(datetime.now())
    # Only YAML specification files are produced here; report the real target.
    print(f"Generating the OpenAPI YAML specifications in {output_dir} at {now}")
    print("")

    # Generate main spec as stable APIs (llama-stack-spec.yaml)
    print("Generating main specification (stable APIs)...")
    generate_spec(output_dir, "stable", main_spec=True)

    print("Generating other stability-filtered specifications...")
    generate_spec(output_dir, "experimental")
    generate_spec(output_dir, "deprecated")

    print("Generating combined stable + experimental specification...")
    generate_spec(output_dir, combined_spec=True)
if __name__ == "__main__":
    # Script entry point: hand `main` to python-fire so it can be invoked
    # from the command line.
    fire.Fire(main)

View file

@ -1 +0,0 @@
This is forked from https://github.com/hunyadi/pyopenapi

File diff suppressed because it is too large Load diff

Some files were not shown because too many files have changed in this diff Show more