Merge branch 'main' into feat/litellm_sambanova_usage

jhpiedrahitao committed on 2025-04-01 07:57:21 -05:00
commit 9c9f9577e2
173 changed files with 3073 additions and 3118 deletions

.github/CODEOWNERS vendored
View file

@ -2,4 +2,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan @SLR722
* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan @SLR722 @leseb

View file

@ -25,7 +25,8 @@ jobs:
matrix:
# Listing tests manually since some of them currently fail
# TODO: generate matrix list from tests/integration when fixed
test-type: [inference, datasets, inspect, scoring, post_training, providers]
test-type: [agents, inference, datasets, inspect, scoring, post_training, providers]
client-type: [library, http]
fail-fast: false # we want to run all tests regardless of failure
steps:
@ -54,6 +55,8 @@ jobs:
uv sync --extra dev --extra test
uv pip install ollama faiss-cpu
# always test against the latest version of the client
# TODO: this is not necessarily a good idea. we need to test against both published and latest
# to find out backwards compatibility issues.
uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
uv pip install -e .
llama stack build --template ollama --image-type venv
@ -74,6 +77,7 @@ jobs:
exit 1
- name: Start Llama Stack server in background
if: matrix.client-type == 'http'
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
run: |
@ -81,6 +85,7 @@ jobs:
nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv > server.log 2>&1 &
- name: Wait for Llama Stack server to be ready
if: matrix.client-type == 'http'
run: |
echo "Waiting for Llama Stack server..."
for i in {1..30}; do
@ -98,4 +103,12 @@ jobs:
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
run: |
uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=ollama --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2
if [ "${{ matrix.client-type }}" == "library" ]; then
stack_config="ollama"
else
stack_config="http://localhost:8321"
fi
uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
-k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
--text-model="meta-llama/Llama-3.2-3B-Instruct" \
--embedding-model=all-MiniLM-L6-v2

View file

@ -1,5 +1,76 @@
# Changelog
# v0.1.8
Published on: 2025-03-24T01:28:50Z
# v0.1.8 Release Notes
### Build and Test Agents
* Safety: Integrated NVIDIA as a safety provider.
* VectorDB: Added Qdrant as an inline provider.
* Agents: Added support for multiple tool groups in agents.
* Agents: Simplified imports for Agents in client package
### Agent Evals and Model Customization
* Introduced DocVQA and IfEval benchmarks.
### Deploying and Monitoring Agents
* Introduced a Containerfile and image workflow for the Playground.
* Implemented support for Bearer (API Key) authentication.
* Added attribute-based access control for resources.
* Fixes for Docker deployments: use `--pull always` and standardize the default port to 8321
* Deprecated: `/v1/inspect/providers`; use `/v1/providers/` instead
### Better Engineering
* Consolidated scripts under the ./scripts directory.
* Addressed mypy violations in various modules.
* Added Dependabot scans for Python dependencies.
* Implemented a scheduled workflow to update the changelog automatically.
* Enforced concurrency to reduce CI loads.
### New Contributors
* @cmodi-meta made their first contribution in https://github.com/meta-llama/llama-stack/pull/1650
* @jeffmaury made their first contribution in https://github.com/meta-llama/llama-stack/pull/1671
* @derekhiggins made their first contribution in https://github.com/meta-llama/llama-stack/pull/1698
* @Bobbins228 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1745
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.7...v0.1.8
---
# v0.1.7
Published on: 2025-03-14T22:30:51Z
## 0.1.7 Release Notes
### Build and Test Agents
* Inference: ImageType is now refactored to LlamaStackImageType
* Inference: Added tests to measure TTFT
* Inference: Bring back usage metrics
* Agents: Added endpoint for get agent, list agents and list sessions
* Agents: Automated conversion of type hints in client tool for lite llm format
* Agents: Deprecated ToolResponseMessage in agent.resume API
* Added Provider API for listing and inspecting provider info
### Agent Evals and Model Customization
* Eval: Added new eval benchmarks Math 500 and BFCL v3
### Deploy and Monitoring of Agents
* Telemetry: Fix tracing to work across coroutines
### Better Engineering
* Display code coverage for unit tests
* Updated call sites (inference, tool calls, agents) to move to async non-blocking calls
* Unit tests also run on Python 3.11, 3.12, and 3.13
* Added ollama inference to Integration tests CI
* Improved documentation across examples, testing, CLI, and updated the providers table
---
# v0.1.6
Published on: 2025-03-08T04:35:08Z

View file

@ -81,12 +81,14 @@ Note that you can create a dotenv file `.env` that includes necessary environmen
LLAMA_STACK_BASE_URL=http://localhost:8321
LLAMA_STACK_CLIENT_LOG=debug
LLAMA_STACK_PORT=8321
LLAMA_STACK_CONFIG=
LLAMA_STACK_CONFIG=<provider-name>
TAVILY_SEARCH_API_KEY=
BRAVE_SEARCH_API_KEY=
```
And then use this dotenv file when running client SDK tests via the following:
```bash
uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py
uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
```
## Pre-commit Hooks
@ -124,6 +126,10 @@ source .venv/bin/activate
PYTHON_VERSION=3.13 ./scripts/unit-tests.sh
```
## Running integration tests
You can run integration tests following the instructions [here](tests/integration/README.md).
## Adding a new dependency to the project
To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run:

View file

@ -1,5 +1,5 @@
include pyproject.toml
include distributions/dependencies.json
include llama_stack/templates/dependencies.json
include llama_stack/models/llama/llama3/tokenizer.model
include llama_stack/distribution/*.sh
include llama_stack/cli/scripts/*.sh

View file

@ -1 +0,0 @@
../../llama_stack/templates/bedrock/build.yaml

View file

@ -1,15 +0,0 @@
services:
llamastack:
image: distribution-bedrock
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/llamastack-run-bedrock.yaml
ports:
- "8321:8321"
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -1 +0,0 @@
../../llama_stack/templates/bedrock/run.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/cerebras/build.yaml

View file

@ -1,16 +0,0 @@
services:
llamastack:
image: llamastack/distribution-cerebras
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/llamastack-run-cerebras.yaml
ports:
- "8321:8321"
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-cerebras.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -1 +0,0 @@
../../llama_stack/templates/cerebras/run.yaml

View file

@ -1,50 +0,0 @@
services:
text-generation-inference:
image: registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct
network_mode: "host"
volumes:
- $HOME/.cache/huggingface:/data
ports:
- "5009:5009"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0,1,2,3,4
- NUM_SHARD=4
- MAX_BATCH_PREFILL_TOKENS=32768
- MAX_INPUT_TOKENS=8000
- MAX_TOTAL_TOKENS=8192
command: []
deploy:
resources:
reservations:
devices:
- driver: nvidia
# that's the closest analogue to --gpus; provide
# an integer amount of devices or 'all'
count: all
# Devices are reserved using a list of capabilities, making
# capabilities the only required field. A device MUST
# satisfy all the requested capabilities for a successful
# reservation.
capabilities: [gpu]
runtime: nvidia
llamastack:
depends_on:
text-generation-inference:
condition: service_healthy
image: llamastack/distribution-tgi
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
# Link to TGI run.yaml file
- ./run.yaml:/root/my-run.yaml
ports:
- "8321:8321"
# Hack: wait for TGI server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -1,44 +0,0 @@
version: '2'
image_name: local
container_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: tgi0
provider_type: remote::tgi
config:
url: http://127.0.0.1:80
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
model: Llama-Guard-3-1B
excluded_categories: []
- provider_id: meta1
provider_type: inline::prompt-guard
config:
model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: inline::faiss
config: {}
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}

View file

@ -1 +0,0 @@
../../llama_stack/templates/fireworks/build.yaml

View file

@ -1,14 +0,0 @@
services:
llamastack:
image: llamastack/distribution-fireworks
ports:
- "8321:8321"
environment:
- FIREWORKS_API_KEY=${FIREWORKS_API_KEY}
entrypoint: bash -c "python -m llama_stack.distribution.server.server --template fireworks"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -1 +0,0 @@
../../llama_stack/templates/fireworks/run.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/meta-reference-gpu/build.yaml

View file

@ -1,34 +0,0 @@
services:
llamastack:
image: llamastack/distribution-meta-reference-gpu
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/my-run.yaml
ports:
- "8321:8321"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0
command: []
deploy:
resources:
reservations:
devices:
- driver: nvidia
# that's the closest analogue to --gpus; provide
# an integer amount of devices or 'all'
count: 1
# Devices are reserved using a list of capabilities, making
# capabilities the only required field. A device MUST
# satisfy all the requested capabilities for a successful
# reservation.
capabilities: [gpu]
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s
runtime: nvidia
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"

View file

@ -1 +0,0 @@
../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/meta-reference-gpu/run.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml

View file

@ -1,35 +0,0 @@
services:
llamastack:
image: llamastack/distribution-meta-reference-quantized-gpu
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/my-run.yaml
ports:
- "8321:8321"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0
command: []
deploy:
resources:
reservations:
devices:
- driver: nvidia
# that's the closest analogue to --gpus; provide
# an integer amount of devices or 'all'
count: 1
# Devices are reserved using a list of capabilities, making
# capabilities the only required field. A device MUST
# satisfy all the requested capabilities for a successful
# reservation.
capabilities: [gpu]
runtime: nvidia
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -1,58 +0,0 @@
version: '2'
image_name: local
container_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: meta0
provider_type: inline::meta-reference-quantized
config:
model: Llama3.2-3B-Instruct:int4-qlora-eo8
quantization:
type: int4
torch_seed: null
max_seq_len: 2048
max_batch_size: 1
- provider_id: meta1
provider_type: inline::meta-reference-quantized
config:
# not a quantized model !
model: Llama-Guard-3-1B
quantization: null
torch_seed: null
max_seq_len: 2048
max_batch_size: 1
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
model: Llama-Guard-3-1B
excluded_categories: []
- provider_id: meta1
provider_type: inline::prompt-guard
config:
model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}

View file

@ -1 +0,0 @@
../../llama_stack/templates/ollama/build.yaml

View file

@ -1,71 +0,0 @@
services:
ollama:
image: ollama/ollama:latest
network_mode: ${NETWORK_MODE:-bridge}
volumes:
- ~/.ollama:/root/.ollama
ports:
- "11434:11434"
environment:
OLLAMA_DEBUG: 1
command: []
deploy:
resources:
limits:
memory: 8G # Set maximum memory
reservations:
memory: 8G # Set minimum memory reservation
# healthcheck:
# # ugh, no CURL in ollama image
# test: ["CMD", "curl", "-f", "http://ollama:11434"]
# interval: 10s
# timeout: 5s
# retries: 5
ollama-init:
image: ollama/ollama:latest
depends_on:
- ollama
# condition: service_healthy
network_mode: ${NETWORK_MODE:-bridge}
environment:
- OLLAMA_HOST=ollama
- INFERENCE_MODEL=${INFERENCE_MODEL}
- SAFETY_MODEL=${SAFETY_MODEL:-}
volumes:
- ~/.ollama:/root/.ollama
- ./pull-models.sh:/pull-models.sh
entrypoint: ["/pull-models.sh"]
llamastack:
depends_on:
ollama:
condition: service_started
ollama-init:
condition: service_started
image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama}
network_mode: ${NETWORK_MODE:-bridge}
volumes:
- ~/.llama:/root/.llama
# Link to ollama run.yaml file
- ~/local/llama-stack/:/app/llama-stack-source
- ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
ports:
- "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
environment:
- INFERENCE_MODEL=${INFERENCE_MODEL}
- SAFETY_MODEL=${SAFETY_MODEL:-}
- OLLAMA_URL=http://ollama:11434
entrypoint: >
python -m llama_stack.distribution.server.server /root/my-run.yaml \
--port ${LLAMA_STACK_PORT:-8321}
deploy:
restart_policy:
condition: on-failure
delay: 10s
max_attempts: 3
window: 60s
volumes:
ollama:
ollama-init:
llamastack:

View file

@ -1,18 +0,0 @@
#!/bin/sh
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..."
for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do
echo "Preloading $model..."
if ! ollama run "$model"; then
echo "Failed to pull and run $model"
exit 1
fi
done
echo "All models pulled successfully"

View file

@ -1 +0,0 @@
../../llama_stack/templates/ollama/run-with-safety.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/ollama/run.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/nvidia/build.yaml

View file

@ -1,19 +0,0 @@
services:
llamastack:
image: distribution-nvidia:dev
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/llamastack-run-nvidia.yaml
ports:
- "8321:8321"
environment:
- INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
- NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -1 +0,0 @@
../../llama_stack/templates/nvidia/run.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/remote-vllm/build.yaml

View file

@ -1,99 +0,0 @@
services:
vllm-inference:
image: vllm/vllm-openai:latest
volumes:
- $HOME/.cache/huggingface:/root/.cache/huggingface
network_mode: ${NETWORK_MODE:-bridged}
ports:
- "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
- HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
command: >
--gpu-memory-utilization 0.75
--model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
--enforce-eager
--max-model-len 8192
--max-num-seqs 16
--port ${VLLM_INFERENCE_PORT:-5100}
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
interval: 30s
timeout: 10s
retries: 5
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
# A little trick:
# if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
# otherwise, the entry will end in a hyphen which gets ignored by docker compose
vllm-${VLLM_SAFETY_MODEL:+safety}:
image: vllm/vllm-openai:latest
volumes:
- $HOME/.cache/huggingface:/root/.cache/huggingface
network_mode: ${NETWORK_MODE:-bridged}
ports:
- "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
- HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
command: >
--gpu-memory-utilization 0.75
--model ${VLLM_SAFETY_MODEL}
--enforce-eager
--max-model-len 8192
--max-num-seqs 16
--port ${VLLM_SAFETY_PORT:-5101}
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
interval: 30s
timeout: 10s
retries: 5
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
llamastack:
depends_on:
- vllm-inference:
condition: service_healthy
- vllm-${VLLM_SAFETY_MODEL:+safety}:
condition: service_healthy
image: llamastack/distribution-remote-vllm:test-0.0.52rc3
volumes:
- ~/.llama:/root/.llama
- ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
network_mode: ${NETWORK_MODE:-bridged}
environment:
- VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
- VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
- INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
- MAX_TOKENS=${MAX_TOKENS:-4096}
- SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
- SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
ports:
- "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
# Hack: wait for vLLM server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 8321"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s
volumes:
vllm-inference:
vllm-safety:
llamastack:

View file

@ -1 +0,0 @@
../../llama_stack/templates/remote-vllm/run-with-safety.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/remote-vllm/run.yaml

View file

@ -1,9 +0,0 @@
name: runpod
distribution_spec:
description: Use Runpod for running LLM inference
providers:
inference: remote::runpod
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -1 +0,0 @@
../../llama_stack/templates/sambanova/build.yaml

View file

@ -1,16 +0,0 @@
services:
llamastack:
image: llamastack/distribution-sambanova
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/llamastack-run-sambanova.yaml
ports:
- "5000:5000"
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-sambanova.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -1 +0,0 @@
../../llama_stack/templates/sambanova/run.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/tgi/build.yaml

View file

@ -1,103 +0,0 @@
services:
tgi-inference:
image: ghcr.io/huggingface/text-generation-inference:latest
volumes:
- $HOME/.cache/huggingface:/data
network_mode: ${NETWORK_MODE:-bridged}
ports:
- "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
- HF_TOKEN=$HF_TOKEN
- HF_HOME=/data
- HF_DATASETS_CACHE=/data
- HF_MODULES_CACHE=/data
- HF_HUB_CACHE=/data
command: >
--dtype bfloat16
--usage-stats off
--sharded false
--model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
--port ${TGI_INFERENCE_PORT:-8080}
--cuda-memory-fraction 0.75
healthcheck:
test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
interval: 5s
timeout: 5s
retries: 30
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
tgi-${TGI_SAFETY_MODEL:+safety}:
image: ghcr.io/huggingface/text-generation-inference:latest
volumes:
- $HOME/.cache/huggingface:/data
network_mode: ${NETWORK_MODE:-bridged}
ports:
- "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
- HF_TOKEN=$HF_TOKEN
- HF_HOME=/data
- HF_DATASETS_CACHE=/data
- HF_MODULES_CACHE=/data
- HF_HUB_CACHE=/data
command: >
--dtype bfloat16
--usage-stats off
--sharded false
--model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
--port ${TGI_SAFETY_PORT:-8081}
--cuda-memory-fraction 0.75
healthcheck:
test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
interval: 5s
timeout: 5s
retries: 30
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
llamastack:
depends_on:
tgi-inference:
condition: service_healthy
tgi-${TGI_SAFETY_MODEL:+safety}:
condition: service_healthy
image: llamastack/distribution-tgi:test-0.0.52rc3
network_mode: ${NETWORK_MODE:-bridged}
volumes:
- ~/.llama:/root/.llama
- ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
ports:
- "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
# Hack: wait for TGI server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s
environment:
- TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
- SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
- INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
- SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
volumes:
tgi-inference:
tgi-safety:
llamastack:

View file

@ -1 +0,0 @@
../../llama_stack/templates/tgi/run-with-safety.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/tgi/run.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/together/build.yaml

View file

@ -1,14 +0,0 @@
services:
llamastack:
image: llamastack/distribution-together
ports:
- "8321:8321"
environment:
- TOGETHER_API_KEY=${TOGETHER_API_KEY}
entrypoint: bash -c "python -m llama_stack.distribution.server.server --template together"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -1 +0,0 @@
../../llama_stack/templates/together/run.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/inline-vllm/build.yaml

View file

@ -1,35 +0,0 @@
services:
llamastack:
image: llamastack/distribution-inline-vllm
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/my-run.yaml
ports:
- "8321:8321"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0
command: []
deploy:
resources:
reservations:
devices:
- driver: nvidia
# that's the closest analogue to --gpus; provide
# an integer amount of devices or 'all'
count: 1
# Devices are reserved using a list of capabilities, making
# capabilities the only required field. A device MUST
# satisfy all the requested capabilities for a successful
# reservation.
capabilities: [gpu]
runtime: nvidia
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -1,66 +0,0 @@
version: '2'
image_name: local
container_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: vllm-inference
provider_type: inline::vllm
config:
model: Llama3.2-3B-Instruct
tensor_parallel_size: 1
gpu_memory_utilization: 0.4
enforce_eager: true
max_tokens: 4096
- provider_id: vllm-inference-safety
provider_type: inline::vllm
config:
model: Llama-Guard-3-1B
tensor_parallel_size: 1
gpu_memory_utilization: 0.2
enforce_eager: true
max_tokens: 4096
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
model: Llama-Guard-3-1B
excluded_categories: []
# Uncomment to use prompt guard
# - provider_id: meta1
# provider_type: inline::prompt-guard
# config:
# model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
# Uncomment to use pgvector
# - provider_id: pgvector
# provider_type: remote::pgvector
# config:
# host: 127.0.0.1
# port: 5432
# db: postgres
# user: postgres
# password: mysecretpassword
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/agents_store.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}

View file

@ -818,14 +818,7 @@
"delete": { "delete": {
"responses": { "responses": {
"200": { "200": {
"description": "OK", "description": "OK"
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/FileResponse"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
@ -2122,7 +2115,7 @@
"content": { "content": {
"application/json": { "application/json": {
"schema": { "schema": {
"$ref": "#/components/schemas/IterrowsResponse" "$ref": "#/components/schemas/PaginatedResponse"
} }
} }
} }
@ -2143,7 +2136,7 @@
"tags": [ "tags": [
"DatasetIO" "DatasetIO"
], ],
"description": "Get a paginated list of rows from a dataset. Uses cursor-based pagination.", "description": "Get a paginated list of rows from a dataset.\nUses offset-based pagination where:\n- start_index: The starting index (0-based). If None, starts from beginning.\n- limit: Number of items to return. If None or -1, returns all items.\n\nThe response includes:\n- data: List of items for the current page\n- has_more: Whether there are more items available after this set",
"parameters": [ "parameters": [
{ {
"name": "dataset_id", "name": "dataset_id",
@ -2695,9 +2688,9 @@
"200": { "200": {
"description": "OK", "description": "OK",
"content": { "content": {
"application/jsonl": { "application/json": {
"schema": { "schema": {
"$ref": "#/components/schemas/ToolDef" "$ref": "#/components/schemas/ListToolDefsResponse"
} }
} }
} }
@ -4053,22 +4046,33 @@
"type": "object", "type": "object",
"properties": { "properties": {
"strategy": { "strategy": {
"$ref": "#/components/schemas/SamplingStrategy" "$ref": "#/components/schemas/SamplingStrategy",
"description": "The sampling strategy."
}, },
"max_tokens": { "max_tokens": {
"type": "integer", "type": "integer",
"default": 0 "default": 0,
"description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
}, },
"repetition_penalty": { "repetition_penalty": {
"type": "number", "type": "number",
"default": 1.0 "default": 1.0,
"description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics."
},
"stop": {
"type": "array",
"items": {
"type": "string"
},
"description": "Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence."
} }
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"strategy" "strategy"
], ],
"title": "SamplingParams" "title": "SamplingParams",
"description": "Sampling parameters."
}, },
"SamplingStrategy": { "SamplingStrategy": {
"oneOf": [ "oneOf": [
@ -6129,46 +6133,6 @@
"title": "FileUploadResponse", "title": "FileUploadResponse",
"description": "Response after initiating a file upload session." "description": "Response after initiating a file upload session."
}, },
"FileResponse": {
"type": "object",
"properties": {
"bucket": {
"type": "string",
"description": "Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)"
},
"key": {
"type": "string",
"description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)"
},
"mime_type": {
"type": "string",
"description": "MIME type of the file"
},
"url": {
"type": "string",
"description": "Upload URL for the file contents"
},
"bytes": {
"type": "integer",
"description": "Size of the file in bytes"
},
"created_at": {
"type": "integer",
"description": "Timestamp of when the file was created"
}
},
"additionalProperties": false,
"required": [
"bucket",
"key",
"mime_type",
"url",
"bytes",
"created_at"
],
"title": "FileResponse",
"description": "Response representing a file entry."
},
"EmbeddingsRequest": { "EmbeddingsRequest": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -6922,6 +6886,46 @@
"title": "URIDataSource", "title": "URIDataSource",
"description": "A dataset that can be obtained from a URI." "description": "A dataset that can be obtained from a URI."
}, },
"FileResponse": {
"type": "object",
"properties": {
"bucket": {
"type": "string",
"description": "Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)"
},
"key": {
"type": "string",
"description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)"
},
"mime_type": {
"type": "string",
"description": "MIME type of the file"
},
"url": {
"type": "string",
"description": "Upload URL for the file contents"
},
"bytes": {
"type": "integer",
"description": "Size of the file in bytes"
},
"created_at": {
"type": "integer",
"description": "Timestamp of when the file was created"
}
},
"additionalProperties": false,
"required": [
"bucket",
"key",
"mime_type",
"url",
"bytes",
"created_at"
],
"title": "FileResponse",
"description": "Response representing a file entry."
},
"Model": { "Model": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -7660,7 +7664,8 @@
"completed", "completed",
"in_progress", "in_progress",
"failed", "failed",
"scheduled" "scheduled",
"cancelled"
], ],
"title": "JobStatus" "title": "JobStatus"
}, },
@ -8068,7 +8073,7 @@
"additionalProperties": false, "additionalProperties": false,
"title": "ToolInvocationResult" "title": "ToolInvocationResult"
}, },
"IterrowsResponse": { "PaginatedResponse": {
"type": "object", "type": "object",
"properties": { "properties": {
"data": { "data": {
@ -8098,19 +8103,20 @@
]
}
},
"description": "The rows in the current page."
"description": "The list of items for the current page"
},
"next_start_index": {
"has_more": {
"type": "integer",
"type": "boolean",
"description": "Index into dataset for the first row in the next page. None if there are no more rows."
"description": "Whether there are more items available after this set"
}
},
"additionalProperties": false,
"required": [
"data"
"data",
"has_more"
],
"title": "IterrowsResponse",
"title": "PaginatedResponse",
"description": "A paginated list of rows from a dataset."
"description": "A generic paginated response that follows a simple format."
},
"Job": { "Job": {
"type": "object", "type": "object",
@ -8124,7 +8130,8 @@
"completed", "completed",
"in_progress", "in_progress",
"failed", "failed",
"scheduled" "scheduled",
"cancelled"
], ],
"title": "JobStatus" "title": "JobStatus"
} }
@ -8321,6 +8328,22 @@
],
"title": "ListRoutesResponse"
},
"ListToolDefsResponse": {
"type": "object",
"properties": {
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ToolDef"
}
}
},
"additionalProperties": false,
"required": [
"data"
],
"title": "ListToolDefsResponse"
},
"ListScoringFunctionsResponse": { "ListScoringFunctionsResponse": {
"type": "object", "type": "object",
"properties": { "properties": {

View file

@ -557,10 +557,6 @@ paths:
responses:
'200':
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/FileResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
@ -1447,7 +1443,7 @@ paths:
content:
application/json:
schema:
$ref: '#/components/schemas/IterrowsResponse'
$ref: '#/components/schemas/PaginatedResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
@ -1461,7 +1457,20 @@ paths:
tags:
- DatasetIO
description: >-
Get a paginated list of rows from a dataset. Uses cursor-based pagination.
Get a paginated list of rows from a dataset.
Uses offset-based pagination where:
- start_index: The starting index (0-based). If None, starts from beginning.
- limit: Number of items to return. If None or -1, returns all items.
The response includes:
- data: List of items for the current page
- has_more: Whether there are more items available after this set
parameters:
- name: dataset_id
in: path
@ -1846,9 +1855,9 @@ paths:
'200':
description: OK
content:
application/jsonl:
application/json:
schema:
$ref: '#/components/schemas/ToolDef'
$ref: '#/components/schemas/ListToolDefsResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
@ -2787,16 +2796,33 @@ components:
properties:
strategy:
$ref: '#/components/schemas/SamplingStrategy'
description: The sampling strategy.
max_tokens:
type: integer
default: 0
description: >-
The maximum number of tokens that can be generated in the completion.
The token count of your prompt plus max_tokens cannot exceed the model's
context length.
repetition_penalty:
type: number
default: 1.0
description: >-
Number between -2.0 and 2.0. Positive values penalize new tokens based
on whether they appear in the text so far, increasing the model's likelihood
to talk about new topics.
stop:
type: array
items:
type: string
description: >-
Up to 4 sequences where the API will stop generating further tokens. The
returned text will not contain the stop sequence.
additionalProperties: false
required:
- strategy
title: SamplingParams
description: Sampling parameters.
SamplingStrategy:
oneOf:
- $ref: '#/components/schemas/GreedySamplingStrategy'
@ -4269,39 +4295,6 @@ components:
title: FileUploadResponse
description: >-
Response after initiating a file upload session.
FileResponse:
type: object
properties:
bucket:
type: string
description: >-
Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)
key:
type: string
description: >-
Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)
mime_type:
type: string
description: MIME type of the file
url:
type: string
description: Upload URL for the file contents
bytes:
type: integer
description: Size of the file in bytes
created_at:
type: integer
description: Timestamp of when the file was created
additionalProperties: false
required:
- bucket
- key
- mime_type
- url
- bytes
- created_at
title: FileResponse
description: Response representing a file entry.
EmbeddingsRequest:
type: object
properties:
@ -4813,6 +4806,39 @@ components:
title: URIDataSource
description: >-
A dataset that can be obtained from a URI.
FileResponse:
type: object
properties:
bucket:
type: string
description: >-
Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)
key:
type: string
description: >-
Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)
mime_type:
type: string
description: MIME type of the file
url:
type: string
description: Upload URL for the file contents
bytes:
type: integer
description: Size of the file in bytes
created_at:
type: integer
description: Timestamp of when the file was created
additionalProperties: false
required:
- bucket
- key
- mime_type
- url
- bytes
- created_at
title: FileResponse
description: Response representing a file entry.
Model:
type: object
properties:
@ -5289,6 +5315,7 @@ components:
- in_progress
- failed
- scheduled
- cancelled
title: JobStatus
scheduled_at:
type: string
@ -5528,7 +5555,7 @@ components:
- type: object
additionalProperties: false
title: ToolInvocationResult
IterrowsResponse:
PaginatedResponse:
type: object
properties:
data:
@ -5543,17 +5570,18 @@ components:
- type: string
- type: array
- type: object
description: The rows in the current page.
description: The list of items for the current page
next_start_index:
has_more:
type: integer
type: boolean
description: >-
Index into dataset for the first row in the next page. None if there are
Whether there are more items available after this set
no more rows.
additionalProperties: false
required:
- data
title: IterrowsResponse
- has_more
description: A paginated list of rows from a dataset.
title: PaginatedResponse
description: >-
A generic paginated response that follows a simple format.
Job:
type: object
properties:
@ -5566,6 +5594,7 @@ components:
- in_progress
- failed
- scheduled
- cancelled
title: JobStatus
additionalProperties: false
required:
@ -5703,6 +5732,17 @@ components:
required:
- data
title: ListRoutesResponse
ListToolDefsResponse:
type: object
properties:
data:
type: array
items:
$ref: '#/components/schemas/ToolDef'
additionalProperties: false
required:
- data
title: ListToolDefsResponse
ListScoringFunctionsResponse:
type: object
properties:
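
As an aside, the IterrowsResponse to PaginatedResponse change above swaps next_start_index for a has_more flag alongside the documented start_index/limit parameters. A minimal client-side sketch of that contract; the fetch_rows callable is hypothetical and stands in for whatever issues the actual iterrows request:

```python
from typing import Any, Callable, Dict, Iterator

def iterate_all_rows(
    fetch_rows: Callable[[int, int], Dict[str, Any]],  # hypothetical: (start_index, limit) -> PaginatedResponse-like dict
    page_size: int = 100,
) -> Iterator[Dict[str, Any]]:
    """Yield every row by following the documented offset-based pagination."""
    start_index = 0
    while True:
        page = fetch_rows(start_index, page_size)
        yield from page["data"]            # items for the current page
        if not page["has_more"]:           # nothing left after this set
            break
        start_index += len(page["data"])   # advance the 0-based offset
```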

Binary file not shown (added image, 33 KiB)

Binary file not shown (added image, 37 KiB)

Binary file not shown (added image, 56 KiB)

File diff suppressed because one or more lines are too long

View file

@ -963,16 +963,19 @@
"\n", "\n",
"client.benchmarks.register(\n", "client.benchmarks.register(\n",
" benchmark_id=\"meta-reference::mmmu\",\n", " benchmark_id=\"meta-reference::mmmu\",\n",
" # Note: we can use any value as `dataset_id` because we'll be using the `evaluate_rows` API which accepts the \n",
" # `input_rows` argument and does not fetch data from the dataset.\n",
" dataset_id=f\"mmmu-{subset}-{split}\",\n", " dataset_id=f\"mmmu-{subset}-{split}\",\n",
" scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n", " # Note: for the same reason as above, we can use any value as `scoring_functions`.\n",
" scoring_functions=[],\n",
")\n", ")\n",
"\n", "\n",
"response = client.eval.evaluate_rows_alpha(\n", "response = client.eval.evaluate_rows(\n",
" benchmark_id=\"meta-reference::mmmu\",\n", " benchmark_id=\"meta-reference::mmmu\",\n",
" input_rows=eval_rows,\n", " input_rows=eval_rows,\n",
" # Note: Here we define the actual scoring functions.\n",
" scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n", " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
" benchmark_config={\n", " benchmark_config={\n",
" \"type\": \"benchmark\",\n",
" \"eval_candidate\": {\n", " \"eval_candidate\": {\n",
" \"type\": \"model\",\n", " \"type\": \"model\",\n",
" \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n", " \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@ -1139,12 +1142,11 @@
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
")\n", ")\n",
"\n", "\n",
"response = client.eval.evaluate_rows_alpha(\n", "response = client.eval.evaluate_rows(\n",
" benchmark_id=\"meta-reference::simpleqa\",\n", " benchmark_id=\"meta-reference::simpleqa\",\n",
" input_rows=eval_rows.data,\n", " input_rows=eval_rows.data,\n",
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
" benchmark_config={\n", " benchmark_config={\n",
" \"type\": \"benchmark\",\n",
" \"eval_candidate\": {\n", " \"eval_candidate\": {\n",
" \"type\": \"model\",\n", " \"type\": \"model\",\n",
" \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n", " \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@ -1288,12 +1290,11 @@
" \"enable_session_persistence\": False,\n", " \"enable_session_persistence\": False,\n",
"}\n", "}\n",
"\n", "\n",
"response = client.eval.evaluate_rows_alpha(\n", "response = client.eval.evaluate_rows(\n",
" benchmark_id=\"meta-reference::simpleqa\",\n", " benchmark_id=\"meta-reference::simpleqa\",\n",
" input_rows=eval_rows.data,\n", " input_rows=eval_rows.data,\n",
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
" benchmark_config={\n", " benchmark_config={\n",
" \"type\": \"benchmark\",\n",
" \"eval_candidate\": {\n", " \"eval_candidate\": {\n",
" \"type\": \"agent\",\n", " \"type\": \"agent\",\n",
" \"config\": agent_config,\n", " \"config\": agent_config,\n",

View file

@ -21,7 +21,7 @@ from llama_stack.distribution.stack import LlamaStack # noqa: E402
from .pyopenapi.options import Options # noqa: E402
from .pyopenapi.specification import Info, Server # noqa: E402
from .pyopenapi.utility import Specification, validate_api_method_return_types # noqa: E402
from .pyopenapi.utility import Specification, validate_api # noqa: E402
def str_presenter(dumper, data):
@ -40,8 +40,7 @@ def main(output_dir: str):
raise ValueError(f"Directory {output_dir} does not exist")
# Validate API protocols before generating spec
print("Validating API method return types...")
return_type_errors = validate_api()
return_type_errors = validate_api_method_return_types()
if return_type_errors:
print("\nAPI Method Return Type Validation Errors:\n")
for error in return_type_errors:

View file

@ -7,10 +7,9 @@
import json
import typing
import inspect
import os
from pathlib import Path
from typing import TextIO
from typing import Any, Dict, List, Optional, Protocol, Type, Union, get_type_hints, get_origin, get_args
from typing import Any, List, Optional, Union, get_type_hints, get_origin, get_args
from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
from llama_stack.distribution.resolver import api_protocol_map
@ -125,29 +124,89 @@ def is_optional_type(type_: Any) -> bool:
return origin is Optional or (origin is Union and type(None) in args)
def validate_api_method_return_types() -> List[str]:
def _validate_api_method_return_type(method) -> str | None:
"""Validate that all API methods have proper return types."""
hints = get_type_hints(method)
if 'return' not in hints:
return "has no return type annotation"
return_type = hints['return']
if is_optional_type(return_type):
return "returns Optional type where a return value is mandatory"
def _validate_api_method_doesnt_return_list(method) -> str | None:
hints = get_type_hints(method)
if 'return' not in hints:
return "has no return type annotation"
return_type = hints['return']
if get_origin(return_type) is list:
return "returns a list where a PaginatedResponse or List*Response object is expected"
def _validate_api_delete_method_returns_none(method) -> str | None:
hints = get_type_hints(method)
if 'return' not in hints:
return "has no return type annotation"
return_type = hints['return']
if return_type is not None and return_type is not type(None):
return "does not return None where None is mandatory"
def _validate_list_parameters_contain_data(method) -> str | None:
hints = get_type_hints(method)
if 'return' not in hints:
return "has no return type annotation"
return_type = hints['return']
if not inspect.isclass(return_type):
return
if not return_type.__name__.startswith('List'):
return
if 'data' not in return_type.model_fields:
return "does not have a mandatory data attribute containing the list of objects"
_VALIDATORS = {
"GET": [
_validate_api_method_return_type,
_validate_list_parameters_contain_data,
_validate_api_method_doesnt_return_list,
],
"DELETE": [
_validate_api_delete_method_returns_none,
],
}
def _get_methods_by_type(protocol, method_type: str):
members = inspect.getmembers(protocol, predicate=inspect.isfunction)
return {
method_name: method
for method_name, method in members
if (webmethod := getattr(method, '__webmethod__', None))
if webmethod and webmethod.method == method_type
}
def validate_api() -> List[str]:
"""Validate the API protocols."""
errors = []
protocols = api_protocol_map()
for protocol_name, protocol in protocols.items():
for target, validators in _VALIDATORS.items():
methods = inspect.getmembers(protocol, predicate=inspect.isfunction)
for protocol_name, protocol in protocols.items():
for validator in validators:
for method_name, method in methods:
for method_name, method in _get_methods_by_type(protocol, target).items():
if not hasattr(method, '__webmethod__'):
err = validator(method)
continue
if err:
errors.append(f"Method {protocol_name}.{method_name} {err}")
# Only check GET methods
if method.__webmethod__.method != "GET":
continue
hints = get_type_hints(method)
if 'return' not in hints:
errors.append(f"Method {protocol_name}.{method_name} has no return type annotation")
else:
return_type = hints['return']
if is_optional_type(return_type):
errors.append(f"Method {protocol_name}.{method_name} returns Optional type")
return errors
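
For illustration, the Optional-return rule that validate_api applies to GET methods can be reproduced on a toy function. This standalone sketch mirrors the check rather than importing the real validators; the list_models function is hypothetical:

```python
from typing import Optional, Union, get_args, get_origin, get_type_hints

def returns_optional(method) -> bool:
    # Same idea as _validate_api_method_return_type above: a GET endpoint
    # must not declare an Optional return where a value is mandatory.
    hints = get_type_hints(method)
    if "return" not in hints:
        return False  # the real validator reports a missing annotation separately
    return_type = hints["return"]
    return get_origin(return_type) is Union and type(None) in get_args(return_type)

def list_models() -> Optional[list]:  # toy GET-style endpoint with a suspect signature
    ...

print(returns_optional(list_models))  # True -> would be reported as a violation
```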

View file

@ -1,4 +1,4 @@
# Building AI Applications
# Building AI Applications (Examples)
Llama Stack provides all the building blocks needed to create sophisticated AI applications.

View file

@ -1,4 +1,4 @@
## Using Retrieval Augmented Generation (RAG)
## Retrieval Augmented Generation (RAG)
RAG enables your applications to reference and recall information from previous interactions or external documents.

View file

@ -45,14 +45,16 @@ Here's an example that sends telemetry signals to all three sink types. Your con
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
sinks: ['console', 'sqlite', 'otel']
sinks: ['console', 'sqlite', 'otel_trace', 'otel_metric']
otel_endpoint: "http://localhost:4318/v1/traces"
otel_trace_endpoint: "http://localhost:4318/v1/traces"
otel_metric_endpoint: "http://localhost:4318/v1/metrics"
sqlite_db_path: "/path/to/telemetry.db"
```
### Jaeger to visualize traces
The `otel` sink works with any service compatible with the OpenTelemetry collector. Let's use Jaeger to visualize this data.
The `otel` sink works with any service compatible with the OpenTelemetry collector; traces and metrics have two separate endpoints.
Let's use Jaeger to visualize this data.
Start a Jaeger instance with the OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686 using the following command:

View file

@ -16,6 +16,7 @@ from docutils import nodes
from pathlib import Path
import requests
import json
from datetime import datetime
# Read version from pyproject.toml
with Path(__file__).parent.parent.parent.joinpath("pyproject.toml").open("rb") as f:
@ -28,7 +29,7 @@ with Path(__file__).parent.parent.parent.joinpath("pyproject.toml").open("rb") a
llama_stack_version_link = f"<a href='{llama_stack_version_url}'>release notes</a>"
project = "llama-stack"
copyright = "2025, Meta"
copyright = f"{datetime.now().year}, Meta"
author = "Meta"
# -- General configuration ---------------------------------------------------
@ -37,6 +38,7 @@ author = "Meta"
extensions = [
"myst_parser",
"sphinx_rtd_theme",
"sphinx_rtd_dark_mode",
"sphinx_copybutton",
"sphinx_tabs.tabs",
"sphinx_design",
@ -103,6 +105,8 @@ source_suffix = {
# html_theme = "alabaster"
html_theme_options = {
"canonical_url": "https://github.com/meta-llama/llama-stack",
'collapse_navigation': False,
# "style_nav_header_background": "#c3c9d4",
}

View file

@ -1,14 +1,14 @@
# Contributing to Llama Stack
Start with the [Contributing Guide](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md) for some general tips. This section covers a few key topics in more detail.
```{include} ../../../CONTRIBUTING.md
```
See the [Adding a New API Provider](new_api_provider.md) which describes how to add new API providers to the Stack.
- [Adding a New API Provider](new_api_provider.md) describes adding new API providers to the Stack.
- [Testing Llama Stack](testing.md) provides details about the testing framework and how to test providers and distributions.
```{toctree}
:maxdepth: 1
:hidden:
new_api_provider
testing
```

View file

@ -67,7 +67,7 @@ options:
Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config. (default:
conda)
--image-name IMAGE_NAME
[for image-type=conda|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active Conda environment will be used if
[for image-type=conda|container|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active Conda environment will be used if
found. (default: None)
--print-deps-only Print the dependencies for the stack only, without building the stack (default: False)
--run Run the stack after building using the same image type, name, and other applicable arguments (default: False)

View file

@ -1,4 +1,4 @@
# Configuring a Stack
# Configuring a "Stack"
The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution:

View file

@ -1,10 +1,12 @@
# Using Llama Stack as a Library
If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library. This avoids the overhead of setting up a server.
## Setup Llama Stack without a Server
If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library.
This avoids the overhead of setting up a server.
```bash
# setup
uv pip install llama-stack
llama stack build --template together --image-type venv
llama stack build --template ollama --image-type venv
```
```python

View file

@ -1,34 +1,18 @@
# Starting a Llama Stack Server
# Distributions Overview
You can run a Llama Stack server in one of the following ways:
A distribution is a pre-packaged set of Llama Stack components that can be deployed together.
**As a Library**:
This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and relying on an external inference service (eg. fireworks, together, groq, etc.) See [Using Llama Stack as a Library](importing_as_library)
**Container**:
Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container.
Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
**Conda**:
If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
**Kubernetes**:
If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally. See [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
This section provides an overview of the distributions available in Llama Stack.
```{toctree} ```{toctree}
:maxdepth: 1 :maxdepth: 3
:hidden:
importing_as_library importing_as_library
building_distro
configuration configuration
selection list_of_distributions
kubernetes_deployment kubernetes_deployment
building_distro
on_device_distro
remote_hosted_distro
self_hosted_distro
``` ```

View file

@ -1,6 +1,9 @@
# Kubernetes Deployment Guide # Kubernetes Deployment Guide
Instead of starting the Llama Stack and vLLM servers locally, we can deploy them in a Kubernetes cluster. In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes. Instead of starting the Llama Stack and vLLM servers locally, we can deploy them in a Kubernetes cluster.
### Prerequisites
In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
First, create a local Kubernetes cluster via Kind: First, create a local Kubernetes cluster via Kind:
@ -8,7 +11,7 @@ First, create a local Kubernetes cluster via Kind:
kind create cluster --image kindest/node:v1.32.0 --name llama-stack-test kind create cluster --image kindest/node:v1.32.0 --name llama-stack-test
``` ```
Start vLLM server as a Kubernetes Pod and Service: First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model:
```bash ```bash
cat <<EOF |kubectl apply -f - cat <<EOF |kubectl apply -f -
@ -31,7 +34,13 @@ metadata:
type: Opaque type: Opaque
data: data:
token: $(HF_TOKEN) token: $(HF_TOKEN)
--- ```
Next, start the vLLM server as a Kubernetes Deployment and Service:
```bash
cat <<EOF |kubectl apply -f -
apiVersion: apps/v1 apiVersion: apps/v1
kind: Deployment kind: Deployment
metadata: metadata:
@ -47,28 +56,23 @@ spec:
app.kubernetes.io/name: vllm app.kubernetes.io/name: vllm
spec: spec:
containers: containers:
- name: llama-stack - name: vllm
image: $(VLLM_IMAGE) image: vllm/vllm-openai:latest
command: command: ["/bin/sh", "-c"]
- bash args: [
- -c "vllm serve meta-llama/Llama-3.2-1B-Instruct"
- | ]
MODEL="meta-llama/Llama-3.2-1B-Instruct" env:
MODEL_PATH=/app/model/$(basename $MODEL) - name: HUGGING_FACE_HUB_TOKEN
huggingface-cli login --token $HUGGING_FACE_HUB_TOKEN valueFrom:
huggingface-cli download $MODEL --local-dir $MODEL_PATH --cache-dir $MODEL_PATH secretKeyRef:
python3 -m vllm.entrypoints.openai.api_server --model $MODEL_PATH --served-model-name $MODEL --port 8000 name: hf-token-secret
key: token
ports: ports:
- containerPort: 8000 - containerPort: 8000
volumeMounts: volumeMounts:
- name: llama-storage - name: llama-storage
mountPath: /app/model mountPath: /root/.cache/huggingface
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
volumes: volumes:
- name: llama-storage - name: llama-storage
persistentVolumeClaim: persistentVolumeClaim:
@ -127,6 +131,7 @@ EOF
podman build -f /tmp/test-vllm-llama-stack/Containerfile.llama-stack-run-k8s -t llama-stack-run-k8s /tmp/test-vllm-llama-stack podman build -f /tmp/test-vllm-llama-stack/Containerfile.llama-stack-run-k8s -t llama-stack-run-k8s /tmp/test-vllm-llama-stack
``` ```
### Deploying Llama Stack Server in Kubernetes
We can then start the Llama Stack server by deploying a Kubernetes Pod and Service: We can then start the Llama Stack server by deploying a Kubernetes Pod and Service:
@ -187,6 +192,7 @@ spec:
EOF EOF
``` ```
### Verifying the Deployment
We can check that the LlamaStack server has started: We can check that the LlamaStack server has started:
```bash ```bash
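# Hedged sketch: the label selector below is an assumption; adjust it to match
# the labels used in your llama-stack Deployment manifest.
kubectl get pods -l app.kubernetes.io/name=llama-stack
kubectl logs -l app.kubernetes.io/name=llama-stack --tail=100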

View file

@ -1,4 +1,4 @@
# List of Distributions # Available List of Distributions
Here is a list of the distributions, provided out of the box, that you can use to start a Llama Stack server.

View file

@ -9,6 +9,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
| datasetio | `inline::localfs` | | datasetio | `inline::localfs` |
| eval | `inline::meta-reference` | | eval | `inline::meta-reference` |
| inference | `remote::nvidia` | | inference | `remote::nvidia` |
| post_training | `remote::nvidia` |
| safety | `remote::nvidia` | | safety | `remote::nvidia` |
| scoring | `inline::basic` | | scoring | `inline::basic` |
| telemetry | `inline::meta-reference` | | telemetry | `inline::meta-reference` |
@ -21,6 +22,12 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
The following environment variables can be configured: The following environment variables can be configured:
- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``) - `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
- `NVIDIA_USER_ID`: NVIDIA User ID (default: `llama-stack-user`)
- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
- `NVIDIA_ACCESS_POLICIES`: NVIDIA Access Policies (default: `{}`)
- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`) - `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`) - `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`) - `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)

View file

@ -98,11 +98,14 @@ export INFERENCE_PORT=8000
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export LLAMA_STACK_PORT=8321 export LLAMA_STACK_PORT=8321
# You need a local checkout of llama-stack to run this, get it using
# git clone https://github.com/meta-llama/llama-stack.git
cd /path/to/llama-stack
docker run \ docker run \
-it \
--pull always \ --pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \ -v ./llama_stack/templates/remote-vllm/run.yaml:/root/my-run.yaml \
llamastack/distribution-remote-vllm \ llamastack/distribution-remote-vllm \
--yaml-config /root/my-run.yaml \ --yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \ --port $LLAMA_STACK_PORT \
@ -121,7 +124,6 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
cd /path/to/llama-stack cd /path/to/llama-stack
docker run \ docker run \
-it \
--pull always \ --pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \ -v ~/.llama:/root/.llama \

View file

@ -0,0 +1,32 @@
# Starting a Llama Stack Server
You can run a Llama Stack server in one of the following ways:
**As a Library**:
This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and relying on an external inference service (e.g., Fireworks, Together, Groq). See [Using Llama Stack as a Library](importing_as_library).
**Container**:
Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
**Conda**:
If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
**Kubernetes**:
If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally, see the [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
```{toctree}
:maxdepth: 1
:hidden:
importing_as_library
configuration
kubernetes_deployment
```

View file

@ -1,10 +1,11 @@
# Quick Start # Quick Start
In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple RAG agent. In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to build a simple [RAG (Retrieval Augmented Generation)](../building_applications/rag.md) agent.
A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with tools (e.g., RAG, web search, code execution, etc.) for taking actions. A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with tools (e.g., RAG, web search, code execution, etc.) for taking actions.
In Llama Stack, we provide a server exposing multiple APIs. These APIs are backed by implementations from different providers. For this guide, we will use [Ollama](https://ollama.com/) as the inference provider. In Llama Stack, we provide a server exposing multiple APIs. These APIs are backed by implementations from different providers. For this guide, we will use [Ollama](https://ollama.com/) as the inference provider.
Ollama is an LLM runtime that allows you to run Llama models locally.
### 1. Start Ollama ### 1. Start Ollama
@ -24,7 +25,7 @@ If you do not have ollama, you can install it from [here](https://ollama.com/dow
### 2. Pick a client environment ### 2. Pick a client environment
Llama Stack has a service-oriented architecture, so every interaction with the Stack happens through an REST interface. You can interact with the Stack in two ways: Llama Stack has a service-oriented architecture, so every interaction with the Stack happens through a REST interface. You can interact with the Stack in two ways:
* Install the `llama-stack-client` PyPI package and point `LlamaStackClient` to a local or remote Llama Stack server. * Install the `llama-stack-client` PyPI package and point `LlamaStackClient` to a local or remote Llama Stack server.
* Or, install the `llama-stack` PyPI package and use the Stack as a library using `LlamaStackAsLibraryClient`. * Or, install the `llama-stack` PyPI package and use the Stack as a library using `LlamaStackAsLibraryClient`.
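For illustration, a minimal connection in the first (server) mode might look like the sketch below; the port and the printed field are assumptions based on the default configuration.

```python
from llama_stack_client import LlamaStackClient

# Point the client at a running Llama Stack server (default port assumed to be 8321)
client = LlamaStackClient(base_url="http://localhost:8321")

# List the models the server knows about
for model in client.models.list():
    print(model.identifier)
```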

View file

@ -6,6 +6,7 @@ Llama Stack {{ llama_stack_version }} is now available! See the {{ llama_stack_v
# Llama Stack # Llama Stack
## What is Llama Stack?
Llama Stack defines and standardizes the core building blocks needed to bring generative AI applications to market. It provides a unified set of APIs with implementations from leading service providers, enabling seamless transitions between development and production environments. More specifically, it provides Llama Stack defines and standardizes the core building blocks needed to bring generative AI applications to market. It provides a unified set of APIs with implementations from leading service providers, enabling seamless transitions between development and production environments. More specifically, it provides
@ -22,6 +23,12 @@ Llama Stack defines and standardizes the core building blocks needed to bring ge
Our goal is to provide pre-packaged implementations (aka "distributions") which can be run in a variety of deployment environments. LlamaStack can assist you in your entire app development lifecycle - start iterating on local, mobile or desktop and seamlessly transition to on-prem or public cloud deployments. At every point in this transition, the same set of APIs and the same developer experience is available. Our goal is to provide pre-packaged implementations (aka "distributions") which can be run in a variety of deployment environments. LlamaStack can assist you in your entire app development lifecycle - start iterating on local, mobile or desktop and seamlessly transition to on-prem or public cloud deployments. At every point in this transition, the same set of APIs and the same developer experience is available.
## How does Llama Stack work?
Llama Stack consists of a [server](./distributions/index.md) (with multiple pluggable API [providers](./providers/index.md)) and [client SDKs](#available-sdks) meant to
be used in your applications. The server can be run in a variety of environments, including local (inline)
development, on-premises, and cloud. The client SDKs are available for Python, Swift, Node, and
Kotlin.
## Quick Links ## Quick Links
- New to Llama Stack? Start with the [Introduction](introduction/index) to understand our motivation and vision. - New to Llama Stack? Start with the [Introduction](introduction/index) to understand our motivation and vision.
@ -93,7 +100,6 @@ getting_started/index
concepts/index concepts/index
providers/index providers/index
distributions/index distributions/index
distributions/selection
building_applications/index building_applications/index
playground/index playground/index
contributing/index contributing/index

View file

@ -92,8 +92,6 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
## Starting the Llama Stack Playground ## Starting the Llama Stack Playground
### Llama CLI
To start the Llama Stack Playground, run the following commands: To start the Llama Stack Playground, run the following commands:
1. Start up the Llama Stack API server 1. Start up the Llama Stack API server
@ -109,29 +107,3 @@ cd llama_stack/distribution/ui
pip install -r requirements.txt pip install -r requirements.txt
streamlit run app.py streamlit run app.py
``` ```
### Docker
Playground can also be started in a docker image:
```sh
export LLAMA_STACK_URL=http://localhost:11434
docker run \
--pull always \
-p 8501:8501 \
-e LLAMA_STACK_ENDPOINT=$LLAMA_STACK_URL \
quay.io/jland/llama-stack-playground
```
## Configurable Environment Variables
## Environment Variables
| Environment Variable | Description | Default Value |
|----------------------------|------------------------------------|---------------------------|
| LLAMA_STACK_ENDPOINT | The endpoint for the Llama Stack | http://localhost:8321 |
| FIREWORKS_API_KEY | API key for Fireworks provider | (empty string) |
| TOGETHER_API_KEY | API key for Together provider | (empty string) |
| SAMBANOVA_API_KEY | API key for SambaNova provider | (empty string) |
| OPENAI_API_KEY | API key for OpenAI provider | (empty string) |

View file

@ -10,11 +10,57 @@ That means you're not limited to storing vectors in memory or in a separate serv
## Features ## Features
- Lightweight and easy to use - Lightweight and easy to use
- Fully integrated with Llama Stack
- Uses disk-based storage for persistence, allowing for larger vector storage
### Comparison to Faiss
The choice between Faiss and sqlite-vec should be made based on the needs of your application,
as they have different strengths.
#### Choosing the Right Provider
Scenario | Recommended Tool | Reason
-- |-----------------| --
Online Analytical Processing (OLAP) | Faiss | Fast, in-memory searches
Online Transaction Processing (OLTP) | sqlite-vec | Frequent writes and reads
Frequent writes | sqlite-vec | Efficient disk-based storage and incremental indexing
Large datasets | sqlite-vec | Disk-based storage for larger vector storage
Datasets that can fit in memory, frequent reads | Faiss | Optimized for speed, indexing, and GPU acceleration
#### Empirical Example
Consider the histogram below in which 10,000 randomly generated strings were inserted
in batches of 100 into both Faiss and sqlite-vec using `client.tool_runtime.rag_tool.insert()`.
```{image} ../../../../_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
:alt: Comparison of SQLite-Vec and Faiss write times
:width: 400px
```
You will notice that the average write time for `sqlite-vec` was 788ms, compared to
47,640ms for Faiss. While the Faiss number is jarring, if you look at the distribution you can see that it is rather
uniformly spread across the [1500, 100000] interval.
Looking at each individual write in the order that the documents are inserted, you'll see write times increase
as Faiss reindexes the vectors after each write.
```{image} ../../../../_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
:alt: Comparison of SQLite-Vec and Faiss write times
:width: 400px
```
In comparison, the read times for Faiss were on average 10% faster than those of sqlite-vec.
The modes of the two distributions highlight the difference further: Faiss
will likely yield faster read performance.
```{image} ../../../../_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
:alt: Comparison of SQLite-Vec and Faiss read times
:width: 400px
```
## Usage ## Usage
To use SQLite-Vec in your Llama Stack project, follow these steps: To use sqlite-vec in your Llama Stack project, follow these steps:
1. Install the necessary dependencies. 1. Install the necessary dependencies.
2. Configure your Llama Stack project to use SQLite-Vec. 2. Configure your Llama Stack project to use SQLite-Vec.
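To make step 2 concrete, a hedged sketch of registering a sqlite-vec backed vector database and inserting documents through the RAG tool might look like the following; the provider ID string, embedding model, and document contents are assumptions.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Register a vector database served by the sqlite-vec provider
client.vector_dbs.register(
    vector_db_id="my-documents",
    provider_id="sqlite-vec",
    embedding_model="all-MiniLM-L6-v2",
    embedding_dimension=384,
)

# Insert documents; chunking and embedding happen server-side
client.tool_runtime.rag_tool.insert(
    vector_db_id="my-documents",
    documents=[
        {
            "document_id": "doc-1",
            "content": "sqlite-vec stores vectors on disk inside a SQLite database.",
            "mime_type": "text/plain",
            "metadata": {},
        }
    ],
    chunk_size_in_tokens=512,
)
```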

View file

@ -15,6 +15,7 @@ class JobStatus(Enum):
in_progress = "in_progress" in_progress = "in_progress"
failed = "failed" failed = "failed"
scheduled = "scheduled" scheduled = "scheduled"
cancelled = "cancelled"
@json_schema_type @json_schema_type

View file

@ -0,0 +1,23 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List
from pydantic import BaseModel
from llama_stack.schema_utils import json_schema_type
@json_schema_type
class PaginatedResponse(BaseModel):
"""A generic paginated response that follows a simple format.
:param data: The list of items for the current page
:param has_more: Whether there are more items available after this set
"""
data: List[Dict[str, Any]]
has_more: bool
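For example, a single page built from this model might look like the following; the values are illustrative.

```python
from llama_stack.apis.common.responses import PaginatedResponse

page = PaginatedResponse(data=[{"id": 1}, {"id": 2}], has_more=True)
print(page.model_dump())  # {'data': [{'id': 1}, {'id': 2}], 'has_more': True}
```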

View file

@ -6,23 +6,9 @@
from typing import Any, Dict, List, Optional, Protocol, runtime_checkable from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
from pydantic import BaseModel from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.apis.datasets import Dataset from llama_stack.apis.datasets import Dataset
from llama_stack.schema_utils import json_schema_type, webmethod from llama_stack.schema_utils import webmethod
@json_schema_type
class IterrowsResponse(BaseModel):
"""
A paginated list of rows from a dataset.
:param data: The rows in the current page.
:param next_start_index: Index into dataset for the first row in the next page. None if there are no more rows.
"""
data: List[Dict[str, Any]]
next_start_index: Optional[int] = None
class DatasetStore(Protocol): class DatasetStore(Protocol):
@ -34,15 +20,22 @@ class DatasetIO(Protocol):
# keeping for aligning with inference/safety, but this is not used # keeping for aligning with inference/safety, but this is not used
dataset_store: DatasetStore dataset_store: DatasetStore
# TODO(xiyan): there's a flakiness here where setting route to "/datasets/" here will not result in proper routing
@webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET") @webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET")
async def iterrows( async def iterrows(
self, self,
dataset_id: str, dataset_id: str,
start_index: Optional[int] = None, start_index: Optional[int] = None,
limit: Optional[int] = None, limit: Optional[int] = None,
) -> IterrowsResponse: ) -> PaginatedResponse:
"""Get a paginated list of rows from a dataset. Uses cursor-based pagination. """Get a paginated list of rows from a dataset.
Uses offset-based pagination where:
- start_index: The starting index (0-based). If None, starts from beginning.
- limit: Number of items to return. If None or -1, returns all items.
The response includes:
- data: List of items for the current page
- has_more: Whether there are more items available after this set
:param dataset_id: The ID of the dataset to get the rows from. :param dataset_id: The ID of the dataset to get the rows from.
:param start_index: Index into dataset for the first row to get. Get all rows if None. :param start_index: Index into dataset for the first row to get. Get all rows if None.
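A helper implementing exactly these semantics might look like the sketch below; the real `paginate_records` utility used by the providers may differ in details.

```python
from typing import Any, Dict, List, Optional

from llama_stack.apis.common.responses import PaginatedResponse


def paginate_records(
    records: List[Dict[str, Any]],
    start_index: Optional[int] = None,
    limit: Optional[int] = None,
) -> PaginatedResponse:
    # None start_index means "from the beginning"; None or -1 limit means "all items"
    start = start_index or 0
    end = len(records) if limit is None or limit == -1 else min(start + limit, len(records))
    return PaginatedResponse(data=records[start:end], has_more=end < len(records))
```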

View file

@ -34,6 +34,7 @@ class Api(Enum):
scoring_functions = "scoring_functions" scoring_functions = "scoring_functions"
benchmarks = "benchmarks" benchmarks = "benchmarks"
tool_groups = "tool_groups" tool_groups = "tool_groups"
files = "files"
# built-in API # built-in API
inspect = "inspect" inspect = "inspect"

View file

@ -164,7 +164,7 @@ class Files(Protocol):
self, self,
bucket: str, bucket: str,
key: str, key: str,
) -> FileResponse: ) -> None:
""" """
Delete a file identified by a bucket and key. Delete a file identified by a bucket and key.

View file

@ -88,6 +88,10 @@ class ListToolsResponse(BaseModel):
data: List[Tool] data: List[Tool]
class ListToolDefsResponse(BaseModel):
data: list[ToolDef]
@runtime_checkable @runtime_checkable
@trace_protocol @trace_protocol
class ToolGroups(Protocol): class ToolGroups(Protocol):
@ -148,7 +152,7 @@ class ToolRuntime(Protocol):
@webmethod(route="/tool-runtime/list-tools", method="GET") @webmethod(route="/tool-runtime/list-tools", method="GET")
async def list_runtime_tools( async def list_runtime_tools(
self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
) -> List[ToolDef]: ... ) -> ListToolDefsResponse: ...
@webmethod(route="/tool-runtime/invoke", method="POST") @webmethod(route="/tool-runtime/invoke", method="POST")
async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult: async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
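A tool-runtime provider adapting to the wrapped return type might look like this sketch; the class name and tool definition fields are illustrative assumptions.

```python
from typing import Optional

from llama_stack.apis.common.content_types import URL
from llama_stack.apis.tools import ListToolDefsResponse, ToolDef


class ExampleToolRuntime:
    async def list_runtime_tools(
        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
    ) -> ListToolDefsResponse:
        # Wrap the tool definitions in the response type instead of returning a bare list
        return ListToolDefsResponse(data=[ToolDef(name="web_search", description="Search the web")])
```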

View file

@ -21,6 +21,7 @@ from prompt_toolkit.completion import WordCompleter
from prompt_toolkit.validation import Validator from prompt_toolkit.validation import Validator
from termcolor import cprint from termcolor import cprint
from llama_stack.cli.stack.utils import ImageType
from llama_stack.cli.table import print_table from llama_stack.cli.table import print_table
from llama_stack.distribution.build import ( from llama_stack.distribution.build import (
SERVER_DEPENDENCIES, SERVER_DEPENDENCIES,
@ -62,10 +63,10 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
if args.list_templates: if args.list_templates:
return _run_template_list_cmd() return _run_template_list_cmd()
if args.image_type == "venv": if args.image_type == ImageType.VENV.value:
current_venv = os.environ.get("VIRTUAL_ENV") current_venv = os.environ.get("VIRTUAL_ENV")
image_name = args.image_name or current_venv image_name = args.image_name or current_venv
elif args.image_type == "conda": elif args.image_type == ImageType.CONDA.value:
current_conda_env = os.environ.get("CONDA_DEFAULT_ENV") current_conda_env = os.environ.get("CONDA_DEFAULT_ENV")
image_name = args.image_name or current_conda_env image_name = args.image_name or current_conda_env
else: else:
@ -84,7 +85,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
build_config.image_type = args.image_type build_config.image_type = args.image_type
else: else:
cprint( cprint(
f"Please specify a image-type (container | conda | venv) for {args.template}", f"Please specify a image-type ({' | '.join(e.value for e in ImageType)}) for {args.template}",
color="red", color="red",
) )
sys.exit(1) sys.exit(1)
@ -98,15 +99,15 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
) )
image_type = prompt( image_type = prompt(
"> Enter the image type you want your Llama Stack to be built as (container or conda or venv): ", f"> Enter the image type you want your Llama Stack to be built as ({' or '.join(e.value for e in ImageType)}): ",
validator=Validator.from_callable( validator=Validator.from_callable(
lambda x: x in ["container", "conda", "venv"], lambda x: x in [e.value for e in ImageType],
error_message="Invalid image type, please enter conda or container or venv", error_message=f"Invalid image type, please enter {' or '.join(e.value for e in ImageType)}",
), ),
default="conda", default=ImageType.CONDA.value,
) )
if image_type == "conda": if image_type == ImageType.CONDA.value:
if not image_name: if not image_name:
cprint( cprint(
f"No current conda environment detected or specified, will create a new conda environment with the name `llamastack-{name}`", f"No current conda environment detected or specified, will create a new conda environment with the name `llamastack-{name}`",
@ -136,6 +137,8 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
providers = dict() providers = dict()
for api, providers_for_api in get_provider_registry().items(): for api, providers_for_api in get_provider_registry().items():
available_providers = [x for x in providers_for_api.keys() if x not in ("remote", "remote::sample")] available_providers = [x for x in providers_for_api.keys() if x not in ("remote", "remote::sample")]
if not available_providers:
continue
api_provider = prompt( api_provider = prompt(
"> Enter provider for API {}: ".format(api.value), "> Enter provider for API {}: ".format(api.value),
completer=WordCompleter(available_providers), completer=WordCompleter(available_providers),

View file

@ -6,6 +6,7 @@
import argparse import argparse
import textwrap import textwrap
from llama_stack.cli.stack.utils import ImageType
from llama_stack.cli.subcommand import Subcommand from llama_stack.cli.subcommand import Subcommand
@ -46,16 +47,16 @@ class StackBuild(Subcommand):
self.parser.add_argument( self.parser.add_argument(
"--image-type", "--image-type",
type=str, type=str,
help="Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config.", help="Image Type to use for the build. If not specified, will use the image type from the template config.",
choices=["conda", "container", "venv"], choices=[e.value for e in ImageType],
default="conda", default=ImageType.CONDA.value,
) )
self.parser.add_argument( self.parser.add_argument(
"--image-name", "--image-name",
type=str, type=str,
help=textwrap.dedent( help=textwrap.dedent(
"""[for image-type=conda|venv] Name of the conda or virtual environment to use for f"""[for image-type={"|".join(e.value for e in ImageType)}] Name of the conda or virtual environment to use for
the build. If not specified, currently active Conda environment will be used if found. the build. If not specified, currently active Conda environment will be used if found.
""" """
), ),

View file

@ -8,6 +8,7 @@ import argparse
import os import os
from pathlib import Path from pathlib import Path
from llama_stack.cli.stack.utils import ImageType
from llama_stack.cli.subcommand import Subcommand from llama_stack.cli.subcommand import Subcommand
from llama_stack.log import get_logger from llama_stack.log import get_logger
@ -56,7 +57,6 @@ class StackRun(Subcommand):
"--env", "--env",
action="append", action="append",
help="Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.", help="Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.",
default=[],
metavar="KEY=VALUE", metavar="KEY=VALUE",
) )
self.parser.add_argument( self.parser.add_argument(
@ -73,10 +73,24 @@ class StackRun(Subcommand):
"--image-type", "--image-type",
type=str, type=str,
help="Image Type used during the build. This can be either conda or container or venv.", help="Image Type used during the build. This can be either conda or container or venv.",
choices=["conda", "container", "venv"], choices=[e.value for e in ImageType],
default="conda",
) )
# If neither image type nor image name is provided, but at the same time
# the current environment has conda breadcrumbs, then assume what the user
# wants to use conda mode and not the usual default mode (using
# pre-installed system packages).
#
# Note: yes, this is hacky. It's implemented this way to keep the existing
# conda users unaffected by the switch of the default behavior to using
# system packages.
def _get_image_type_and_name(self, args: argparse.Namespace) -> tuple[str, str]:
conda_env = os.environ.get("CONDA_DEFAULT_ENV")
if conda_env and args.image_name == conda_env:
logger.warning(f"Conda detected. Using conda environment {conda_env} for the run.")
return ImageType.CONDA.value, args.image_name
return args.image_type, args.image_name
def _run_stack_run_cmd(self, args: argparse.Namespace) -> None: def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
import yaml import yaml
@ -120,20 +134,44 @@ class StackRun(Subcommand):
except AttributeError as e: except AttributeError as e:
self.parser.error(f"failed to parse config file '{config_file}':\n {e}") self.parser.error(f"failed to parse config file '{config_file}':\n {e}")
run_args = formulate_run_args(args.image_type, args.image_name, config, template_name) image_type, image_name = self._get_image_type_and_name(args)
run_args.extend([str(config_file), str(args.port)]) # If neither image type nor image name is provided, assume the server should be run directly
if args.disable_ipv6: # using the current environment packages.
run_args.append("--disable-ipv6") if not image_type and not image_name:
logger.info("No image type or image name provided. Assuming environment packages.")
from llama_stack.distribution.server.server import main as server_main
for env_var in args.env: # Build the server args from the current args passed to the CLI
if "=" not in env_var: server_args = argparse.Namespace()
self.parser.error(f"Environment variable '{env_var}' must be in KEY=VALUE format") for arg in vars(args):
key, value = env_var.split("=", 1) # split on first = only # If this is a function, avoid passing it
if not key: # "args" contains:
self.parser.error(f"Environment variable '{env_var}' has empty key") # func=<bound method StackRun._run_stack_run_cmd of <llama_stack.cli.stack.run.StackRun object at 0x10484b010>>
run_args.extend(["--env", f"{key}={value}"]) if callable(getattr(args, arg)):
continue
setattr(server_args, arg, getattr(args, arg))
if args.tls_keyfile and args.tls_certfile: # Run the server
run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile]) server_main(server_args)
run_command(run_args) else:
run_args = formulate_run_args(image_type, image_name, config, template_name)
run_args.extend([str(config_file), str(args.port)])
if args.disable_ipv6:
run_args.append("--disable-ipv6")
if args.env:
for env_var in args.env:
if "=" not in env_var:
self.parser.error(f"Environment variable '{env_var}' must be in KEY=VALUE format")
return
key, value = env_var.split("=", 1) # split on first = only
if not key:
self.parser.error(f"Environment variable '{env_var}' has empty key")
return
run_args.extend(["--env", f"{key}={value}"])
if args.tls_keyfile and args.tls_certfile:
run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile])
run_command(run_args)

View file

@ -4,6 +4,14 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from enum import Enum
class ImageType(Enum):
CONDA = "conda"
CONTAINER = "container"
VENV = "venv"
def print_subcommand_description(parser, subparsers): def print_subcommand_description(parser, subparsers):
"""Print descriptions of subcommands.""" """Print descriptions of subcommands."""

View file

@ -328,8 +328,9 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
body = self._convert_body(path, options.method, body) body = self._convert_body(path, options.method, body)
await start_trace(route, {"__location__": "library_client"})
async def gen(): async def gen():
await start_trace(route, {"__location__": "library_client"})
try: try:
async for chunk in await func(**body): async for chunk in await func(**body):
data = json.dumps(convert_pydantic_to_json_value(chunk)) data = json.dumps(convert_pydantic_to_json_value(chunk))

View file

@ -12,6 +12,7 @@ from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets from llama_stack.apis.datasets import Datasets
from llama_stack.apis.eval import Eval from llama_stack.apis.eval import Eval
from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models from llama_stack.apis.models import Models
@ -79,6 +80,7 @@ def api_protocol_map() -> Dict[Api, Any]:
Api.post_training: PostTraining, Api.post_training: PostTraining,
Api.tool_groups: ToolGroups, Api.tool_groups: ToolGroups,
Api.tool_runtime: ToolRuntime, Api.tool_runtime: ToolRuntime,
Api.files: Files,
} }

View file

@ -12,7 +12,8 @@ from llama_stack.apis.common.content_types import (
InterleavedContent, InterleavedContent,
InterleavedContentItem, InterleavedContentItem,
) )
from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import DatasetPurpose, DataSource from llama_stack.apis.datasets import DatasetPurpose, DataSource
from llama_stack.apis.eval import BenchmarkConfig, Eval, EvaluateResponse, Job from llama_stack.apis.eval import BenchmarkConfig, Eval, EvaluateResponse, Job
from llama_stack.apis.inference import ( from llama_stack.apis.inference import (
@ -45,11 +46,11 @@ from llama_stack.apis.scoring import (
from llama_stack.apis.shields import Shield from llama_stack.apis.shields import Shield
from llama_stack.apis.telemetry import MetricEvent, MetricInResponse, Telemetry from llama_stack.apis.telemetry import MetricEvent, MetricInResponse, Telemetry
from llama_stack.apis.tools import ( from llama_stack.apis.tools import (
ListToolDefsResponse,
RAGDocument, RAGDocument,
RAGQueryConfig, RAGQueryConfig,
RAGQueryResult, RAGQueryResult,
RAGToolRuntime, RAGToolRuntime,
ToolDef,
ToolRuntime, ToolRuntime,
) )
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
@ -497,7 +498,7 @@ class DatasetIORouter(DatasetIO):
dataset_id: str, dataset_id: str,
start_index: Optional[int] = None, start_index: Optional[int] = None,
limit: Optional[int] = None, limit: Optional[int] = None,
) -> IterrowsResponse: ) -> PaginatedResponse:
logger.debug( logger.debug(
f"DatasetIORouter.iterrows: {dataset_id}, {start_index=} {limit=}", f"DatasetIORouter.iterrows: {dataset_id}, {start_index=} {limit=}",
) )
@ -706,6 +707,6 @@ class ToolRuntimeRouter(ToolRuntime):
async def list_runtime_tools( async def list_runtime_tools(
self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
) -> List[ToolDef]: ) -> ListToolDefsResponse:
logger.debug(f"ToolRuntimeRouter.list_runtime_tools: {tool_group_id}") logger.debug(f"ToolRuntimeRouter.list_runtime_tools: {tool_group_id}")
return await self.routing_table.get_provider_impl(tool_group_id).list_tools(tool_group_id, mcp_endpoint) return await self.routing_table.get_provider_impl(tool_group_id).list_tools(tool_group_id, mcp_endpoint)

View file

@ -568,7 +568,7 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
tool_defs = await self.impls_by_provider_id[provider_id].list_runtime_tools(toolgroup_id, mcp_endpoint) tool_defs = await self.impls_by_provider_id[provider_id].list_runtime_tools(toolgroup_id, mcp_endpoint)
tool_host = ToolHost.model_context_protocol if mcp_endpoint else ToolHost.distribution tool_host = ToolHost.model_context_protocol if mcp_endpoint else ToolHost.distribution
for tool_def in tool_defs: for tool_def in tool_defs.data:
tools.append( tools.append(
ToolWithACL( ToolWithACL(
identifier=tool_def.name, identifier=tool_def.name,

View file

@ -15,7 +15,7 @@ import warnings
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from importlib.metadata import version as parse_version from importlib.metadata import version as parse_version
from pathlib import Path from pathlib import Path
from typing import Any, List, Union from typing import Any, List, Optional, Union
import yaml import yaml
from fastapi import Body, FastAPI, HTTPException, Request from fastapi import Body, FastAPI, HTTPException, Request
@ -294,11 +294,17 @@ class ClientVersionMiddleware:
return await self.app(scope, receive, send) return await self.app(scope, receive, send)
def main(): def main(args: Optional[argparse.Namespace] = None):
"""Start the LlamaStack server.""" """Start the LlamaStack server."""
parser = argparse.ArgumentParser(description="Start the LlamaStack server.") parser = argparse.ArgumentParser(description="Start the LlamaStack server.")
parser.add_argument( parser.add_argument(
"--yaml-config", "--yaml-config",
dest="config",
help="(Deprecated) Path to YAML configuration file - use --config instead",
)
parser.add_argument(
"--config",
dest="config",
help="Path to YAML configuration file", help="Path to YAML configuration file",
) )
parser.add_argument( parser.add_argument(
@ -328,12 +334,24 @@ def main():
required="--tls-keyfile" in sys.argv, required="--tls-keyfile" in sys.argv,
) )
args = parser.parse_args() # Determine whether the server args are being passed by the "run" command, if this is the case
# the args will be passed as a Namespace object to the main function, otherwise they will be
# parsed from the command line
if args is None:
args = parser.parse_args()
# Check for deprecated argument usage
if "--yaml-config" in sys.argv:
warnings.warn(
"The '--yaml-config' argument is deprecated and will be removed in a future version. Use '--config' instead.",
DeprecationWarning,
stacklevel=2,
)
log_line = "" log_line = ""
if args.yaml_config: if args.config:
# if the user provided a config file, use it, even if template was specified # if the user provided a config file, use it, even if template was specified
config_file = Path(args.yaml_config) config_file = Path(args.config)
if not config_file.exists(): if not config_file.exists():
raise ValueError(f"Config file {config_file} does not exist") raise ValueError(f"Config file {config_file} does not exist")
log_line = f"Using config file: {config_file}" log_line = f"Using config file: {config_file}"

View file

@ -13,6 +13,7 @@ LLAMA_CHECKPOINT_DIR=${LLAMA_CHECKPOINT_DIR:-}
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-} LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-} TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
PYPI_VERSION=${PYPI_VERSION:-} PYPI_VERSION=${PYPI_VERSION:-}
VIRTUAL_ENV=${VIRTUAL_ENV:-}
set -euo pipefail set -euo pipefail
@ -69,22 +70,25 @@ while [[ $# -gt 0 ]]; do
;; ;;
esac esac
done done
PYTHON_BINARY="python" PYTHON_BINARY="python"
case "$env_type" in case "$env_type" in
"venv") "venv")
# Activate virtual environment if [[ -n "$VIRTUAL_ENV" && "$VIRTUAL_ENV" == "$env_path_or_name" ]]; then
if [ ! -d "$env_path_or_name" ]; then echo -e "${GREEN}Virtual environment already activated${NC}" >&2
echo -e "${RED}Error: Virtual environment not found at $env_path_or_name${NC}" >&2 else
exit 1 # Activate virtual environment
fi if [ ! -d "$env_path_or_name" ]; then
echo -e "${RED}Error: Virtual environment not found at $env_path_or_name${NC}" >&2
exit 1
fi
if [ ! -f "$env_path_or_name/bin/activate" ]; then if [ ! -f "$env_path_or_name/bin/activate" ]; then
echo -e "${RED}Error: Virtual environment activate binary not found at $env_path_or_name/bin/activate${NC}" >&2 echo -e "${RED}Error: Virtual environment activate binary not found at $env_path_or_name/bin/activate${NC}" >&2
exit 1 exit 1
fi fi
source "$env_path_or_name/bin/activate" source "$env_path_or_name/bin/activate"
fi
;; ;;
"conda") "conda")
if ! is_command_available conda; then if ! is_command_available conda; then

View file

@ -58,6 +58,7 @@ def rag_chat_page():
llama_stack_api.client.tool_runtime.rag_tool.insert( llama_stack_api.client.tool_runtime.rag_tool.insert(
vector_db_id=vector_db_name, # Use the user-provided name vector_db_id=vector_db_name, # Use the user-provided name
documents=documents, documents=documents,
chunk_size_in_tokens=512,
) )
st.success("Vector database created successfully!") st.success("Vector database created successfully!")

View file

@ -18,15 +18,19 @@ def preserve_contexts_async_generator(
This is needed because we start a new asyncio event loop for each streaming request, This is needed because we start a new asyncio event loop for each streaming request,
and we need to preserve the context across the event loop boundary. and we need to preserve the context across the event loop boundary.
""" """
# Capture initial context values
initial_context_values = {context_var.name: context_var.get() for context_var in context_vars}
async def wrapper() -> AsyncGenerator[T, None]: async def wrapper() -> AsyncGenerator[T, None]:
while True: while True:
try: try:
item = await gen.__anext__() # Restore context values before any await
context_values = {context_var.name: context_var.get() for context_var in context_vars}
yield item
for context_var in context_vars: for context_var in context_vars:
_ = context_var.set(context_values[context_var.name]) context_var.set(initial_context_values[context_var.name])
item = await gen.__anext__()
yield item
except StopAsyncIteration: except StopAsyncIteration:
break break
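As a usage sketch (the context variable name is illustrative and the import path for the wrapper is an assumption):

```python
import asyncio
from contextvars import ContextVar
from typing import AsyncGenerator

# Import path assumed; the wrapper is the function shown above.
from llama_stack.distribution.utils.context import preserve_contexts_async_generator

request_id: ContextVar[str] = ContextVar("request_id", default="unset")


async def numbers() -> AsyncGenerator[int, None]:
    for i in range(3):
        yield i


async def main() -> None:
    request_id.set("req-123")
    # Values captured when the generator is wrapped are restored before each step,
    # so they survive being driven from a different task or event loop.
    wrapped = preserve_contexts_async_generator(numbers(), [request_id])
    async for item in wrapped:
        print(item, request_id.get())


asyncio.run(main())
```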

View file

@ -139,7 +139,7 @@ def setup_logging(category_levels: Dict[str, int], log_file: str | None) -> None
category_levels (Dict[str, int]): A dictionary mapping categories to their log levels. category_levels (Dict[str, int]): A dictionary mapping categories to their log levels.
log_file (str): Path to a log file to additionally pipe the logs into log_file (str): Path to a log file to additionally pipe the logs into
""" """
log_format = "[dim]%(asctime)s %(name)s:%(lineno)d[/] [yellow dim]%(category)s[/]: %(message)s" log_format = "%(asctime)s %(name)s:%(lineno)d %(category)s: %(message)s"
class CategoryFilter(logging.Filter): class CategoryFilter(logging.Filter):
"""Ensure category is always present in log records.""" """Ensure category is always present in log records."""

View file

@ -195,10 +195,22 @@ register_schema(SamplingStrategy, name="SamplingStrategy")
@json_schema_type @json_schema_type
class SamplingParams(BaseModel): class SamplingParams(BaseModel):
"""Sampling parameters.
:param strategy: The sampling strategy.
:param max_tokens: The maximum number of tokens that can be generated in the completion. The token count of
your prompt plus max_tokens cannot exceed the model's context length.
:param repetition_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens
based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
:param stop: Up to 4 sequences where the API will stop generating further tokens.
The returned text will not contain the stop sequence.
"""
strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy) strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy)
max_tokens: Optional[int] = 0 max_tokens: Optional[int] = 0
repetition_penalty: Optional[float] = 1.0 repetition_penalty: Optional[float] = 1.0
stop: Optional[List[str]] = None
class CheckpointQuantizationFormat(Enum): class CheckpointQuantizationFormat(Enum):
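For instance, sampling parameters that cap generation and stop on a custom sequence might be constructed as in this sketch; the strategy class name and import path are assumptions.

```python
from llama_stack.models.llama.datatypes import SamplingParams, TopPSamplingStrategy

params = SamplingParams(
    strategy=TopPSamplingStrategy(temperature=0.7, top_p=0.9),
    max_tokens=256,
    repetition_penalty=1.1,
    stop=["</answer>"],
)
```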

View file

@ -57,11 +57,7 @@ from llama_stack.apis.inference import (
UserMessage, UserMessage,
) )
from llama_stack.apis.safety import Safety from llama_stack.apis.safety import Safety
from llama_stack.apis.tools import ( from llama_stack.apis.tools import ToolGroups, ToolInvocationResult, ToolRuntime
ToolGroups,
ToolInvocationResult,
ToolRuntime,
)
from llama_stack.apis.vector_io import VectorIO from llama_stack.apis.vector_io import VectorIO
from llama_stack.log import get_logger from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import ( from llama_stack.models.llama.datatypes import (
@ -459,7 +455,15 @@ class ChatAgent(ShieldRunnerMixin):
contexts.append(raw_document_text) contexts.append(raw_document_text)
attached_context = "\n".join(contexts) attached_context = "\n".join(contexts)
input_messages[-1].context = attached_context if isinstance(input_messages[-1].content, str):
input_messages[-1].content += attached_context
elif isinstance(input_messages[-1].content, list):
input_messages[-1].content.append(TextContentItem(text=attached_context))
else:
input_messages[-1].content = [
input_messages[-1].content,
TextContentItem(text=attached_context),
]
session_info = await self.storage.get_session_info(session_id) session_info = await self.storage.get_session_info(session_id)
# if the session has a memory bank id, let the memory tool use it # if the session has a memory bank id, let the memory tool use it

View file

@ -7,9 +7,11 @@ from typing import Any, Dict, List, Optional
import pandas import pandas
from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Dataset from llama_stack.apis.datasets import Dataset
from llama_stack.providers.datatypes import DatasetsProtocolPrivate from llama_stack.providers.datatypes import DatasetsProtocolPrivate
from llama_stack.providers.utils.datasetio.pagination import paginate_records
from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_uri from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_uri
from llama_stack.providers.utils.kvstore import kvstore_impl from llama_stack.providers.utils.kvstore import kvstore_impl
@ -92,24 +94,13 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
dataset_id: str, dataset_id: str,
start_index: Optional[int] = None, start_index: Optional[int] = None,
limit: Optional[int] = None, limit: Optional[int] = None,
) -> IterrowsResponse: ) -> PaginatedResponse:
dataset_def = self.dataset_infos[dataset_id] dataset_def = self.dataset_infos[dataset_id]
dataset_impl = PandasDataframeDataset(dataset_def) dataset_impl = PandasDataframeDataset(dataset_def)
await dataset_impl.load() await dataset_impl.load()
start_index = start_index or 0 records = dataset_impl.df.to_dict("records")
return paginate_records(records, start_index, limit)
if limit is None or limit == -1:
end = len(dataset_impl)
else:
end = min(start_index + limit, len(dataset_impl))
rows = dataset_impl[start_index:end]
return IterrowsResponse(
data=rows,
next_start_index=end if end < len(dataset_impl) else None,
)
async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
dataset_def = self.dataset_infos[dataset_id] dataset_def = self.dataset_infos[dataset_id]

View file

@ -28,6 +28,11 @@ class TelemetryConfig(BaseModel):
default="http://localhost:4318/v1/metrics", default="http://localhost:4318/v1/metrics",
description="The OpenTelemetry collector endpoint URL for metrics", description="The OpenTelemetry collector endpoint URL for metrics",
) )
service_name: str = Field(
# service name is always the same, use zero-width space to avoid clutter
default="",
description="The service name to use for telemetry",
)
sinks: List[TelemetrySink] = Field( sinks: List[TelemetrySink] = Field(
default=[TelemetrySink.CONSOLE, TelemetrySink.SQLITE], default=[TelemetrySink.CONSOLE, TelemetrySink.SQLITE],
description="List of telemetry sinks to enable (possible values: otel, sqlite, console)", description="List of telemetry sinks to enable (possible values: otel, sqlite, console)",
@ -47,6 +52,7 @@ class TelemetryConfig(BaseModel):
@classmethod @classmethod
def sample_run_config(cls, __distro_dir__: str, db_name: str = "trace_store.db") -> Dict[str, Any]: def sample_run_config(cls, __distro_dir__: str, db_name: str = "trace_store.db") -> Dict[str, Any]:
return { return {
"service_name": "${env.OTEL_SERVICE_NAME:}",
"sinks": "${env.TELEMETRY_SINKS:console,sqlite}", "sinks": "${env.TELEMETRY_SINKS:console,sqlite}",
"sqlite_db_path": "${env.SQLITE_DB_PATH:" + __distro_dir__ + "/" + db_name + "}", "sqlite_db_path": "${env.SQLITE_DB_PATH:" + __distro_dir__ + "/" + db_name + "}",
} }

Some files were not shown because too many files have changed in this diff.