Commit 9c9f9577e2 — Merge branch 'main' into feat/litellm_sambanova_usage
Repository: https://github.com/meta-llama/llama-stack.git (mirror, synced 2025-08-03 17:29:01 +00:00)
173 changed files with 3,073 additions and 3,118 deletions
.github/CODEOWNERS (2 changes)

@@ -2,4 +2,4 @@
 # These owners will be the default owners for everything in
 # the repo. Unless a later match takes precedence,
-* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan @SLR722
+* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan @SLR722 @leseb
.github/workflows/integration-tests.yml (17 changes)

@@ -25,7 +25,8 @@ jobs:
       matrix:
         # Listing tests manually since some of them currently fail
         # TODO: generate matrix list from tests/integration when fixed
-        test-type: [inference, datasets, inspect, scoring, post_training, providers]
+        test-type: [agents, inference, datasets, inspect, scoring, post_training, providers]
+        client-type: [library, http]
       fail-fast: false # we want to run all tests regardless of failure

     steps:

@@ -54,6 +55,8 @@ jobs:
           uv sync --extra dev --extra test
           uv pip install ollama faiss-cpu
           # always test against the latest version of the client
+          # TODO: this is not necessarily a good idea. we need to test against both published and latest
+          # to find out backwards compatibility issues.
           uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
           uv pip install -e .
           llama stack build --template ollama --image-type venv

@@ -74,6 +77,7 @@ jobs:
            exit 1

       - name: Start Llama Stack server in background
+        if: matrix.client-type == 'http'
         env:
           INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
         run: |

@@ -81,6 +85,7 @@ jobs:
           nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv > server.log 2>&1 &

       - name: Wait for Llama Stack server to be ready
+        if: matrix.client-type == 'http'
         run: |
           echo "Waiting for Llama Stack server..."
           for i in {1..30}; do

@@ -98,4 +103,12 @@ jobs:
         env:
           INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
         run: |
-          uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=ollama --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2
+          if [ "${{ matrix.client-type }}" == "library" ]; then
+            stack_config="ollama"
+          else
+            stack_config="http://localhost:8321"
+          fi
+          uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
+            -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
+            --text-model="meta-llama/Llama-3.2-3B-Instruct" \
+            --embedding-model=all-MiniLM-L6-v2
CHANGELOG.md (71 additions) — new release entries added above the existing v0.1.6 entry:

# v0.1.8
Published on: 2025-03-24T01:28:50Z

# v0.1.8 Release Notes

### Build and Test Agents
* Safety: Integrated NVIDIA as a safety provider.
* VectorDB: Added Qdrant as an inline provider.
* Agents: Added support for multiple tool groups in agents.
* Agents: Simplified imports for Agents in the client package.

### Agent Evals and Model Customization
* Introduced DocVQA and IfEval benchmarks.

### Deploying and Monitoring Agents
* Introduced a Containerfile and image workflow for the Playground.
* Implemented support for Bearer (API Key) authentication.
* Added attribute-based access control for resources.
* Fixes on docker deployments: use --pull always and standardized the default port to 8321.
* Deprecated: /v1/inspect/providers; use /v1/providers/ instead.

### Better Engineering
* Consolidated scripts under the ./scripts directory.
* Addressed mypy violations in various modules.
* Added Dependabot scans for Python dependencies.
* Implemented a scheduled workflow to update the changelog automatically.
* Enforced concurrency to reduce CI loads.

### New Contributors
* @cmodi-meta made their first contribution in https://github.com/meta-llama/llama-stack/pull/1650
* @jeffmaury made their first contribution in https://github.com/meta-llama/llama-stack/pull/1671
* @derekhiggins made their first contribution in https://github.com/meta-llama/llama-stack/pull/1698
* @Bobbins228 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1745

**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.7...v0.1.8

---

# v0.1.7
Published on: 2025-03-14T22:30:51Z

## 0.1.7 Release Notes

### Build and Test Agents
* Inference: Refactored ImageType to LlamaStackImageType.
* Inference: Added tests to measure TTFT.
* Inference: Brought back usage metrics.
* Agents: Added endpoints for getting an agent, listing agents, and listing sessions.
* Agents: Automated conversion of type hints in client tools to the LiteLLM format.
* Agents: Deprecated ToolResponseMessage in the agent.resume API.
* Added a Provider API for listing and inspecting provider info.

### Agent Evals and Model Customization
* Eval: Added new eval benchmarks Math 500 and BFCL v3.

### Deploying and Monitoring Agents
* Telemetry: Fixed tracing to work across coroutines.

### Better Engineering
* Display code coverage for unit tests.
* Updated call sites (inference, tool calls, agents) to use async non-blocking calls.
* Unit tests also run on Python 3.11, 3.12, and 3.13.
* Added Ollama inference to the integration-tests CI.
* Improved documentation across examples, testing, the CLI, and the providers table.

---

Existing entries below are unchanged:

# v0.1.6
Published on: 2025-03-08T04:35:08Z
@@ -81,12 +81,14 @@ Note that you can create a dotenv file `.env` that includes necessary environment variables:
 LLAMA_STACK_BASE_URL=http://localhost:8321
 LLAMA_STACK_CLIENT_LOG=debug
 LLAMA_STACK_PORT=8321
-LLAMA_STACK_CONFIG=
+LLAMA_STACK_CONFIG=<provider-name>
+TAVILY_SEARCH_API_KEY=
+BRAVE_SEARCH_API_KEY=
 ```

 And then use this dotenv file when running client SDK tests via the following:
 ```bash
-uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py
+uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
 ```

 ## Pre-commit Hooks

@@ -124,6 +126,10 @@ source .venv/bin/activate
 PYTHON_VERSION=3.13 ./scripts/unit-tests.sh
 ```

+## Running integration tests
+
+You can run integration tests following the instructions [here](tests/integration/README.md).
+
 ## Adding a new dependency to the project

 To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run:
@@ -1,5 +1,5 @@
 include pyproject.toml
-include distributions/dependencies.json
+include llama_stack/templates/dependencies.json
 include llama_stack/models/llama/llama3/tokenizer.model
 include llama_stack/distribution/*.sh
 include llama_stack/cli/scripts/*.sh
Deleted files — the per-distribution config tree (build.yaml / run.yaml symlinks into llama_stack/templates/, Docker Compose files, and helper scripts) is removed:

* Symlinks into llama_stack/templates/ for bedrock, cerebras, fireworks, meta-reference-gpu, meta-reference-quantized-gpu, ollama, nvidia, remote-vllm, sambanova, tgi, together, and inline-vllm (build.yaml and run.yaml where present, plus run-with-safety.yaml for meta-reference-gpu, ollama, remote-vllm, and tgi).
* Compose files for the bedrock, cerebras, fireworks, nvidia, sambanova, and together distributions: each ran a single llamastack service on the corresponding distribution image, mounted ~/.llama (and, except for fireworks and together, a run.yaml) into the container, exposed port 8321 (5000 for sambanova), started the server via `python -m llama_stack.distribution.server.server` (fireworks and together launched with `--template` instead of a mounted run.yaml), and used an on-failure restart policy.
* Compose files for meta-reference-gpu, meta-reference-quantized-gpu, and inline-vllm: the same llamastack service plus NVIDIA GPU reservations (`driver: nvidia`, `capabilities: [gpu]`, `runtime: nvidia`) and CUDA_VISIBLE_DEVICES pinning.
* A Dell TGI Compose file pairing a text-generation-inference service (registry.dell.huggingface.co enterprise image, port 5009, NUM_SHARD=4) with a llamastack service on the distribution-tgi image, together with its run.yaml wiring remote::tgi inference, inline llama-guard and prompt-guard safety, faiss memory, meta-reference agents with a sqlite persistence store, and meta-reference telemetry.
* The ollama Compose file (ollama, ollama-init, and llamastack services) and its pull-models.sh helper, which preloaded ${INFERENCE_MODEL} and ${SAFETY_MODEL} via `ollama run` before the stack started.
* Compose files for remote-vllm and tgi that ran separate inference and safety model servers (vllm/vllm-openai and ghcr.io/huggingface/text-generation-inference images) with healthchecks and GPU reservations behind a llamastack service (test-0.0.52rc3 images), using a `sleep 60` entrypoint hack to wait for the backends.
* Standalone run.yaml configs for the meta-reference-quantized-gpu distribution (int4-qlora Llama3.2-3B-Instruct plus an unquantized Llama-Guard-3-1B, llama-guard/prompt-guard safety, meta-reference memory, sqlite-backed agents, telemetry) and for inline-vllm (vllm-inference and vllm-inference-safety providers with per-provider gpu_memory_utilization, plus the same safety/memory/agents/telemetry wiring).
* A runpod build.yaml (remote::runpod inference with meta-reference memory, safety, agents, and telemetry).
* One deleted binary file is not shown.
docs/_static/llama-stack-spec.html (155 changes, generated file)

* The file DELETE response now returns a bare 200 ("description": "OK") instead of an application/json FileResponse body.
* The dataset iterrows endpoint now references "#/components/schemas/PaginatedResponse" instead of IterrowsResponse, and its description changes from "Uses cursor-based pagination" to offset-based pagination: start_index is the 0-based starting index (None starts from the beginning), limit is the number of items to return (None or -1 returns all), and the response carries data (the items for the current page) and has_more (whether more items follow).
* The list-tools endpoint now returns application/json with a ListToolDefsResponse instead of application/jsonl ToolDef, and a new ListToolDefsResponse schema is added (required data: array of ToolDef).
* SamplingParams gains field descriptions for strategy, max_tokens ("The token count of your prompt plus max_tokens cannot exceed the model's context length."), and repetition_penalty, plus a new "stop" property (up to 4 string sequences at which the API stops generating; the returned text does not contain the stop sequence) and a schema-level description "Sampling parameters."
* The FileResponse schema (bucket, key, mime_type, url, bytes, created_at) is moved to a later position in the components section; its definition is unchanged.
* The JobStatus enum gains a "cancelled" value alongside completed, in_progress, failed, and scheduled (in both places the enum appears).
* IterrowsResponse is renamed to PaginatedResponse: the integer next_start_index field is replaced by a required boolean has_more, data is now described as "The list of items for the current page", and the schema description becomes "A generic paginated response that follows a simple format."
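The new `stop` field slots into the existing sampling-params payload alongside `strategy`, `max_tokens`, and `repetition_penalty`. Below is a minimal sketch of a request that uses it from Python; the `LlamaStackClient` import, the `chat_completion` call shape, and the model name are assumptions based on the client of this era, not part of the diff.

```python
# Hypothetical usage sketch: pass the new `stop` sequences via sampling_params.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local server

response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "List three llama facts."}],
    sampling_params={
        "strategy": {"type": "greedy"},  # SamplingStrategy, as described in the schema above
        "max_tokens": 256,               # prompt tokens + max_tokens must fit the context length
        "repetition_penalty": 1.1,
        "stop": ["\n\n", "4."],          # new field: up to 4 stop sequences, not returned in the output
    },
)
print(response.completion_message.content)
```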
docs/_static/llama-stack-spec.yaml (138 changes, generated file)

The YAML rendering of the spec picks up the same regeneration: the file DELETE response drops its FileResponse content block; the iterrows path now references PaginatedResponse and documents offset-based pagination (start_index and limit in the request, data and has_more in the response); the list-tools path returns application/json ListToolDefsResponse instead of application/jsonl ToolDef; SamplingParams gains the strategy/max_tokens/repetition_penalty descriptions and the new stop array; FileResponse moves to its new position unchanged; both JobStatus enums gain cancelled; IterrowsResponse becomes PaginatedResponse (next_start_index: integer replaced by the required has_more: boolean); and the ListToolDefsResponse schema is added.
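For consumers, the practical difference is that a page no longer tells you the next start index; the caller advances the offset itself and stops when `has_more` is false. A minimal sketch under that assumption follows — `fetch_rows` is a hypothetical stand-in for whatever call hits the iterrows route, not an API from the diff; it only needs to return the PaginatedResponse shape (`data`, `has_more`) described above.

```python
from typing import Any, Callable, Iterator

def iterate_all_rows(
    fetch_rows: Callable[..., Any],  # hypothetical: fetch_rows(dataset_id=..., start_index=..., limit=...)
    dataset_id: str,
    page_size: int = 100,
) -> Iterator[Any]:
    """Walk an offset-paginated dataset page by page."""
    start_index = 0
    while True:
        page = fetch_rows(dataset_id=dataset_id, start_index=start_index, limit=page_size)
        yield from page.data
        if not page.has_more:            # offset-based pagination: no next_start_index anymore
            break
        start_index += len(page.data)    # advance the offset by the rows actually returned
```

Compared with the old cursor-style `next_start_index`, the caller now owns the offset arithmetic, which is what lets `PaginatedResponse` stay generic across APIs.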
New binary files (not shown):

* docs/_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png (33 KiB)
* docs/_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png (37 KiB)
* docs/_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png (56 KiB)
File diff suppressed because one or more lines are too long
|
@ -963,16 +963,19 @@
 "\n",
 "client.benchmarks.register(\n",
 " benchmark_id=\"meta-reference::mmmu\",\n",
+" # Note: we can use any value as `dataset_id` because we'll be using the `evaluate_rows` API which accepts the \n",
+" # `input_rows` argument and does not fetch data from the dataset.\n",
 " dataset_id=f\"mmmu-{subset}-{split}\",\n",
-" scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
+" # Note: for the same reason as above, we can use any value as `scoring_functions`.\n",
+" scoring_functions=[],\n",
 ")\n",
 "\n",
-"response = client.eval.evaluate_rows_alpha(\n",
+"response = client.eval.evaluate_rows(\n",
 " benchmark_id=\"meta-reference::mmmu\",\n",
 " input_rows=eval_rows,\n",
+" # Note: Here we define the actual scoring functions.\n",
 " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
 " benchmark_config={\n",
-" \"type\": \"benchmark\",\n",
 " \"eval_candidate\": {\n",
 " \"type\": \"model\",\n",
 " \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@ -1139,12 +1142,11 @@
 " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 ")\n",
 "\n",
-"response = client.eval.evaluate_rows_alpha(\n",
+"response = client.eval.evaluate_rows(\n",
 " benchmark_id=\"meta-reference::simpleqa\",\n",
 " input_rows=eval_rows.data,\n",
 " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 " benchmark_config={\n",
-" \"type\": \"benchmark\",\n",
 " \"eval_candidate\": {\n",
 " \"type\": \"model\",\n",
 " \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@ -1288,12 +1290,11 @@
 " \"enable_session_persistence\": False,\n",
 "}\n",
 "\n",
-"response = client.eval.evaluate_rows_alpha(\n",
+"response = client.eval.evaluate_rows(\n",
 " benchmark_id=\"meta-reference::simpleqa\",\n",
 " input_rows=eval_rows.data,\n",
 " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
 " benchmark_config={\n",
-" \"type\": \"benchmark\",\n",
 " \"eval_candidate\": {\n",
 " \"type\": \"agent\",\n",
 " \"config\": agent_config,\n",
@ -21,7 +21,7 @@ from llama_stack.distribution.stack import LlamaStack  # noqa: E402

 from .pyopenapi.options import Options  # noqa: E402
 from .pyopenapi.specification import Info, Server  # noqa: E402
-from .pyopenapi.utility import Specification, validate_api_method_return_types  # noqa: E402
+from .pyopenapi.utility import Specification, validate_api  # noqa: E402


 def str_presenter(dumper, data):
@ -40,8 +40,7 @@ def main(output_dir: str):
     raise ValueError(f"Directory {output_dir} does not exist")

     # Validate API protocols before generating spec
-    print("Validating API method return types...")
-    return_type_errors = validate_api_method_return_types()
+    return_type_errors = validate_api()
     if return_type_errors:
         print("\nAPI Method Return Type Validation Errors:\n")
         for error in return_type_errors:
@ -7,10 +7,9 @@
 import json
 import typing
 import inspect
-import os
 from pathlib import Path
 from typing import TextIO
-from typing import Any, Dict, List, Optional, Protocol, Type, Union, get_type_hints, get_origin, get_args
+from typing import Any, List, Optional, Union, get_type_hints, get_origin, get_args

 from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
 from llama_stack.distribution.resolver import api_protocol_map
@ -125,29 +124,89 @@ def is_optional_type(type_: Any) -> bool:
     return origin is Optional or (origin is Union and type(None) in args)


-def validate_api_method_return_types() -> List[str]:
-    """Validate that all API methods have proper return types."""
+def _validate_api_method_return_type(method) -> str | None:
+    hints = get_type_hints(method)
+
+    if 'return' not in hints:
+        return "has no return type annotation"
+
+    return_type = hints['return']
+    if is_optional_type(return_type):
+        return "returns Optional type where a return value is mandatory"
+
+
+def _validate_api_method_doesnt_return_list(method) -> str | None:
+    hints = get_type_hints(method)
+
+    if 'return' not in hints:
+        return "has no return type annotation"
+
+    return_type = hints['return']
+    if get_origin(return_type) is list:
+        return "returns a list where a PaginatedResponse or List*Response object is expected"
+
+
+def _validate_api_delete_method_returns_none(method) -> str | None:
+    hints = get_type_hints(method)
+
+    if 'return' not in hints:
+        return "has no return type annotation"
+
+    return_type = hints['return']
+    if return_type is not None and return_type is not type(None):
+        return "does not return None where None is mandatory"
+
+
+def _validate_list_parameters_contain_data(method) -> str | None:
+    hints = get_type_hints(method)
+
+    if 'return' not in hints:
+        return "has no return type annotation"
+
+    return_type = hints['return']
+    if not inspect.isclass(return_type):
+        return
+
+    if not return_type.__name__.startswith('List'):
+        return
+
+    if 'data' not in return_type.model_fields:
+        return "does not have a mandatory data attribute containing the list of objects"
+
+
+_VALIDATORS = {
+    "GET": [
+        _validate_api_method_return_type,
+        _validate_list_parameters_contain_data,
+        _validate_api_method_doesnt_return_list,
+    ],
+    "DELETE": [
+        _validate_api_delete_method_returns_none,
+    ],
+}
+
+
+def _get_methods_by_type(protocol, method_type: str):
+    members = inspect.getmembers(protocol, predicate=inspect.isfunction)
+    return {
+        method_name: method
+        for method_name, method in members
+        if (webmethod := getattr(method, '__webmethod__', None))
+        if webmethod and webmethod.method == method_type
+    }
+
+
+def validate_api() -> List[str]:
+    """Validate the API protocols."""
     errors = []
     protocols = api_protocol_map()

-    for protocol_name, protocol in protocols.items():
-        methods = inspect.getmembers(protocol, predicate=inspect.isfunction)
-        for method_name, method in methods:
-            if not hasattr(method, '__webmethod__'):
-                continue
-            # Only check GET methods
-            if method.__webmethod__.method != "GET":
-                continue
-            hints = get_type_hints(method)
-            if 'return' not in hints:
-                errors.append(f"Method {protocol_name}.{method_name} has no return type annotation")
-            else:
-                return_type = hints['return']
-                if is_optional_type(return_type):
-                    errors.append(f"Method {protocol_name}.{method_name} returns Optional type")
+    for target, validators in _VALIDATORS.items():
+        for protocol_name, protocol in protocols.items():
+            for validator in validators:
+                for method_name, method in _get_methods_by_type(protocol, target).items():
+                    err = validator(method)
+                    if err:
+                        errors.append(f"Method {protocol_name}.{method_name} {err}")

     return errors
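To make the refactored validation above concrete, here is a small self-contained sketch of the idea behind the GET validators. The `list_widgets` method and the validator name are illustrative only, not the project's actual code; the real validators additionally read the `__webmethod__` metadata attached by the `@webmethod` decorator.

```python
# Sketch: a GET endpoint that returns a bare list (instead of a List*Response
# or PaginatedResponse wrapper) should be flagged by the validator.
from typing import List, get_origin, get_type_hints


def _validate_get_return(method) -> str | None:
    hints = get_type_hints(method)
    if "return" not in hints:
        return "has no return type annotation"
    if get_origin(hints["return"]) is list:
        return "returns a bare list where a List*Response object is expected"
    return None


async def list_widgets() -> List[dict]:  # a deliberately bad GET method
    return []


print(_validate_get_return(list_widgets))  # -> "returns a bare list ..."
```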
@ -1,4 +1,4 @@
-# Building AI Applications
+# Building AI Applications (Examples)

 Llama Stack provides all the building blocks needed to create sophisticated AI applications.

@ -1,4 +1,4 @@
-## Using Retrieval Augmented Generation (RAG)
+## Retrieval Augmented Generation (RAG)

 RAG enables your applications to reference and recall information from previous interactions or external documents.
@ -45,14 +45,16 @@ Here's an example that sends telemetry signals to all three sink types. Your con
 - provider_id: meta-reference
 provider_type: inline::meta-reference
 config:
-sinks: ['console', 'sqlite', 'otel']
-otel_endpoint: "http://localhost:4318/v1/traces"
+sinks: ['console', 'sqlite', 'otel_trace', 'otel_metric']
+otel_trace_endpoint: "http://localhost:4318/v1/traces"
+otel_metric_endpoint: "http://localhost:4318/v1/metrics"
 sqlite_db_path: "/path/to/telemetry.db"
 ```

 ### Jaeger to visualize traces

-The `otel` sink works with any service compatible with the OpenTelemetry collector. Let's use Jaeger to visualize this data.
+The `otel` sink works with any service compatible with the OpenTelemetry collector, traces and metrics has two separate endpoints.
+Let's use Jaeger to visualize this data.

 Start a Jaeger instance with the OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686 using the following command:
@ -16,6 +16,7 @@ from docutils import nodes
 from pathlib import Path
 import requests
 import json
+from datetime import datetime

 # Read version from pyproject.toml
 with Path(__file__).parent.parent.parent.joinpath("pyproject.toml").open("rb") as f:
@ -28,7 +29,7 @@ with Path(__file__).parent.parent.parent.joinpath("pyproject.toml").open("rb") a
 llama_stack_version_link = f"<a href='{llama_stack_version_url}'>release notes</a>"

 project = "llama-stack"
-copyright = "2025, Meta"
+copyright = f"{datetime.now().year}, Meta"
 author = "Meta"

 # -- General configuration ---------------------------------------------------
@ -37,6 +38,7 @@ author = "Meta"
 extensions = [
     "myst_parser",
     "sphinx_rtd_theme",
+    "sphinx_rtd_dark_mode",
     "sphinx_copybutton",
     "sphinx_tabs.tabs",
     "sphinx_design",
@ -103,6 +105,8 @@ source_suffix = {
 # html_theme = "alabaster"
 html_theme_options = {
     "canonical_url": "https://github.com/meta-llama/llama-stack",
+    'collapse_navigation': False,

     # "style_nav_header_background": "#c3c9d4",
 }
@ -1,14 +1,14 @@
-# Contributing to Llama Stack
-
-Start with the [Contributing Guide](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md) for some general tips. This section covers a few key topics in more detail.
+```{include} ../../../CONTRIBUTING.md
+```

+See the [Adding a New API Provider](new_api_provider.md) which describes how to add new API providers to the Stack.

-- [Adding a New API Provider](new_api_provider.md) describes adding new API providers to the Stack.
-- [Testing Llama Stack](testing.md) provides details about the testing framework and how to test providers and distributions.

 ```{toctree}
 :maxdepth: 1
 :hidden:

 new_api_provider
-testing
 ```
@ -67,7 +67,7 @@ options:
 Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config. (default:
 conda)
 --image-name IMAGE_NAME
-[for image-type=conda|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active Conda environment will be used if
+[for image-type=conda|container|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active Conda environment will be used if
 found. (default: None)
 --print-deps-only Print the dependencies for the stack only, without building the stack (default: False)
 --run Run the stack after building using the same image type, name, and other applicable arguments (default: False)
@ -1,4 +1,4 @@
-# Configuring a Stack
+# Configuring a "Stack"

 The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution:
@ -1,10 +1,12 @@
 # Using Llama Stack as a Library

-If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library. This avoids the overhead of setting up a server.
+## Setup Llama Stack without a Server
+If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library.
+This avoids the overhead of setting up a server.
 ```bash
 # setup
 uv pip install llama-stack
-llama stack build --template together --image-type venv
+llama stack build --template ollama --image-type venv
 ```

 ```python
@ -1,34 +1,18 @@
-# Starting a Llama Stack Server
+# Distributions Overview

-You can run a Llama Stack server in one of the following ways:
+A distribution is a pre-packaged set of Llama Stack components that can be deployed together.

-**As a Library**:
-
-This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and relying on an external inference service (eg. fireworks, together, groq, etc.) See [Using Llama Stack as a Library](importing_as_library)
-
-**Container**:
-
-Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
-
-**Conda**:
-
-If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
-
-**Kubernetes**:
-
-If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally. See [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
+This section provides an overview of the distributions available in Llama Stack.

 ```{toctree}
-:maxdepth: 1
-:hidden:
+:maxdepth: 3

 importing_as_library
-building_distro
 configuration
-selection
+list_of_distributions
 kubernetes_deployment
+building_distro
+on_device_distro
+remote_hosted_distro
+self_hosted_distro
 ```
@ -1,6 +1,9 @@
 # Kubernetes Deployment Guide

-Instead of starting the Llama Stack and vLLM servers locally. We can deploy them in a Kubernetes cluster. In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
+Instead of starting the Llama Stack and vLLM servers locally. We can deploy them in a Kubernetes cluster.

+### Prerequisites
+In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.

 First, create a local Kubernetes cluster via Kind:

@ -8,7 +11,7 @@ First, create a local Kubernetes cluster via Kind:
 kind create cluster --image kindest/node:v1.32.0 --name llama-stack-test
 ```

-Start vLLM server as a Kubernetes Pod and Service:
+First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model:

 ```bash
 cat <<EOF |kubectl apply -f -
@ -31,7 +34,13 @@ metadata:
 type: Opaque
 data:
 token: $(HF_TOKEN)
----
+```

+Next, start the vLLM server as a Kubernetes Deployment and Service:

+```bash
+cat <<EOF |kubectl apply -f -
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@ -47,28 +56,23 @@ spec:
 app.kubernetes.io/name: vllm
 spec:
 containers:
-- name: llama-stack
-image: $(VLLM_IMAGE)
-command:
-- bash
-- -c
-- |
-MODEL="meta-llama/Llama-3.2-1B-Instruct"
-MODEL_PATH=/app/model/$(basename $MODEL)
-huggingface-cli login --token $HUGGING_FACE_HUB_TOKEN
-huggingface-cli download $MODEL --local-dir $MODEL_PATH --cache-dir $MODEL_PATH
-python3 -m vllm.entrypoints.openai.api_server --model $MODEL_PATH --served-model-name $MODEL --port 8000
+- name: vllm
+image: vllm/vllm-openai:latest
+command: ["/bin/sh", "-c"]
+args: [
+"vllm serve meta-llama/Llama-3.2-1B-Instruct"
+]
+env:
+- name: HUGGING_FACE_HUB_TOKEN
+valueFrom:
+secretKeyRef:
+name: hf-token-secret
+key: token
 ports:
 - containerPort: 8000
 volumeMounts:
 - name: llama-storage
-mountPath: /app/model
+mountPath: /root/.cache/huggingface
-env:
-- name: HUGGING_FACE_HUB_TOKEN
-valueFrom:
-secretKeyRef:
-name: hf-token-secret
-key: token
 volumes:
 - name: llama-storage
 persistentVolumeClaim:
@ -127,6 +131,7 @@ EOF
 podman build -f /tmp/test-vllm-llama-stack/Containerfile.llama-stack-run-k8s -t llama-stack-run-k8s /tmp/test-vllm-llama-stack
 ```

+### Deploying Llama Stack Server in Kubernetes

 We can then start the Llama Stack server by deploying a Kubernetes Pod and Service:

@ -187,6 +192,7 @@ spec:
 EOF
 ```

+### Verifying the Deployment
 We can check that the LlamaStack server has started:

 ```bash
@ -1,4 +1,4 @@
-# List of Distributions
+# Available List of Distributions

 Here are a list of distributions you can use to start a Llama Stack server that are provided out of the box.
@ -9,6 +9,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
 | datasetio | `inline::localfs` |
 | eval | `inline::meta-reference` |
 | inference | `remote::nvidia` |
+| post_training | `remote::nvidia` |
 | safety | `remote::nvidia` |
 | scoring | `inline::basic` |
 | telemetry | `inline::meta-reference` |
@ -21,6 +22,12 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
 The following environment variables can be configured:

 - `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
+- `NVIDIA_USER_ID`: NVIDIA User ID (default: `llama-stack-user`)
+- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
+- `NVIDIA_ACCESS_POLICIES`: NVIDIA Access Policies (default: `{}`)
+- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
+- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
+- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
 - `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
 - `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
 - `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
@ -98,11 +98,14 @@ export INFERENCE_PORT=8000
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 export LLAMA_STACK_PORT=8321

+# You need a local checkout of llama-stack to run this, get it using
+# git clone https://github.com/meta-llama/llama-stack.git
+cd /path/to/llama-stack

 docker run \
--it \
 --pull always \
 -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
--v ./run.yaml:/root/my-run.yaml \
+-v ./llama_stack/templates/remote-vllm/run.yaml:/root/my-run.yaml \
 llamastack/distribution-remote-vllm \
 --yaml-config /root/my-run.yaml \
 --port $LLAMA_STACK_PORT \
@ -121,7 +124,6 @@ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 cd /path/to/llama-stack

 docker run \
--it \
 --pull always \
 -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
 -v ~/.llama:/root/.llama \
32 docs/source/distributions/starting_llama_stack_server.md Normal file
@ -0,0 +1,32 @@
+# Starting a Llama Stack Server
+
+You can run a Llama Stack server in one of the following ways:
+
+**As a Library**:
+
+This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and relying on an external inference service (eg. fireworks, together, groq, etc.) See [Using Llama Stack as a Library](importing_as_library)
+
+**Container**:
+
+Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
+
+**Conda**:
+
+If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
+
+**Kubernetes**:
+
+If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally. See [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
+
+```{toctree}
+:maxdepth: 1
+:hidden:
+
+importing_as_library
+configuration
+kubernetes_deployment
+```
@ -1,10 +1,11 @@
 # Quick Start

-In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple RAG agent.
+In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to build a simple [RAG (Retrieval Augmented Generation)](../building_applications/rag.md) agent.

 A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with tools (e.g., RAG, web search, code execution, etc.) for taking actions.

 In Llama Stack, we provide a server exposing multiple APIs. These APIs are backed by implementations from different providers. For this guide, we will use [Ollama](https://ollama.com/) as the inference provider.
+Ollama is an LLM runtime that allows you to run Llama models locally.

 ### 1. Start Ollama
@ -24,7 +25,7 @@ If you do not have ollama, you can install it from [here](https://ollama.com/dow

 ### 2. Pick a client environment

-Llama Stack has a service-oriented architecture, so every interaction with the Stack happens through an REST interface. You can interact with the Stack in two ways:
+Llama Stack has a service-oriented architecture, so every interaction with the Stack happens through a REST interface. You can interact with the Stack in two ways:

 * Install the `llama-stack-client` PyPI package and point `LlamaStackClient` to a local or remote Llama Stack server.
 * Or, install the `llama-stack` PyPI package and use the Stack as a library using `LlamaStackAsLibraryClient`.
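For the first option above, here is a minimal client sketch. The port and model id are assumptions that depend on the distribution you started (the Ollama template serves on 8321 with `meta-llama/Llama-3.2-3B-Instruct` in this guide), so treat it as a sketch rather than a canonical snippet.

```python
# Minimal sketch: talking to a running Llama Stack server over HTTP.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# List the models the server knows about.
for model in client.models.list():
    print(model.identifier)

# Ask for a single chat completion.
response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Write a haiku about coding."}],
)
print(response.completion_message.content)
```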
@ -6,6 +6,7 @@ Llama Stack {{ llama_stack_version }} is now available! See the {{ llama_stack_v

 # Llama Stack

+## What is Llama Stack?

 Llama Stack defines and standardizes the core building blocks needed to bring generative AI applications to market. It provides a unified set of APIs with implementations from leading service providers, enabling seamless transitions between development and production environments. More specifically, it provides

@ -22,6 +23,12 @@ Llama Stack defines and standardizes the core building blocks needed to bring ge

 Our goal is to provide pre-packaged implementations (aka "distributions") which can be run in a variety of deployment environments. LlamaStack can assist you in your entire app development lifecycle - start iterating on local, mobile or desktop and seamlessly transition to on-prem or public cloud deployments. At every point in this transition, the same set of APIs and the same developer experience is available.

+## How does Llama Stack work?
+Llama Stack consists of a [server](./distributions/index.md) (with multiple pluggable API [providers](./providers/index.md)) and [client SDKs](#available-sdks) meant to
+be used in your applications. The server can be run in a variety of environments, including local (inline)
+development, on-premises, and cloud. The client SDKs are available for Python, Swift, Node, and
+Kotlin.

 ## Quick Links

 - New to Llama Stack? Start with the [Introduction](introduction/index) to understand our motivation and vision.
@ -93,7 +100,6 @@ getting_started/index
 concepts/index
 providers/index
 distributions/index
-distributions/selection
 building_applications/index
 playground/index
 contributing/index
@ -92,8 +92,6 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie

 ## Starting the Llama Stack Playground

-### Llama CLI

 To start the Llama Stack Playground, run the following commands:

 1. Start up the Llama Stack API server
@ -109,29 +107,3 @@ cd llama_stack/distribution/ui
 pip install -r requirements.txt
 streamlit run app.py
 ```

-### Docker
-
-Playground can also be started in a docker image:
-
-```sh
-export LLAMA_STACK_URL=http://localhost:11434
-
-docker run \
---pull always \
--p 8501:8501 \
--e LLAMA_STACK_ENDPOINT=$LLAMA_STACK_URL \
-quay.io/jland/llama-stack-playground
-```
-
-## Configurable Environment Variables
-
-## Environment Variables
-
-| Environment Variable | Description | Default Value |
-|----------------------------|------------------------------------|---------------------------|
-| LLAMA_STACK_ENDPOINT | The endpoint for the Llama Stack | http://localhost:8321 |
-| FIREWORKS_API_KEY | API key for Fireworks provider | (empty string) |
-| TOGETHER_API_KEY | API key for Together provider | (empty string) |
-| SAMBANOVA_API_KEY | API key for SambaNova provider | (empty string) |
-| OPENAI_API_KEY | API key for OpenAI provider | (empty string) |
@ -10,11 +10,57 @@ That means you're not limited to storing vectors in memory or in a separate serv
 ## Features

 - Lightweight and easy to use
-- Fully integrated with Llama Stack
+- Fully integrated with Llama Stacks
+- Uses disk-based storage for persistence, allowing for larger vector storage

+### Comparison to Faiss

+The choice between Faiss and sqlite-vec should be made based on the needs of your application,
+as they have different strengths.

+#### Choosing the Right Provider

+Scenario | Recommended Tool | Reason
+-- |-----------------| --
+Online Analytical Processing (OLAP) | Faiss | Fast, in-memory searches
+Online Transaction Processing (OLTP) | sqlite-vec | Frequent writes and reads
+Frequent writes | sqlite-vec | Efficient disk-based storage and incremental indexing
+Large datasets | sqlite-vec | Disk-based storage for larger vector storage
+Datasets that can fit in memory, frequent reads | Faiss | Optimized for speed, indexing, and GPU acceleration

+#### Empirical Example

+Consider the histogram below in which 10,000 randomly generated strings were inserted
+in batches of 100 into both Faiss and sqlite-vec using `client.tool_runtime.rag_tool.insert()`.

+```{image} ../../../../_static/providers/vector_io/write_time_comparison_sqlite-vec-faiss.png
+:alt: Comparison of SQLite-Vec and Faiss write times
+:width: 400px
+```

+You will notice that the average write time for `sqlite-vec` was 788ms, compared to
+47,640ms for Faiss. While the number is jarring, if you look at the distribution, you can see that it is rather
+uniformly spread across the [1500, 100000] interval.

+Looking at each individual write in the order that the documents are inserted you'll see the increase in
+write speed as Faiss reindexes the vectors after each write.
+```{image} ../../../../_static/providers/vector_io/write_time_sequence_sqlite-vec-faiss.png
+:alt: Comparison of SQLite-Vec and Faiss write times
+:width: 400px
+```

+In comparison, the read times for Faiss was on average 10% faster than sqlite-vec.
+The modes of the two distributions highlight the differences much further where Faiss
+will likely yield faster read performance.

+```{image} ../../../../_static/providers/vector_io/read_time_comparison_sqlite-vec-faiss.png
+:alt: Comparison of SQLite-Vec and Faiss read times
+:width: 400px
+```

 ## Usage

-To use SQLite-Vec in your Llama Stack project, follow these steps:
+To use sqlite-vec in your Llama Stack project, follow these steps:

 1. Install the necessary dependencies.
 2. Configure your Llama Stack project to use SQLite-Vec.
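For readers who want to reproduce this kind of measurement, here is a rough timing-harness sketch. The `insert_batch` callable is a placeholder for whatever ingestion call is being measured (for example a wrapper around `client.tool_runtime.rag_tool.insert()` pointed at a faiss or sqlite-vec vector DB), so this is an assumption-laden sketch rather than the exact benchmark behind the plots above.

```python
# Minimal per-batch write-timing harness (sketch).
import random
import statistics
import string
import time
from typing import Callable, Iterable, List


def time_batches(insert_batch: Callable[[List[str]], None], batches: Iterable[List[str]]) -> List[float]:
    """Return per-batch wall-clock write times in milliseconds."""
    timings_ms = []
    for batch in batches:
        start = time.perf_counter()
        insert_batch(batch)
        timings_ms.append((time.perf_counter() - start) * 1000)
    return timings_ms


# Example: 10,000 random strings inserted in batches of 100.
docs = ["".join(random.choices(string.ascii_lowercase, k=64)) for _ in range(10_000)]
batches = [docs[i : i + 100] for i in range(0, len(docs), 100)]
timings = time_batches(lambda batch: None, batches)  # replace the no-op lambda with a real insert call
print(f"mean={statistics.mean(timings):.1f}ms  median={statistics.median(timings):.1f}ms")
```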
@ -15,6 +15,7 @@ class JobStatus(Enum):
     in_progress = "in_progress"
     failed = "failed"
     scheduled = "scheduled"
+    cancelled = "cancelled"


 @json_schema_type
23 llama_stack/apis/common/responses.py Normal file
@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict, List
+
+from pydantic import BaseModel
+
+from llama_stack.schema_utils import json_schema_type
+
+
+@json_schema_type
+class PaginatedResponse(BaseModel):
+    """A generic paginated response that follows a simple format.
+
+    :param data: The list of items for the current page
+    :param has_more: Whether there are more items available after this set
+    """
+
+    data: List[Dict[str, Any]]
+    has_more: bool
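A minimal usage sketch of the new `PaginatedResponse` model added above; the example rows are made up.

```python
# Sketch: constructing and serializing the new PaginatedResponse.
from llama_stack.apis.common.responses import PaginatedResponse

page = PaginatedResponse(
    data=[{"id": 1, "text": "first row"}, {"id": 2, "text": "second row"}],
    has_more=True,  # the caller should request another page
)
print(page.model_dump_json())
```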
@ -6,23 +6,9 @@

 from typing import Any, Dict, List, Optional, Protocol, runtime_checkable

-from pydantic import BaseModel
+from llama_stack.apis.common.responses import PaginatedResponse

 from llama_stack.apis.datasets import Dataset
-from llama_stack.schema_utils import json_schema_type, webmethod
+from llama_stack.schema_utils import webmethod


-@json_schema_type
-class IterrowsResponse(BaseModel):
-    """
-    A paginated list of rows from a dataset.
-
-    :param data: The rows in the current page.
-    :param next_start_index: Index into dataset for the first row in the next page. None if there are no more rows.
-    """
-
-    data: List[Dict[str, Any]]
-    next_start_index: Optional[int] = None


 class DatasetStore(Protocol):
@ -34,15 +20,22 @@ class DatasetIO(Protocol):
     # keeping for aligning with inference/safety, but this is not used
     dataset_store: DatasetStore

-    # TODO(xiyan): there's a flakiness here where setting route to "/datasets/" here will not result in proper routing
     @webmethod(route="/datasetio/iterrows/{dataset_id:path}", method="GET")
     async def iterrows(
         self,
         dataset_id: str,
         start_index: Optional[int] = None,
         limit: Optional[int] = None,
-    ) -> IterrowsResponse:
-        """Get a paginated list of rows from a dataset. Uses cursor-based pagination.
+    ) -> PaginatedResponse:
+        """Get a paginated list of rows from a dataset.
+
+        Uses offset-based pagination where:
+        - start_index: The starting index (0-based). If None, starts from beginning.
+        - limit: Number of items to return. If None or -1, returns all items.
+
+        The response includes:
+        - data: List of items for the current page
+        - has_more: Whether there are more items available after this set

         :param dataset_id: The ID of the dataset to get the rows from.
         :param start_index: Index into dataset for the first row to get. Get all rows if None.
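A sketch of how a caller can drain a dataset through the offset-based pagination described in the docstring above. The `datasetio` argument stands for any `DatasetIO` implementation and the page size is an assumption; the loop over `start_index`, `limit`, and `has_more` is the point.

```python
# Sketch: paging through a dataset with start_index/limit until has_more is False.
from typing import Any, Dict, List

from llama_stack.apis.datasetio import DatasetIO


async def fetch_all_rows(datasetio: DatasetIO, dataset_id: str, page_size: int = 100) -> List[Dict[str, Any]]:
    rows: List[Dict[str, Any]] = []
    start_index = 0
    while True:
        page = await datasetio.iterrows(dataset_id, start_index=start_index, limit=page_size)
        rows.extend(page.data)
        if not page.has_more:
            break
        start_index += len(page.data)  # advance the offset by the number of rows received
    return rows
```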
@ -34,6 +34,7 @@ class Api(Enum):
     scoring_functions = "scoring_functions"
     benchmarks = "benchmarks"
     tool_groups = "tool_groups"
+    files = "files"

     # built-in API
     inspect = "inspect"
@ -164,7 +164,7 @@ class Files(Protocol):
         self,
         bucket: str,
         key: str,
-    ) -> FileResponse:
+    ) -> None:
         """
         Delete a file identified by a bucket and key.
@ -88,6 +88,10 @@ class ListToolsResponse(BaseModel):
     data: List[Tool]


+class ListToolDefsResponse(BaseModel):
+    data: list[ToolDef]


 @runtime_checkable
 @trace_protocol
 class ToolGroups(Protocol):
@ -148,7 +152,7 @@ class ToolRuntime(Protocol):
     @webmethod(route="/tool-runtime/list-tools", method="GET")
     async def list_runtime_tools(
         self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
-    ) -> List[ToolDef]: ...
+    ) -> ListToolDefsResponse: ...

     @webmethod(route="/tool-runtime/invoke", method="POST")
     async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
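This wrapper type is exactly the shape the new GET validators expect from list endpoints: a class whose name starts with `List` and which carries a mandatory `data` field, rather than a bare Python list. A minimal sketch of the pattern, with a hypothetical `Item` payload model:

```python
# Sketch of the list-response pattern checked by the GET validators.
from typing import List

from pydantic import BaseModel


class Item(BaseModel):  # hypothetical payload type for illustration
    name: str


class ListItemsResponse(BaseModel):
    data: List[Item]  # mandatory `data` attribute holding the list of objects


async def list_items() -> ListItemsResponse:  # return the wrapper, never a bare list
    return ListItemsResponse(data=[Item(name="example")])
```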
@ -21,6 +21,7 @@ from prompt_toolkit.completion import WordCompleter
 from prompt_toolkit.validation import Validator
 from termcolor import cprint

+from llama_stack.cli.stack.utils import ImageType
 from llama_stack.cli.table import print_table
 from llama_stack.distribution.build import (
     SERVER_DEPENDENCIES,
@ -62,10 +63,10 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
     if args.list_templates:
         return _run_template_list_cmd()

-    if args.image_type == "venv":
+    if args.image_type == ImageType.VENV.value:
         current_venv = os.environ.get("VIRTUAL_ENV")
         image_name = args.image_name or current_venv
-    elif args.image_type == "conda":
+    elif args.image_type == ImageType.CONDA.value:
         current_conda_env = os.environ.get("CONDA_DEFAULT_ENV")
         image_name = args.image_name or current_conda_env
     else:
@ -84,7 +85,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
         build_config.image_type = args.image_type
     else:
         cprint(
-            f"Please specify a image-type (container | conda | venv) for {args.template}",
+            f"Please specify a image-type ({' | '.join(e.value for e in ImageType)}) for {args.template}",
             color="red",
         )
         sys.exit(1)
@ -98,15 +99,15 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
         )

         image_type = prompt(
-            "> Enter the image type you want your Llama Stack to be built as (container or conda or venv): ",
+            f"> Enter the image type you want your Llama Stack to be built as ({' or '.join(e.value for e in ImageType)}): ",
             validator=Validator.from_callable(
-                lambda x: x in ["container", "conda", "venv"],
-                error_message="Invalid image type, please enter conda or container or venv",
+                lambda x: x in [e.value for e in ImageType],
+                error_message=f"Invalid image type, please enter {' or '.join(e.value for e in ImageType)}",
             ),
-            default="conda",
+            default=ImageType.CONDA.value,
         )

-        if image_type == "conda":
+        if image_type == ImageType.CONDA.value:
             if not image_name:
                 cprint(
                     f"No current conda environment detected or specified, will create a new conda environment with the name `llamastack-{name}`",
@ -136,6 +137,8 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
     providers = dict()
     for api, providers_for_api in get_provider_registry().items():
         available_providers = [x for x in providers_for_api.keys() if x not in ("remote", "remote::sample")]
+        if not available_providers:
+            continue
         api_provider = prompt(
             "> Enter provider for API {}: ".format(api.value),
             completer=WordCompleter(available_providers),
@ -6,6 +6,7 @@
 import argparse
 import textwrap

+from llama_stack.cli.stack.utils import ImageType
 from llama_stack.cli.subcommand import Subcommand


@ -46,16 +47,16 @@ class StackBuild(Subcommand):
         self.parser.add_argument(
             "--image-type",
             type=str,
-            help="Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config.",
-            choices=["conda", "container", "venv"],
-            default="conda",
+            help="Image Type to use for the build. If not specified, will use the image type from the template config.",
+            choices=[e.value for e in ImageType],
+            default=ImageType.CONDA.value,
         )

         self.parser.add_argument(
             "--image-name",
             type=str,
             help=textwrap.dedent(
-                """[for image-type=conda|venv] Name of the conda or virtual environment to use for
+                f"""[for image-type={"|".join(e.value for e in ImageType)}] Name of the conda or virtual environment to use for
                 the build. If not specified, currently active Conda environment will be used if found.
                 """
             ),
@ -8,6 +8,7 @@ import argparse
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from llama_stack.cli.stack.utils import ImageType
|
||||||
from llama_stack.cli.subcommand import Subcommand
|
from llama_stack.cli.subcommand import Subcommand
|
||||||
from llama_stack.log import get_logger
|
from llama_stack.log import get_logger
|
||||||
|
|
||||||
|
@ -56,7 +57,6 @@ class StackRun(Subcommand):
|
||||||
"--env",
|
"--env",
|
||||||
action="append",
|
action="append",
|
||||||
help="Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.",
|
help="Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.",
|
||||||
default=[],
|
|
||||||
metavar="KEY=VALUE",
|
metavar="KEY=VALUE",
|
||||||
)
|
)
|
||||||
self.parser.add_argument(
|
self.parser.add_argument(
|
||||||
|
@ -73,10 +73,24 @@ class StackRun(Subcommand):
|
||||||
"--image-type",
|
"--image-type",
|
||||||
type=str,
|
type=str,
|
||||||
help="Image Type used during the build. This can be either conda or container or venv.",
|
help="Image Type used during the build. This can be either conda or container or venv.",
|
||||||
choices=["conda", "container", "venv"],
|
choices=[e.value for e in ImageType],
|
||||||
default="conda",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# If neither image type nor image name is provided, but at the same time
|
||||||
|
# the current environment has conda breadcrumbs, then assume what the user
|
||||||
|
# wants to use conda mode and not the usual default mode (using
|
||||||
|
# pre-installed system packages).
|
||||||
|
#
|
||||||
|
# Note: yes, this is hacky. It's implemented this way to keep the existing
|
||||||
|
# conda users unaffected by the switch of the default behavior to using
|
||||||
|
# system packages.
|
||||||
|
def _get_image_type_and_name(self, args: argparse.Namespace) -> tuple[str, str]:
|
||||||
|
conda_env = os.environ.get("CONDA_DEFAULT_ENV")
|
||||||
|
if conda_env and args.image_name == conda_env:
|
||||||
|
logger.warning(f"Conda detected. Using conda environment {conda_env} for the run.")
|
||||||
|
return ImageType.CONDA.value, args.image_name
|
||||||
|
return args.image_type, args.image_name
|
||||||
|
|
||||||
def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
|
def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
|
@ -120,20 +134,44 @@ class StackRun(Subcommand):
        except AttributeError as e:
            self.parser.error(f"failed to parse config file '{config_file}':\n {e}")

-       run_args = formulate_run_args(args.image_type, args.image_name, config, template_name)
+       image_type, image_name = self._get_image_type_and_name(args)

-       run_args.extend([str(config_file), str(args.port)])
-       if args.disable_ipv6:
-           run_args.append("--disable-ipv6")
-
-       for env_var in args.env:
-           if "=" not in env_var:
-               self.parser.error(f"Environment variable '{env_var}' must be in KEY=VALUE format")
-           key, value = env_var.split("=", 1)  # split on first = only
-           if not key:
-               self.parser.error(f"Environment variable '{env_var}' has empty key")
-           run_args.extend(["--env", f"{key}={value}"])
-
-       if args.tls_keyfile and args.tls_certfile:
-           run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile])
-       run_command(run_args)
+       # If neither image type nor image name is provided, assume the server should be run directly
+       # using the current environment packages.
+       if not image_type and not image_name:
+           logger.info("No image type or image name provided. Assuming environment packages.")
+           from llama_stack.distribution.server.server import main as server_main
+
+           # Build the server args from the current args passed to the CLI
+           server_args = argparse.Namespace()
+           for arg in vars(args):
+               # If this is a function, avoid passing it
+               # "args" contains:
+               # func=<bound method StackRun._run_stack_run_cmd of <llama_stack.cli.stack.run.StackRun object at 0x10484b010>>
+               if callable(getattr(args, arg)):
+                   continue
+               setattr(server_args, arg, getattr(args, arg))
+
+           # Run the server
+           server_main(server_args)
+       else:
+           run_args = formulate_run_args(image_type, image_name, config, template_name)
+
+           run_args.extend([str(config_file), str(args.port)])
+           if args.disable_ipv6:
+               run_args.append("--disable-ipv6")
+
+           if args.env:
+               for env_var in args.env:
+                   if "=" not in env_var:
+                       self.parser.error(f"Environment variable '{env_var}' must be in KEY=VALUE format")
+                       return
+                   key, value = env_var.split("=", 1)  # split on first = only
+                   if not key:
+                       self.parser.error(f"Environment variable '{env_var}' has empty key")
+                       return
+                   run_args.extend(["--env", f"{key}={value}"])
+
+           if args.tls_keyfile and args.tls_certfile:
+               run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile])
+           run_command(run_args)
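Two small patterns in the new branch are worth calling out: copying an argparse.Namespace into a fresh one while skipping callables (the bound `func` handler that argparse attaches for the subcommand), and splitting `KEY=VALUE` pairs only on the first `=` so values may themselves contain `=`. A self-contained sketch of both, with made-up values (not the commit's code):

# Illustrative sketch of the two patterns used above.
import argparse

args = argparse.Namespace(port=8321, env=["FOO=bar", "BAZ=qux=1"], func=print)

# Copy parsed args into a new Namespace, skipping callables such as the bound handler.
server_args = argparse.Namespace()
for arg in vars(args):
    if callable(getattr(args, arg)):
        continue
    setattr(server_args, arg, getattr(args, arg))

# Parse KEY=VALUE pairs, splitting only on the first "=".
run_args = []
for env_var in args.env:
    if "=" not in env_var:
        raise ValueError(f"Environment variable '{env_var}' must be in KEY=VALUE format")
    key, value = env_var.split("=", 1)
    if not key:
        raise ValueError(f"Environment variable '{env_var}' has empty key")
    run_args.extend(["--env", f"{key}={value}"])

print(vars(server_args), run_args)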
@ -4,6 +4,14 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

+from enum import Enum
+
+
+class ImageType(Enum):
+    CONDA = "conda"
+    CONTAINER = "container"
+    VENV = "venv"
+
+
def print_subcommand_description(parser, subparsers):
    """Print descriptions of subcommands."""
@ -328,8 +328,9 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):

        body = self._convert_body(path, options.method, body)

+       await start_trace(route, {"__location__": "library_client"})
+
        async def gen():
-           await start_trace(route, {"__location__": "library_client"})
            try:
                async for chunk in await func(**body):
                    data = json.dumps(convert_pydantic_to_json_value(chunk))
@ -12,6 +12,7 @@ from llama_stack.apis.benchmarks import Benchmarks
from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Datasets
from llama_stack.apis.eval import Eval
+from llama_stack.apis.files import Files
from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect
from llama_stack.apis.models import Models
@ -79,6 +80,7 @@ def api_protocol_map() -> Dict[Api, Any]:
        Api.post_training: PostTraining,
        Api.tool_groups: ToolGroups,
        Api.tool_runtime: ToolRuntime,
+       Api.files: Files,
    }
@ -12,7 +12,8 @@ from llama_stack.apis.common.content_types import (
    InterleavedContent,
    InterleavedContentItem,
)
-from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse
+from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import DatasetPurpose, DataSource
from llama_stack.apis.eval import BenchmarkConfig, Eval, EvaluateResponse, Job
from llama_stack.apis.inference import (
@ -45,11 +46,11 @@ from llama_stack.apis.scoring import (
from llama_stack.apis.shields import Shield
from llama_stack.apis.telemetry import MetricEvent, MetricInResponse, Telemetry
from llama_stack.apis.tools import (
+   ListToolDefsResponse,
    RAGDocument,
    RAGQueryConfig,
    RAGQueryResult,
    RAGToolRuntime,
-   ToolDef,
    ToolRuntime,
)
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
@ -497,7 +498,7 @@ class DatasetIORouter(DatasetIO):
        dataset_id: str,
        start_index: Optional[int] = None,
        limit: Optional[int] = None,
-   ) -> IterrowsResponse:
+   ) -> PaginatedResponse:
        logger.debug(
            f"DatasetIORouter.iterrows: {dataset_id}, {start_index=} {limit=}",
        )
@ -706,6 +707,6 @@ class ToolRuntimeRouter(ToolRuntime):

    async def list_runtime_tools(
        self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
-   ) -> List[ToolDef]:
+   ) -> ListToolDefsResponse:
        logger.debug(f"ToolRuntimeRouter.list_runtime_tools: {tool_group_id}")
        return await self.routing_table.get_provider_impl(tool_group_id).list_tools(tool_group_id, mcp_endpoint)
@ -568,7 +568,7 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups):
        tool_defs = await self.impls_by_provider_id[provider_id].list_runtime_tools(toolgroup_id, mcp_endpoint)
        tool_host = ToolHost.model_context_protocol if mcp_endpoint else ToolHost.distribution

-       for tool_def in tool_defs:
+       for tool_def in tool_defs.data:
            tools.append(
                ToolWithACL(
                    identifier=tool_def.name,
@ -15,7 +15,7 @@ import warnings
from contextlib import asynccontextmanager
from importlib.metadata import version as parse_version
from pathlib import Path
-from typing import Any, List, Union
+from typing import Any, List, Optional, Union

import yaml
from fastapi import Body, FastAPI, HTTPException, Request
@ -294,11 +294,17 @@ class ClientVersionMiddleware:
        return await self.app(scope, receive, send)


-def main():
+def main(args: Optional[argparse.Namespace] = None):
    """Start the LlamaStack server."""
    parser = argparse.ArgumentParser(description="Start the LlamaStack server.")
    parser.add_argument(
        "--yaml-config",
+       dest="config",
+       help="(Deprecated) Path to YAML configuration file - use --config instead",
+   )
+   parser.add_argument(
+       "--config",
+       dest="config",
        help="Path to YAML configuration file",
    )
    parser.add_argument(
@ -328,12 +334,24 @@ def main():
        required="--tls-keyfile" in sys.argv,
    )

-   args = parser.parse_args()
+   # Determine whether the server args are being passed by the "run" command, if this is the case
+   # the args will be passed as a Namespace object to the main function, otherwise they will be
+   # parsed from the command line
+   if args is None:
+       args = parser.parse_args()
+
+   # Check for deprecated argument usage
+   if "--yaml-config" in sys.argv:
+       warnings.warn(
+           "The '--yaml-config' argument is deprecated and will be removed in a future version. Use '--config' instead.",
+           DeprecationWarning,
+           stacklevel=2,
+       )

    log_line = ""
-   if args.yaml_config:
+   if args.config:
        # if the user provided a config file, use it, even if template was specified
-       config_file = Path(args.yaml_config)
+       config_file = Path(args.config)
        if not config_file.exists():
            raise ValueError(f"Config file {config_file} does not exist")
        log_line = f"Using config file: {config_file}"
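The `--yaml-config`/`--config` change relies on two argparse options sharing one `dest`, so downstream code only ever reads `args.config` regardless of which spelling was used. A standalone sketch of that aliasing plus the deprecation warning (the flag names come from the hunk above; the argv values are illustrative):

# Illustrative sketch: two flags writing to the same dest, one of them deprecated.
import argparse
import warnings

parser = argparse.ArgumentParser()
parser.add_argument("--yaml-config", dest="config", help="(Deprecated) use --config instead")
parser.add_argument("--config", dest="config", help="Path to YAML configuration file")

argv = ["--yaml-config", "run.yaml"]
args = parser.parse_args(argv)

# Both spellings land on args.config; only the deprecated one triggers a warning.
if "--yaml-config" in argv:
    warnings.warn("--yaml-config is deprecated, use --config", DeprecationWarning, stacklevel=2)

print(args.config)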
@ -13,6 +13,7 @@ LLAMA_CHECKPOINT_DIR=${LLAMA_CHECKPOINT_DIR:-}
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
PYPI_VERSION=${PYPI_VERSION:-}
+VIRTUAL_ENV=${VIRTUAL_ENV:-}

set -euo pipefail

@ -69,22 +70,25 @@ while [[ $# -gt 0 ]]; do
      ;;
  esac
done

PYTHON_BINARY="python"
case "$env_type" in
  "venv")
-   # Activate virtual environment
-   if [ ! -d "$env_path_or_name" ]; then
-     echo -e "${RED}Error: Virtual environment not found at $env_path_or_name${NC}" >&2
-     exit 1
-   fi
+   if [ -n "$VIRTUAL_ENV" && "$VIRTUAL_ENV" == "$env_path_or_name" ]; then
+     echo -e "${GREEN}Virtual environment already activated${NC}" >&2
+   else
+     # Activate virtual environment
+     if [ ! -d "$env_path_or_name" ]; then
+       echo -e "${RED}Error: Virtual environment not found at $env_path_or_name${NC}" >&2
+       exit 1
+     fi

      if [ ! -f "$env_path_or_name/bin/activate" ]; then
        echo -e "${RED}Error: Virtual environment activate binary not found at $env_path_or_name/bin/activate" >&2
        exit 1
      fi

      source "$env_path_or_name/bin/activate"
+   fi
    ;;
  "conda")
    if ! is_command_available conda; then
@ -58,6 +58,7 @@ def rag_chat_page():
        llama_stack_api.client.tool_runtime.rag_tool.insert(
            vector_db_id=vector_db_name,  # Use the user-provided name
            documents=documents,
+           chunk_size_in_tokens=512,
        )
        st.success("Vector database created successfully!")
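The added `chunk_size_in_tokens=512` tells the RAG tool how large each chunk should be when the documents are split for insertion into the vector database. As a rough illustration of token-based chunking only (the provider's real tokenizer and splitting logic differ; the whitespace tokenizer here is a stand-in):

# Illustrative sketch of splitting text into fixed-size token chunks.
from typing import List


def chunk_by_tokens(text: str, chunk_size_in_tokens: int = 512) -> List[str]:
    tokens = text.split()  # stand-in tokenizer
    return [
        " ".join(tokens[i : i + chunk_size_in_tokens])
        for i in range(0, len(tokens), chunk_size_in_tokens)
    ]


print(chunk_by_tokens("one two three four five", chunk_size_in_tokens=2))
# ['one two', 'three four', 'five']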
@ -18,15 +18,19 @@ def preserve_contexts_async_generator(
    This is needed because we start a new asyncio event loop for each streaming request,
    and we need to preserve the context across the event loop boundary.
    """
+   # Capture initial context values
+   initial_context_values = {context_var.name: context_var.get() for context_var in context_vars}
+
    async def wrapper() -> AsyncGenerator[T, None]:
        while True:
            try:
-               item = await gen.__anext__()
-               context_values = {context_var.name: context_var.get() for context_var in context_vars}
-               yield item
+               # Restore context values before any await
                for context_var in context_vars:
-                   _ = context_var.set(context_values[context_var.name])
+                   context_var.set(initial_context_values[context_var.name])
+
+               item = await gen.__anext__()
+               yield item
+
            except StopAsyncIteration:
                break
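The rewritten wrapper captures the context variables once, at wrap time, and restores them before every resume of the inner generator, so the values survive the event-loop boundary instead of being re-read after each yield. A simplified, self-contained sketch of the same idea (names like `preserve_contexts` and `request_id` are illustrative, not the project's API):

# Illustrative sketch: capture contextvar values up front, restore them before each resume.
import asyncio
from contextvars import ContextVar
from typing import AsyncGenerator, List, TypeVar

T = TypeVar("T")
request_id = ContextVar("request_id")


def preserve_contexts(gen: AsyncGenerator[T, None], context_vars: List[ContextVar]) -> AsyncGenerator[T, None]:
    # Capture the values once, before any event-loop boundary is crossed.
    initial = {var.name: var.get() for var in context_vars}

    async def wrapper() -> AsyncGenerator[T, None]:
        while True:
            try:
                for var in context_vars:
                    var.set(initial[var.name])
                item = await gen.__anext__()
                yield item
            except StopAsyncIteration:
                break

    return wrapper()


async def numbers() -> AsyncGenerator[str, None]:
    for i in range(2):
        yield f"{request_id.get()}-{i}"


async def main() -> None:
    request_id.set("req-42")
    async for item in preserve_contexts(numbers(), [request_id]):
        print(item)


asyncio.run(main())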
@ -139,7 +139,7 @@ def setup_logging(category_levels: Dict[str, int], log_file: str | None) -> None
        category_levels (Dict[str, int]): A dictionary mapping categories to their log levels.
        log_file (str): Path to a log file to additionally pipe the logs into
    """
-   log_format = "[dim]%(asctime)s %(name)s:%(lineno)d[/] [yellow dim]%(category)s[/]: %(message)s"
+   log_format = "%(asctime)s %(name)s:%(lineno)d %(category)s: %(message)s"

    class CategoryFilter(logging.Filter):
        """Ensure category is always present in log records."""
@ -195,10 +195,22 @@ register_schema(SamplingStrategy, name="SamplingStrategy")

@json_schema_type
class SamplingParams(BaseModel):
+   """Sampling parameters.
+
+   :param strategy: The sampling strategy.
+   :param max_tokens: The maximum number of tokens that can be generated in the completion. The token count of
+       your prompt plus max_tokens cannot exceed the model's context length.
+   :param repetition_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens
+       based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
+   :param stop: Up to 4 sequences where the API will stop generating further tokens.
+       The returned text will not contain the stop sequence.
+   """
+
    strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy)

    max_tokens: Optional[int] = 0
    repetition_penalty: Optional[float] = 1.0
+   stop: Optional[List[str]] = None


class CheckpointQuantizationFormat(Enum):
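A short usage sketch of the documented fields, using a simplified stand-in for the real class (the actual `SamplingParams` lives in llama_stack and its `strategy` field is a tagged union of sampling strategies, not the single class shown here):

# Simplified, illustrative model mirroring the fields documented above.
from typing import List, Optional

from pydantic import BaseModel, Field


class GreedySamplingStrategy(BaseModel):
    type: str = "greedy"


class SamplingParams(BaseModel):
    strategy: GreedySamplingStrategy = Field(default_factory=GreedySamplingStrategy)
    max_tokens: Optional[int] = 0
    repetition_penalty: Optional[float] = 1.0
    stop: Optional[List[str]] = None


# Generation stops at the first matching stop sequence; the sequence itself is not returned.
params = SamplingParams(max_tokens=128, stop=["Observation:", "\n\n"])
print(params)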
@ -57,11 +57,7 @@ from llama_stack.apis.inference import (
    UserMessage,
)
from llama_stack.apis.safety import Safety
-from llama_stack.apis.tools import (
-    ToolGroups,
-    ToolInvocationResult,
-    ToolRuntime,
-)
+from llama_stack.apis.tools import ToolGroups, ToolInvocationResult, ToolRuntime
from llama_stack.apis.vector_io import VectorIO
from llama_stack.log import get_logger
from llama_stack.models.llama.datatypes import (
@ -459,7 +455,15 @@ class ChatAgent(ShieldRunnerMixin):
                contexts.append(raw_document_text)

            attached_context = "\n".join(contexts)
-           input_messages[-1].context = attached_context
+           if isinstance(input_messages[-1].content, str):
+               input_messages[-1].content += attached_context
+           elif isinstance(input_messages[-1].content, list):
+               input_messages[-1].content.append(TextContentItem(text=attached_context))
+           else:
+               input_messages[-1].content = [
+                   input_messages[-1].content,
+                   TextContentItem(text=attached_context),
+               ]

        session_info = await self.storage.get_session_info(session_id)
        # if the session has a memory bank id, let the memory tool use it
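The new branch appends the attached document context to the message content directly, handling the three shapes that content can take (plain string, list of items, or a single non-string item) instead of overwriting a separate `context` attribute. A compact, illustrative sketch of that normalization, with a stand-in for the real `TextContentItem` type:

# Illustrative sketch of the content normalization above.
from dataclasses import dataclass
from typing import List, Union


@dataclass
class TextContentItem:
    text: str


Content = Union[str, TextContentItem, List[Union[str, TextContentItem]]]


def attach_context(content: Content, attached_context: str) -> Content:
    if isinstance(content, str):
        return content + attached_context
    if isinstance(content, list):
        return content + [TextContentItem(text=attached_context)]
    # single non-string item: promote it to a list so nothing is overwritten
    return [content, TextContentItem(text=attached_context)]


print(attach_context("What does the report say?\n", "Context: ..."))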
@ -7,9 +7,11 @@ from typing import Any, Dict, List, Optional

import pandas

-from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse
+from llama_stack.apis.common.responses import PaginatedResponse
+from llama_stack.apis.datasetio import DatasetIO
from llama_stack.apis.datasets import Dataset
from llama_stack.providers.datatypes import DatasetsProtocolPrivate
+from llama_stack.providers.utils.datasetio.pagination import paginate_records
from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_uri
from llama_stack.providers.utils.kvstore import kvstore_impl

@ -92,24 +94,13 @@ class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
        dataset_id: str,
        start_index: Optional[int] = None,
        limit: Optional[int] = None,
-   ) -> IterrowsResponse:
+   ) -> PaginatedResponse:
        dataset_def = self.dataset_infos[dataset_id]
        dataset_impl = PandasDataframeDataset(dataset_def)
        await dataset_impl.load()

-       start_index = start_index or 0
-
-       if limit is None or limit == -1:
-           end = len(dataset_impl)
-       else:
-           end = min(start_index + limit, len(dataset_impl))
-
-       rows = dataset_impl[start_index:end]
-
-       return IterrowsResponse(
-           data=rows,
-           next_start_index=end if end < len(dataset_impl) else None,
-       )
+       records = dataset_impl.df.to_dict("records")
+       return paginate_records(records, start_index, limit)

    async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
        dataset_def = self.dataset_infos[dataset_id]
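The inline slicing logic is being replaced by a shared `paginate_records` helper from `llama_stack.providers.utils.datasetio.pagination`. The sketch below only reproduces the semantics visible in the removed code (a limit of None or -1 means "to the end", and `next_start_index` is None once the records are exhausted), returning a plain dict instead of the real `PaginatedResponse`:

# Illustrative sketch of the pagination semantics being factored out above.
from typing import Any, Dict, List, Optional


def paginate_records(
    records: List[Dict[str, Any]], start_index: Optional[int] = None, limit: Optional[int] = None
) -> Dict[str, Any]:
    start = start_index or 0
    end = len(records) if limit is None or limit == -1 else min(start + limit, len(records))
    return {
        "data": records[start:end],
        "next_start_index": end if end < len(records) else None,
    }


rows = [{"id": i} for i in range(5)]
print(paginate_records(rows, start_index=0, limit=2))   # {'data': [{'id': 0}, {'id': 1}], 'next_start_index': 2}
print(paginate_records(rows, start_index=4, limit=-1))  # {'data': [{'id': 4}], 'next_start_index': None}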
@ -28,6 +28,11 @@ class TelemetryConfig(BaseModel):
        default="http://localhost:4318/v1/metrics",
        description="The OpenTelemetry collector endpoint URL for metrics",
    )
+   service_name: str = Field(
+       # service name is always the same, use zero-width space to avoid clutter
+       default="",
+       description="The service name to use for telemetry",
+   )
    sinks: List[TelemetrySink] = Field(
        default=[TelemetrySink.CONSOLE, TelemetrySink.SQLITE],
        description="List of telemetry sinks to enable (possible values: otel, sqlite, console)",
@ -47,6 +52,7 @@ class TelemetryConfig(BaseModel):
    @classmethod
    def sample_run_config(cls, __distro_dir__: str, db_name: str = "trace_store.db") -> Dict[str, Any]:
        return {
+           "service_name": "${env.OTEL_SERVICE_NAME:}",
            "sinks": "${env.TELEMETRY_SINKS:console,sqlite}",
            "sqlite_db_path": "${env.SQLITE_DB_PATH:" + __distro_dir__ + "/" + db_name + "}",
        }
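The `"${env.NAME:default}"` strings in `sample_run_config` are placeholders that llama-stack resolves from environment variables, with the text after the colon used as the fallback. A rough, illustrative resolver showing that behavior (the project's real substitution happens during config loading and is more featureful than this regex):

# Illustrative sketch of "${env.NAME:default}" placeholder resolution.
import os
import re

_PATTERN = re.compile(r"\$\{env\.([A-Z0-9_]+):([^}]*)\}")


def resolve_env_placeholders(value: str) -> str:
    return _PATTERN.sub(lambda m: os.environ.get(m.group(1), m.group(2)), value)


os.environ["TELEMETRY_SINKS"] = "console"
print(resolve_env_placeholders("${env.TELEMETRY_SINKS:console,sqlite}"))  # console
print(resolve_env_placeholders("${env.OTEL_SERVICE_NAME:}"))              # empty default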
Some files were not shown because too many files have changed in this diff.