Update more distribution docs to be simpler and partially codegen'ed

This commit is contained in:
Ashwin Bharambe 2024-11-20 14:44:04 -08:00
parent e84d4436b5
commit 2411a44833
51 changed files with 1188 additions and 291 deletions

View file

@ -1,45 +0,0 @@
version: '2'
image_name: local
name: bedrock
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: bedrock0
provider_type: remote::bedrock
config:
aws_access_key_id: <AWS_ACCESS_KEY_ID>
aws_secret_access_key: <AWS_SECRET_ACCESS_KEY>
aws_session_token: <AWS_SESSION_TOKEN>
region_name: <AWS_REGION>
memory:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
safety:
- provider_id: bedrock0
provider_type: remote::bedrock
config:
aws_access_key_id: <AWS_ACCESS_KEY_ID>
aws_secret_access_key: <AWS_SECRET_ACCESS_KEY>
aws_session_token: <AWS_SESSION_TOKEN>
region_name: <AWS_REGION>
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}

View file

@ -0,0 +1 @@
../../llama_stack/templates/bedrock/run.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/databricks/build.yaml

View file

@ -1,4 +1,32 @@
{
"hf-serverless": [
"aiohttp",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"together": [ "together": [
"aiosqlite", "aiosqlite",
"blobfile", "blobfile",
@ -26,6 +54,33 @@
"sentence-transformers --no-deps", "sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu" "torch --index-url https://download.pytorch.org/whl/cpu"
], ],
"vllm-gpu": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"vllm",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"remote-vllm": [ "remote-vllm": [
"aiosqlite", "aiosqlite",
"blobfile", "blobfile",
@ -108,6 +163,33 @@
"sentence-transformers --no-deps", "sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu" "torch --index-url https://download.pytorch.org/whl/cpu"
], ],
"bedrock": [
"aiosqlite",
"blobfile",
"boto3",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"meta-reference-gpu": [ "meta-reference-gpu": [
"accelerate", "accelerate",
"aiosqlite", "aiosqlite",
@ -167,5 +249,33 @@
"uvicorn", "uvicorn",
"sentence-transformers --no-deps", "sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu" "torch --index-url https://download.pytorch.org/whl/cpu"
],
"hf-endpoint": [
"aiohttp",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
]
}

View file

@ -1 +0,0 @@
../../llama_stack/templates/hf-endpoint/build.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/hf-serverless/build.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/ollama/build.yaml

View file

@ -1,48 +0,0 @@
services:
ollama:
image: ollama/ollama:latest
network_mode: "host"
volumes:
- ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast
ports:
- "11434:11434"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0
command: []
deploy:
resources:
reservations:
devices:
- driver: nvidia
# that's the closest analogue to --gpus; provide
# an integer amount of devices or 'all'
count: 1
# Devices are reserved using a list of capabilities, making
# capabilities the only required field. A device MUST
# satisfy all the requested capabilities for a successful
# reservation.
capabilities: [gpu]
runtime: nvidia
llamastack:
depends_on:
- ollama
image: llamastack/distribution-ollama
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
# Link to ollama run.yaml file
- ./run.yaml:/root/llamastack-run-ollama.yaml
ports:
- "5000:5000"
# Hack: wait for ollama server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-ollama.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s
volumes:
ollama:

View file

@ -1,46 +0,0 @@
version: '2'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: ollama
provider_type: remote::ollama
config:
url: ${env.OLLAMA_URL:http://127.0.0.1:11434}
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
excluded_categories: []
memory:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
models:
- model_id: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
provider_id: ollama
- model_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
provider_id: ollama
shields:
- shield_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}

View file

@ -6,59 +6,58 @@
self
```

The `llamastack/distribution-bedrock` distribution consists of the following provider configurations:

| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| inference | `remote::bedrock` |
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
| safety | `remote::bedrock` |
| telemetry | `inline::meta-reference` |

### Environment Variables

The following environment variables can be configured:

- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)

### Prerequisite: API Keys

Make sure you have access to an AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/).

## Running Llama Stack with AWS Bedrock

You can do this via Conda (build code) or Docker which has a pre-built image.

### Via Docker

This method allows you to get started quickly without having to build the distribution code.

```bash
LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-bedrock \
  --port $LLAMA_STACK_PORT \
  --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
  --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
  --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```

### Via Conda

```bash
llama stack build --template bedrock --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
  --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
  --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```
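Once the server is up via either method, a quick sanity check from Python can confirm it is reachable and that the Bedrock provider registered its models. This snippet is not part of the distribution docs in this commit; it assumes `pip install llama-stack-client` and a server listening on the port chosen above.

```python
# Hypothetical smoke test against the server started above (not part of this commit).
from llama_stack_client import LlamaStackClient

# Assumes the server was started with LLAMA_STACK_PORT=5001 as in the docs above.
client = LlamaStackClient(base_url="http://localhost:5001")

# Print whatever models the bedrock provider registered in run.yaml.
for model in client.models.list():
    print(model.identifier)
```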

View file

@ -58,9 +58,7 @@ LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-fireworks \
  --port $LLAMA_STACK_PORT \
  --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```
@ -70,6 +68,6 @@ docker run \
```bash
llama stack build --template fireworks --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```

View file

@ -54,9 +54,7 @@ LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-meta-reference-gpu \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
@ -67,9 +65,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-meta-reference-gpu \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
@ -81,7 +77,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
```bash
llama stack build --template meta-reference-gpu --image-type conda
llama stack run distributions/meta-reference-gpu/run.yaml \
  --port 5001 \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
@ -89,7 +85,7 @@ llama stack run ./run.yaml \
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
  --port 5001 \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

View file

@ -66,9 +66,7 @@ docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  llamastack/distribution-ollama \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env OLLAMA_URL=http://host.docker.internal:11434

View file

@ -85,9 +85,7 @@ LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-tgi \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT
@ -116,18 +114,18 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
```bash
llama stack build --template tgi --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run ./run-with-safety.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \
  --env SAFETY_MODEL=$SAFETY_MODEL \
  --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
```

View file

@ -57,9 +57,7 @@ LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-together \
  --port $LLAMA_STACK_PORT \
  --env TOGETHER_API_KEY=$TOGETHER_API_KEY
```
@ -69,6 +67,6 @@ docker run \
```bash
llama stack build --template together --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env TOGETHER_API_KEY=$TOGETHER_API_KEY
```

View file

@ -37,11 +37,11 @@ class VLLMConfig(BaseModel):
    @classmethod
    def sample_run_config(cls):
        return {
            "model": "${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
            "tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}",
            "max_tokens": "${env.MAX_TOKENS:4096}",
            "enforce_eager": "${env.ENFORCE_EAGER:False}",
            "gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.7}",
        }

    @field_validator("model")

View file

@ -4,11 +4,8 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.providers.utils.bedrock.config import BedrockBaseConfig


class BedrockConfig(BedrockBaseConfig):
    pass

View file

@ -37,6 +37,18 @@ class InferenceEndpointImplConfig(BaseModel):
description="Your Hugging Face user access token (will default to locally saved token if not provided)", description="Your Hugging Face user access token (will default to locally saved token if not provided)",
) )
@classmethod
def sample_run_config(
cls,
endpoint_name: str = "${env.INFERENCE_ENDPOINT_NAME}",
api_token: str = "${env.HF_API_TOKEN}",
**kwargs,
):
return {
"endpoint_name": endpoint_name,
"api_token": api_token,
}
@json_schema_type
class InferenceAPIImplConfig(BaseModel):
@ -47,3 +59,15 @@ class InferenceAPIImplConfig(BaseModel):
default=None,
description="Your Hugging Face user access token (will default to locally saved token if not provided)",
)
@classmethod
def sample_run_config(
cls,
repo: str = "${env.INFERENCE_MODEL}",
api_token: str = "${env.HF_API_TOKEN}",
**kwargs,
):
return {
"huggingface_repo": repo,
"api_token": api_token,
}
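To see why these `sample_run_config()` helpers matter for the codegen, note that the returned dicts are written more or less verbatim into the generated `run.yaml` provider entries, with the `${env.*}` placeholders left for the server to resolve at startup. The sketch below is illustrative only; the surrounding provider fields are assumptions based on the generated files later in this commit.

```python
# Illustrative only: how a sample_run_config() result might land in a generated run.yaml.
import yaml

config = {
    "huggingface_repo": "${env.INFERENCE_MODEL}",  # same placeholders as sample_run_config()
    "api_token": "${env.HF_API_TOKEN}",
}
provider_entry = {
    "provider_id": "hf-serverless",
    "provider_type": "remote::hf::serverless",
    "config": config,
}
print(yaml.safe_dump({"inference": [provider_entry]}, sort_keys=False))
```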

View file

@ -5,11 +5,9 @@
# the root directory of this source tree. # the root directory of this source tree.
from typing import Optional from typing import Optional
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
@json_schema_type
class BedrockBaseConfig(BaseModel): class BedrockBaseConfig(BaseModel):
aws_access_key_id: Optional[str] = Field( aws_access_key_id: Optional[str] = Field(
default=None, default=None,
@ -57,3 +55,7 @@ class BedrockBaseConfig(BaseModel):
default=3600, default=3600,
description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).", description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).",
) )
@classmethod
def sample_run_config(cls, **kwargs):
return {}

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .bedrock import get_distribution_template # noqa: F401

View file

@ -0,0 +1,38 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::bedrock"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["remote::bedrock"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
return DistributionTemplate(
name="bedrock",
distro_type="self_hosted",
description="Use AWS Bedrock for running LLM inference and safety",
docker_image=None,
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=[],
run_configs={
"run.yaml": RunConfigSettings(),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
},
)
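As a sketch of how a template like this is consumed (the exact codegen entry point is not shown in this diff), the returned `DistributionTemplate` exposes the fields and methods referenced elsewhere in this commit, such as `run_configs` and `generate_markdown_docs()`:

```python
# Hypothetical driver code; field and method names come from DistributionTemplate in this commit.
from llama_stack.templates.bedrock import get_distribution_template

template = get_distribution_template()
print(template.name)                      # "bedrock"
print(sorted(template.run_configs))       # ["run.yaml"]
print(template.generate_markdown_docs())  # rendered from doc_template.md, since template_path is set
```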

View file

@ -1,9 +1,19 @@
version: '2'
name: bedrock
distribution_spec:
  description: Use AWS Bedrock for running LLM inference and safety
  docker_image: null
  providers:
    inference:
    - remote::bedrock
    memory:
    - inline::faiss
    - remote::chromadb
    - remote::pgvector
    safety:
    - remote::bedrock
    agents:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
image_type: conda

View file

@ -0,0 +1,63 @@
# Bedrock Distribution
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
{{ providers_table }}
{% if run_config_env_vars %}
### Environment Variables
The following environment variables can be configured:
{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}
{% if default_models %}
### Models
The following models are available by default:
{% for model in default_models %}
- `{{ model.model_id }} ({{ model.provider_model_id }})`
{% endfor %}
{% endif %}
### Prerequisite: API Keys
Make sure you have access to an AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/).
## Running Llama Stack with AWS Bedrock
You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
llamastack/distribution-{{ name }} \
--port $LLAMA_STACK_PORT \
--env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
--env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
--env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```
### Via Conda
```bash
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
--port $LLAMA_STACK_PORT \
--env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
--env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
--env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```
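For reference, the template above could also be rendered standalone with Jinja, feeding in the same kinds of values the codegen supplies. The variable values below are illustrative placeholders, not taken from the commit.

```python
# Rough, standalone rendering of doc_template.md (assumes jinja2 is installed and the
# file is in the current directory).
from jinja2 import Template

with open("doc_template.md") as f:
    template = Template(f.read())

print(
    template.render(
        name="bedrock",
        providers_table="| API | Provider(s) |\n|-----|-------------|\n| inference | `remote::bedrock` |",
        run_config_env_vars={
            "LLAMASTACK_PORT": ("5001", "Port for the Llama Stack distribution server"),
        },
        default_models=[],  # the bedrock template registers no default models
    )
)
```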

View file

@ -0,0 +1,49 @@
version: '2'
image_name: bedrock
docker_image: null
conda_env: bedrock
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: bedrock
provider_type: remote::bedrock
config: {}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/faiss_store.db
safety:
- provider_id: bedrock
provider_type: remote::bedrock
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/registry.db
models: []
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []
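The generated run.yaml files lean heavily on `${env.VAR:default}` placeholders (for example `SQLITE_STORE_DIR` above). The server resolves these at startup; the helper below only illustrates the intended semantics and is not the actual implementation.

```python
# Illustrative-only resolver for ${env.VAR:default} placeholders; the real logic lives
# in the Llama Stack server, not in this snippet.
import os
import re

def resolve_placeholders(text: str) -> str:
    def _sub(match: re.Match) -> str:
        name, _, default = match.group(1).partition(":")
        return os.environ.get(name, default)
    return re.sub(r"\$\{env\.([^}]+)\}", _sub, text)

print(resolve_placeholders("${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/registry.db"))
# -> ~/.llama/distributions/bedrock/registry.db when SQLITE_STORE_DIR is unset
```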

View file

@ -1,9 +0,0 @@
name: databricks
distribution_spec:
description: Use Databricks for running LLM inference
providers:
inference: remote::databricks
memory: inline::faiss
safety: inline::llama-guard
agents: meta-reference
telemetry: meta-reference

View file

@ -43,9 +43,7 @@ LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-{{ name }} \
  --port $LLAMA_STACK_PORT \
  --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```
@ -55,6 +53,6 @@ docker run \
```bash
llama stack build --template fireworks --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .hf_endpoint import get_distribution_template # noqa: F401

View file

@ -1,9 +1,19 @@
version: '2'
name: hf-endpoint
distribution_spec:
  description: Use (an external) Hugging Face Inference Endpoint for running LLM inference
  docker_image: null
  providers:
    inference:
    - remote::hf::endpoint
    memory:
    - inline::faiss
    - remote::chromadb
    - remote::pgvector
    safety:
    - inline::llama-guard
    agents:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
image_type: conda

View file

@ -0,0 +1,97 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::hf::endpoint"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="hf-endpoint",
provider_type="remote::hf::endpoint",
config=InferenceEndpointImplConfig.sample_run_config(),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="hf-endpoint",
)
safety_model = ModelInput(
model_id="${env.SAFETY_MODEL}",
provider_id="hf-endpoint-safety",
)
return DistributionTemplate(
name="hf-endpoint",
distro_type="self_hosted",
description="Use (an external) Hugging Face Inference Endpoint for running LLM inference",
docker_image=None,
template_path=None,
providers=providers,
default_models=[inference_model, safety_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
"run-with-safety.yaml": RunConfigSettings(
provider_overrides={
"inference": [
inference_provider,
Provider(
provider_id="hf-endpoint-safety",
provider_type="remote::hf::endpoint",
config=InferenceEndpointImplConfig.sample_run_config(
endpoint_name="${env.SAFETY_INFERENCE_ENDPOINT_NAME}",
),
),
]
},
default_models=[
inference_model,
safety_model,
],
default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"HF_API_TOKEN": (
"hf_...",
"Hugging Face API token",
),
"INFERENCE_ENDPOINT_NAME": (
"",
"HF Inference endpoint name for the main inference model",
),
"SAFETY_INFERENCE_ENDPOINT_NAME": (
"",
"HF Inference endpoint for the safety model",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model served by the HF Inference Endpoint",
),
"SAFETY_MODEL": (
"meta-llama/Llama-Guard-3-1B",
"Safety model served by the HF Inference Endpoint",
),
},
)

View file

@ -0,0 +1,68 @@
version: '2'
image_name: hf-endpoint
docker_image: null
conda_env: hf-endpoint
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: hf-endpoint
provider_type: remote::hf::endpoint
config:
endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
api_token: ${env.HF_API_TOKEN}
- provider_id: hf-endpoint-safety
provider_type: remote::hf::endpoint
config:
endpoint_name: ${env.SAFETY_INFERENCE_ENDPOINT_NAME}
api_token: ${env.HF_API_TOKEN}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: hf-endpoint
provider_model_id: null
- metadata: {}
model_id: ${env.SAFETY_MODEL}
provider_id: hf-endpoint-safety
provider_model_id: null
shields:
- params: null
shield_id: ${env.SAFETY_MODEL}
provider_id: null
provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -0,0 +1,55 @@
version: '2'
image_name: hf-endpoint
docker_image: null
conda_env: hf-endpoint
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: hf-endpoint
provider_type: remote::hf::endpoint
config:
endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
api_token: ${env.HF_API_TOKEN}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: hf-endpoint
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .hf_serverless import get_distribution_template # noqa: F401

View file

@ -1,9 +1,19 @@
version: '2'
name: hf-serverless
distribution_spec:
  description: Use (an external) Hugging Face Inference Endpoint for running LLM inference
  docker_image: null
  providers:
    inference:
    - remote::hf::serverless
    memory:
    - inline::faiss
    - remote::chromadb
    - remote::pgvector
    safety:
    - inline::llama-guard
    agents:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
image_type: conda

View file

@ -0,0 +1,89 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::hf::serverless"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="hf-serverless",
provider_type="remote::hf::serverless",
config=InferenceAPIImplConfig.sample_run_config(),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="hf-serverless",
)
safety_model = ModelInput(
model_id="${env.SAFETY_MODEL}",
provider_id="hf-serverless-safety",
)
return DistributionTemplate(
name="hf-serverless",
distro_type="self_hosted",
description="Use (an external) Hugging Face Inference Endpoint for running LLM inference",
docker_image=None,
template_path=None,
providers=providers,
default_models=[inference_model, safety_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
"run-with-safety.yaml": RunConfigSettings(
provider_overrides={
"inference": [
inference_provider,
Provider(
provider_id="hf-serverless-safety",
provider_type="remote::hf::serverless",
config=InferenceAPIImplConfig.sample_run_config(
repo="${env.SAFETY_MODEL}",
),
),
]
},
default_models=[
inference_model,
safety_model,
],
default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"HF_API_TOKEN": (
"hf_...",
"Hugging Face API token",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model to be served by the HF Serverless endpoint",
),
"SAFETY_MODEL": (
"meta-llama/Llama-Guard-3-1B",
"Safety model to be served by the HF Serverless endpoint",
),
},
)

View file

@ -0,0 +1,68 @@
version: '2'
image_name: hf-serverless
docker_image: null
conda_env: hf-serverless
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: hf-serverless
provider_type: remote::hf::serverless
config:
huggingface_repo: ${env.INFERENCE_MODEL}
api_token: ${env.HF_API_TOKEN}
- provider_id: hf-serverless-safety
provider_type: remote::hf::serverless
config:
huggingface_repo: ${env.SAFETY_MODEL}
api_token: ${env.HF_API_TOKEN}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: hf-serverless
provider_model_id: null
- metadata: {}
model_id: ${env.SAFETY_MODEL}
provider_id: hf-serverless-safety
provider_model_id: null
shields:
- params: null
shield_id: ${env.SAFETY_MODEL}
provider_id: null
provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -0,0 +1,55 @@
version: '2'
image_name: hf-serverless
docker_image: null
conda_env: hf-serverless
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: hf-serverless
provider_type: remote::hf::serverless
config:
huggingface_repo: ${env.INFERENCE_MODEL}
api_token: ${env.HF_API_TOKEN}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: hf-serverless
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -1,13 +0,0 @@
name: meta-reference-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: inline::meta-reference
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety: inline::llama-guard
agents: inline::meta-reference
telemetry: inline::meta-reference

View file

@ -40,9 +40,7 @@ LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-{{ name }} \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
@ -53,9 +51,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-{{ name }} \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
@ -66,8 +62,8 @@ docker run \
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template {{ name }} --image-type conda
llama stack run distributions/{{ name }}/run.yaml \
  --port 5001 \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
@ -75,7 +71,7 @@ llama stack run ./run.yaml \
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run distributions/{{ name }}/run-with-safety.yaml \
  --port 5001 \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
  --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .meta_reference import get_distribution_template # noqa: F401

View file

@ -0,0 +1,54 @@
# Meta Reference Quantized Distribution
The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |------------------------ |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | meta-reference-quantized | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference |
The only difference from the `meta-reference-gpu` distribution is that it supports more efficient inference, e.g. with fp8 or int4 quantization.
### Step 0. Prerequisite - Downloading Models
Please make sure you have Llama model checkpoints downloaded in `~/.llama` before proceeding. See the [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) to download the models.
```
$ ls ~/.llama/checkpoints
Llama3.2-3B-Instruct:int4-qlora-eo8
```
### Step 1. Start the Distribution
#### (Option 1) Start with Docker
```
$ cd distributions/meta-reference-quantized-gpu && docker compose up
```
> [!NOTE]
> This assumes you have access to a GPU to start a local server.
> [!NOTE]
> `~/.llama` should be the path containing downloaded weights of Llama models.
This will download and start running a pre-built docker container. Alternatively, you may use the following commands:
```
docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-quantized-gpu --yaml_config /root/my-run.yaml
```
#### (Option 2) Start with Conda
1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html)
2. Build the `meta-reference-quantized-gpu` distribution
```
$ llama stack build --template meta-reference-quantized-gpu --image-type conda
```
3. Start running distribution
```
$ cd distributions/meta-reference-quantized-gpu
$ llama stack run ./run.yaml
```

View file

@ -0,0 +1,100 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.inline.inference.meta_reference import (
MetaReferenceInferenceConfig,
)
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["inline::meta-reference"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="meta-reference-inference",
provider_type="inline::meta-reference",
config=MetaReferenceInferenceConfig.sample_run_config(
model="${env.INFERENCE_MODEL}",
checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}",
),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="meta-reference-inference",
)
safety_model = ModelInput(
model_id="${env.SAFETY_MODEL}",
provider_id="meta-reference-safety",
)
return DistributionTemplate(
name="meta-reference-gpu",
distro_type="self_hosted",
description="Use Meta Reference for running LLM inference",
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=[inference_model, safety_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
"run-with-safety.yaml": RunConfigSettings(
provider_overrides={
"inference": [
inference_provider,
Provider(
provider_id="meta-reference-safety",
provider_type="inline::meta-reference",
config=MetaReferenceInferenceConfig.sample_run_config(
model="${env.SAFETY_MODEL}",
checkpoint_dir="${env.SAFETY_CHECKPOINT_DIR:null}",
),
),
],
},
default_models=[
inference_model,
safety_model,
],
default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model loaded into the Meta Reference server",
),
"INFERENCE_CHECKPOINT_DIR": (
"null",
"Directory containing the Meta Reference model checkpoint",
),
"SAFETY_MODEL": (
"meta-llama/Llama-Guard-3-1B",
"Name of the safety (Llama-Guard) model to use",
),
"SAFETY_CHECKPOINT_DIR": (
"null",
"Directory containing the Llama-Guard model checkpoint",
),
},
)

View file

@ -55,9 +55,7 @@ docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ~/.llama:/root/.llama \
  llamastack/distribution-{{ name }} \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env OLLAMA_URL=http://host.docker.internal:11434
@ -86,7 +84,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
```bash
export LLAMA_STACK_PORT=5001
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \

View file

@ -27,7 +27,7 @@ from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
class RunConfigSettings(BaseModel):
    provider_overrides: Dict[str, List[Provider]] = Field(default_factory=dict)
    default_models: Optional[List[ModelInput]] = None
    default_shields: Optional[List[ShieldInput]] = None

    def run_config(
@ -87,7 +87,7 @@ class RunConfigSettings(BaseModel):
__distro_dir__=f"distributions/{name}", __distro_dir__=f"distributions/{name}",
db_name="registry.db", db_name="registry.db",
), ),
models=self.default_models, models=self.default_models or [],
shields=self.default_shields or [], shields=self.default_shields or [],
) )
@ -104,7 +104,7 @@ class DistributionTemplate(BaseModel):
providers: Dict[str, List[str]]
run_configs: Dict[str, RunConfigSettings]
template_path: Optional[Path] = None

# Optional configuration
run_config_env_vars: Optional[Dict[str, Tuple[str, str]]] = None
@ -159,6 +159,7 @@ class DistributionTemplate(BaseModel):
with open(yaml_output_dir / yaml_pth, "w") as f:
    yaml.safe_dump(run_config.model_dump(), f, sort_keys=False)

if self.template_path:
    docs = self.generate_markdown_docs()
    with open(doc_output_dir / f"{self.name}.md", "w") as f:
        f.write(docs if docs.endswith("\n") else docs + "\n")
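The net effect of making `default_models` and `template_path` optional is that a template can now be defined with little more than a provider map and an empty `RunConfigSettings()`, and markdown generation is skipped when there is no `doc_template.md`. A minimal sketch, assuming the field names shown above:

```python
# Minimal sketch of a docs-less template enabled by the Optional fields above.
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings

template = DistributionTemplate(
    name="vllm-gpu",
    distro_type="self_hosted",
    description="Use a built-in vLLM engine for running LLM inference",
    docker_image=None,
    template_path=None,          # no doc_template.md -> markdown generation is skipped
    providers={"inference": ["inline::vllm"]},
    default_models=[],
    run_configs={"run.yaml": RunConfigSettings()},  # default_models now optional here too
)
```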

View file

@ -71,9 +71,7 @@ LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-{{ name }} \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT
@ -102,18 +100,18 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
```bash
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run ./run-with-safety.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \
  --env SAFETY_MODEL=$SAFETY_MODEL \
  --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
```

View file

@ -43,9 +43,7 @@ LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-{{ name }} \
  --port $LLAMA_STACK_PORT \
  --env TOGETHER_API_KEY=$TOGETHER_API_KEY
```
@ -53,8 +51,8 @@ docker run \
### Via Conda
```bash
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env TOGETHER_API_KEY=$TOGETHER_API_KEY
```

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .vllm import get_distribution_template # noqa: F401

View file

@ -0,0 +1,19 @@
version: '2'
name: vllm-gpu
distribution_spec:
description: Use a built-in vLLM engine for running LLM inference
docker_image: null
providers:
inference:
- inline::vllm
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety:
- inline::llama-guard
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
image_type: conda

View file

@ -0,0 +1,58 @@
version: '2'
image_name: vllm-gpu
docker_image: null
conda_env: vllm-gpu
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: vllm
provider_type: inline::vllm
config:
model: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:1}
max_tokens: ${env.MAX_TOKENS:4096}
enforce_eager: ${env.ENFORCE_EAGER:False}
gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -0,0 +1,74 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.distribution.datatypes import ModelInput, Provider
from llama_stack.providers.inline.inference.vllm import VLLMConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["inline::vllm"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="vllm",
provider_type="inline::vllm",
config=VLLMConfig.sample_run_config(),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="vllm",
)
return DistributionTemplate(
name="vllm-gpu",
distro_type="self_hosted",
description="Use a built-in vLLM engine for running LLM inference",
docker_image=None,
template_path=None,
providers=providers,
default_models=[inference_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model loaded into the vLLM engine",
),
"TENSOR_PARALLEL_SIZE": (
"1",
"Number of tensor parallel replicas (number of GPUs to use).",
),
"MAX_TOKENS": (
"4096",
"Maximum number of tokens to generate.",
),
"ENFORCE_EAGER": (
"False",
"Whether to use eager mode for inference (otherwise cuda graphs are used).",
),
"GPU_MEMORY_UTILIZATION": (
"0.7",
"GPU memory utilization for the vLLM engine.",
),
},
)