Merge branch 'main' into add-nvidia-inference-adapter

This commit is contained in:
Matthew Farrellee 2024-11-21 06:49:13 -05:00
commit 5fbfb9d854
92 changed files with 2145 additions and 678 deletions

View file

@ -1,45 +0,0 @@
version: '2'
image_name: local
name: bedrock
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: bedrock0
provider_type: remote::bedrock
config:
aws_access_key_id: <AWS_ACCESS_KEY_ID>
aws_secret_access_key: <AWS_SECRET_ACCESS_KEY>
aws_session_token: <AWS_SESSION_TOKEN>
region_name: <AWS_REGION>
memory:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
safety:
- provider_id: bedrock0
provider_type: remote::bedrock
config:
aws_access_key_id: <AWS_ACCESS_KEY_ID>
aws_secret_access_key: <AWS_SECRET_ACCESS_KEY>
aws_session_token: <AWS_SESSION_TOKEN>
region_name: <AWS_REGION>
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}

View file

@ -0,0 +1 @@
../../llama_stack/templates/bedrock/run.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/databricks/build.yaml

View file

@ -1,4 +1,32 @@
{
"hf-serverless": [
"aiohttp",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"together": [
"aiosqlite",
"blobfile",
@ -26,6 +54,33 @@
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"vllm-gpu": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"vllm",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"remote-vllm": [
"aiosqlite",
"blobfile",
@ -108,6 +163,33 @@
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"bedrock": [
"aiosqlite",
"blobfile",
"boto3",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"meta-reference-gpu": [
"accelerate",
"aiosqlite",
@ -140,6 +222,40 @@
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"meta-reference-quantized-gpu": [
"accelerate",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"fairscale",
"faiss-cpu",
"fastapi",
"fbgemm-gpu",
"fire",
"httpx",
"lm-format-enforcer",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"torch",
"torchao==0.5.0",
"torchvision",
"tqdm",
"transformers",
"uvicorn",
"zmq",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"ollama": [
"aiohttp",
"aiosqlite",
@ -167,5 +283,33 @@
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"hf-endpoint": [
"aiohttp",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
]
}

View file

@ -1 +0,0 @@
../../llama_stack/templates/hf-endpoint/build.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/hf-serverless/build.yaml

View file

@ -1 +0,0 @@
../../llama_stack/templates/ollama/build.yaml

View file

@ -1,48 +0,0 @@
services:
ollama:
image: ollama/ollama:latest
network_mode: "host"
volumes:
- ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast
ports:
- "11434:11434"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0
command: []
deploy:
resources:
reservations:
devices:
- driver: nvidia
# that's the closest analogue to --gpus; provide
# an integer amount of devices or 'all'
count: 1
# Devices are reserved using a list of capabilities, making
# capabilities the only required field. A device MUST
# satisfy all the requested capabilities for a successful
# reservation.
capabilities: [gpu]
runtime: nvidia
llamastack:
depends_on:
- ollama
image: llamastack/distribution-ollama
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
# Link to ollama run.yaml file
- ./run.yaml:/root/llamastack-run-ollama.yaml
ports:
- "5000:5000"
# Hack: wait for ollama server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-ollama.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s
volumes:
ollama:

View file

@ -1,46 +0,0 @@
version: '2'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: ollama
provider_type: remote::ollama
config:
url: ${env.OLLAMA_URL:http://127.0.0.1:11434}
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
excluded_categories: []
memory:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
models:
- model_id: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
provider_id: ollama
- model_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
provider_id: ollama
shields:
- shield_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}

1
docs/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
src

View file

@ -0,0 +1,139 @@
# Llama Stack Distributions
```{toctree}
:maxdepth: 2
:hidden:
self_hosted_distro/index
remote_hosted_distro/index
ondevice_distro/index
```
## Introduction
Llama Stack Distributions are pre-built Docker containers/Conda environments that assemble APIs and Providers to provide a consistent whole to the end application developer.
These distributions allow you to mix-and-match providers - some could be backed by local code and some could be remote. This flexibility enables you to choose the optimal setup for your use case, such as serving a small model locally while using a cloud provider for larger models, all while maintaining a consistent API interface for your application.
## Decide Your Build Type
There are two ways to start a Llama Stack:
- **Docker**: we provide a number of pre-built Docker containers allowing you to get started instantly. If you are focused on application development, we recommend this option.
- **Conda**: the `llama` CLI provides a simple set of commands to build, configure and run a Llama Stack server containing the exact combination of providers you wish. We have provided various templates to make getting started easier.
Both of these provide options to run model inference using our reference implementations, Ollama, TGI, vLLM or even remote providers like Fireworks, Together, Bedrock, etc.
### Decide Your Inference Provider
Running inference on the underlying Llama model is one of the most critical requirements. Depending on what hardware you have available, you have various options. Note that each option have different necessary prerequisites.
- **Do you have access to a machine with powerful GPUs?**
If so, we suggest:
- [distribution-meta-reference-gpu](./self_hosted_distro/meta-reference-gpu.md)
- [distribution-tgi](./self_hosted_distro/tgi.md)
- **Are you running on a "regular" desktop machine?**
If so, we suggest:
- [distribution-ollama](./self_hosted_distro/ollama.md)
- **Do you have an API key for a remote inference provider like Fireworks, Together, etc.?** If so, we suggest:
- [distribution-together](./remote_hosted_distro/together.md)
- [distribution-fireworks](./remote_hosted_distro/fireworks.md)
- **Do you want to run Llama Stack inference on your iOS / Android device** If so, we suggest:
- [iOS](./ondevice_distro/ios_sdk.md)
- [Android](https://github.com/meta-llama/llama-stack-client-kotlin) (coming soon)
Please see our pages in detail for the types of distributions we offer:
1. [Self-Hosted Distributions](./self_hosted_distro/index.md): If you want to run Llama Stack inference on your local machine.
2. [Remote-Hosted Distributions](./remote_hosted_distro/index.md): If you want to connect to a remote hosted inference provider.
3. [On-device Distributions](./ondevice_distro/index.md): If you want to run Llama Stack inference on your iOS / Android device.
## Building Your Own Distribution
### Prerequisites
```bash
$ git clone git@github.com:meta-llama/llama-stack.git
```
### Starting the Distribution
::::{tab-set}
:::{tab-item} meta-reference-gpu
##### System Requirements
Access to Single-Node GPU to start a local server.
##### Downloading Models
Please make sure you have Llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../cli_reference/download_models.md) here to download the models.
```
$ ls ~/.llama/checkpoints
Llama3.1-8B Llama3.2-11B-Vision-Instruct Llama3.2-1B-Instruct Llama3.2-90B-Vision-Instruct Llama-Guard-3-8B
Llama3.1-8B-Instruct Llama3.2-1B Llama3.2-3B-Instruct Llama-Guard-3-1B Prompt-Guard-86M
```
:::
:::{tab-item} vLLM
##### System Requirements
Access to Single-Node GPU to start a vLLM server.
:::
:::{tab-item} tgi
##### System Requirements
Access to Single-Node GPU to start a TGI server.
:::
:::{tab-item} ollama
##### System Requirements
Access to Single-Node CPU/GPU able to run ollama.
:::
:::{tab-item} together
##### System Requirements
Access to Single-Node CPU with Together hosted endpoint via API_KEY from [together.ai](https://api.together.xyz/signin).
:::
:::{tab-item} fireworks
##### System Requirements
Access to Single-Node CPU with Fireworks hosted endpoint via API_KEY from [fireworks.ai](https://fireworks.ai/).
:::
::::
::::{tab-set}
:::{tab-item} meta-reference-gpu
- [Start Meta Reference GPU Distribution](./self_hosted_distro/meta-reference-gpu.md)
:::
:::{tab-item} vLLM
- [Start vLLM Distribution](./self_hosted_distro/remote-vllm.md)
:::
:::{tab-item} tgi
- [Start TGI Distribution](./self_hosted_distro/tgi.md)
:::
:::{tab-item} ollama
- [Start Ollama Distribution](./self_hosted_distro/ollama.md)
:::
:::{tab-item} together
- [Start Together Distribution](./self_hosted_distro/together.md)
:::
:::{tab-item} fireworks
- [Start Fireworks Distribution](./self_hosted_distro/fireworks.md)
:::
::::
### Troubleshooting
- If you encounter any issues, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file an new issue.
- Use `--port <PORT>` flag to use a different port number. For docker run, update the `-p <PORT>:<PORT>` flag.

View file

@ -1,4 +1,4 @@
# On-Device Distribution
# On-Device Distributions
On-device distributions are Llama Stack distributions that run locally on your iOS / Android device.

View file

@ -1,4 +1,11 @@
# Remote-Hosted Distribution
# Remote-Hosted Distributions
```{toctree}
:maxdepth: 2
:hidden:
remote
```
Remote-Hosted distributions are available endpoints serving Llama Stack API that you can directly connect to.

View file

@ -0,0 +1,64 @@
# Bedrock Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-bedrock` distribution consists of the following provider configurations:
| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| inference | `remote::bedrock` |
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
| safety | `remote::bedrock` |
| telemetry | `inline::meta-reference` |
### Environment Variables
The following environment variables can be configured:
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
### Prerequisite: API Keys
Make sure you have access to a AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/).
## Running Llama Stack with AWS Bedrock
You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
llamastack/distribution-bedrock \
--port $LLAMA_STACK_PORT \
--env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
--env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
--env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```
### Via Conda
```bash
llama stack build --template bedrock --image-type conda
llama stack run ./run.yaml \
--port $LLAMA_STACK_PORT \
--env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
--env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
--env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```

View file

@ -1,5 +1,12 @@
# Dell-TGI Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-tgi` distribution consists of the following provider configurations.

View file

@ -1,5 +1,12 @@
# Fireworks Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-fireworks` distribution consists of the following provider configurations.
| API | Provider(s) |
@ -51,9 +58,7 @@ LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-fireworks \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```
@ -63,6 +68,6 @@ docker run \
```bash
llama stack build --template fireworks --image-type conda
llama stack run ./run.yaml \
--port 5001 \
--port $LLAMA_STACK_PORT \
--env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```

View file

@ -1,20 +1,8 @@
# Self-Hosted Distribution
We offer deployable distributions where you can host your own Llama Stack server using local inference.
| **Distribution** | **Llama Stack Docker** | Start This Distribution | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|:----------------: |:------------------------------------------: |:-----------------------: |:------------------: |:------------------: |:------------------: |:------------------: |:------------------: |
| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html) | meta-reference | meta-reference | meta-reference; remote::pgvector; remote::chromadb | meta-reference | meta-reference |
| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) | meta-reference-quantized | meta-reference | meta-reference; remote::pgvector; remote::chromadb | meta-reference | meta-reference |
| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html) | remote::ollama | meta-reference | remote::pgvector; remote::chromadb | meta-reference | meta-reference |
| TGI | [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html) | remote::tgi | meta-reference | meta-reference; remote::pgvector; remote::chromadb | meta-reference | meta-reference |
| Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/together.html) | remote::together | meta-reference | remote::weaviate | meta-reference | meta-reference |
| Fireworks | [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/fireworks.html) | remote::fireworks | meta-reference | remote::weaviate | meta-reference | meta-reference |
| Bedrock | [llamastack/distribution-bedrock](https://hub.docker.com/repository/docker/llamastack/distribution-bedrock/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/bedrock.html) | remote::bedrock | meta-reference | remote::weaviate | meta-reference | meta-reference |
# Self-Hosted Distributions
```{toctree}
:maxdepth: 1
:maxdepth: 2
:hidden:
meta-reference-gpu
meta-reference-quantized-gpu
@ -26,3 +14,15 @@ fireworks
remote-vllm
bedrock
```
We offer deployable distributions where you can host your own Llama Stack server using local inference.
| **Distribution** | **Llama Stack Docker** | Start This Distribution |
|:----------------: |:------------------------------------------: |:-----------------------: |
| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html) |
| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) |
| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html) |
| TGI | [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html) |
| Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/together.html) |
| Fireworks | [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/fireworks.html) |
| Bedrock | [llamastack/distribution-bedrock](https://hub.docker.com/repository/docker/llamastack/distribution-bedrock/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/bedrock.html) |

View file

@ -1,5 +1,12 @@
# Meta Reference Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-meta-reference-gpu` distribution consists of the following provider configurations:
| API | Provider(s) |
@ -47,9 +54,7 @@ LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-meta-reference-gpu \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
@ -60,9 +65,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run-with-safety.yaml:/root/my-run.yaml \
llamastack/distribution-meta-reference-gpu \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
@ -74,7 +77,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
```bash
llama stack build --template meta-reference-gpu --image-type conda
llama stack run ./run.yaml \
llama stack run distributions/meta-reference-gpu/run.yaml \
--port 5001 \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
@ -82,7 +85,7 @@ llama stack run ./run.yaml \
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run ./run-with-safety.yaml \
llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
--port 5001 \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

View file

@ -0,0 +1,92 @@
# Meta Reference Quantized Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations:
| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| inference | `inline::meta-reference-quantized` |
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
| safety | `inline::llama-guard` |
| telemetry | `inline::meta-reference` |
The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc.
Note that you need access to nvidia GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.
### Environment Variables
The following environment variables can be configured:
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
- `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)
## Prerequisite: Downloading Models
Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
```
$ ls ~/.llama/checkpoints
Llama3.1-8B Llama3.2-11B-Vision-Instruct Llama3.2-1B-Instruct Llama3.2-90B-Vision-Instruct Llama-Guard-3-8B
Llama3.1-8B-Instruct Llama3.2-1B Llama3.2-3B-Instruct Llama-Guard-3-1B Prompt-Guard-86M
```
## Running the Distribution
You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
llamastack/distribution-meta-reference-quantized-gpu \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
llamastack/distribution-meta-reference-quantized-gpu \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
```
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template meta-reference-quantized-gpu --image-type conda
llama stack run distributions/meta-reference-quantized-gpu/run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run distributions/meta-reference-quantized-gpu/run-with-safety.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
```

View file

@ -1,5 +1,12 @@
# Ollama Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-ollama` distribution consists of the following provider configurations.
| API | Provider(s) |
@ -59,9 +66,7 @@ docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-ollama \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434

View file

@ -1,4 +1,10 @@
# Remote vLLM Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-remote-vllm` distribution consists of the following provider configurations:

View file

@ -1,5 +1,12 @@
# TGI Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-tgi` distribution consists of the following provider configurations.
| API | Provider(s) |
@ -78,9 +85,7 @@ LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-tgi \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://host.docker.internal:$INFERENCE_PORT
@ -109,18 +114,18 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
```bash
llama stack build --template tgi --image-type conda
llama stack run ./run.yaml
--port 5001
--env INFERENCE_MODEL=$INFERENCE_MODEL
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run ./run-with-safety.yaml
--port 5001
--env INFERENCE_MODEL=$INFERENCE_MODEL
--env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
--env SAFETY_MODEL=$SAFETY_MODEL
llama stack run ./run-with-safety.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
```

View file

@ -1,4 +1,11 @@
# Fireworks Distribution
# Together Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-together` distribution consists of the following provider configurations.
@ -50,9 +57,7 @@ LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-together \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env TOGETHER_API_KEY=$TOGETHER_API_KEY
```
@ -62,6 +67,6 @@ docker run \
```bash
llama stack build --template together --image-type conda
llama stack run ./run.yaml \
--port 5001 \
--port $LLAMA_STACK_PORT \
--env TOGETHER_API_KEY=$TOGETHER_API_KEY
```

View file

@ -1,58 +0,0 @@
# Bedrock Distribution
### Connect to a Llama Stack Bedrock Endpoint
- You may connect to Amazon Bedrock APIs for running LLM inference
The `llamastack/distribution-bedrock` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |--------------- |---------------- |---------------- |---------------- |---------------- |
| **Provider(s)** | remote::bedrock | meta-reference | meta-reference | remote::bedrock | meta-reference |
### Docker: Start the Distribution (Single Node CPU)
> [!NOTE]
> This assumes you have valid AWS credentials configured with access to Amazon Bedrock.
```
$ cd distributions/bedrock && docker compose up
```
Make sure in your `run.yaml` file, your inference provider is pointing to the correct AWS configuration. E.g.
```
inference:
- provider_id: bedrock0
provider_type: remote::bedrock
config:
aws_access_key_id: <AWS_ACCESS_KEY_ID>
aws_secret_access_key: <AWS_SECRET_ACCESS_KEY>
aws_session_token: <AWS_SESSION_TOKEN>
region_name: <AWS_REGION>
```
### Conda llama stack run (Single Node CPU)
```bash
llama stack build --template bedrock --image-type conda
# -- modify run.yaml with valid AWS credentials
llama stack run ./run.yaml
```
### (Optional) Update Model Serving Configuration
Use `llama-stack-client models list` to check the available models served by Amazon Bedrock.
```
$ llama-stack-client models list
+------------------------------+------------------------------+---------------+------------+
| identifier | llama_model | provider_id | metadata |
+==============================+==============================+===============+============+
| Llama3.1-8B-Instruct | meta.llama3-1-8b-instruct-v1:0 | bedrock0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-70B-Instruct | meta.llama3-1-70b-instruct-v1:0 | bedrock0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-405B-Instruct | meta.llama3-1-405b-instruct-v1:0 | bedrock0 | {} |
+------------------------------+------------------------------+---------------+------------+
```

View file

@ -1,54 +0,0 @@
# Meta Reference Quantized Distribution
The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |------------------------ |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | meta-reference-quantized | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference |
The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc.
### Step 0. Prerequisite - Downloading Models
Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models.
```
$ ls ~/.llama/checkpoints
Llama3.2-3B-Instruct:int4-qlora-eo8
```
### Step 1. Start the Distribution
#### (Option 1) Start with Docker
```
$ cd distributions/meta-reference-quantized-gpu && docker compose up
```
> [!NOTE]
> This assumes you have access to GPU to start a local server with access to your GPU.
> [!NOTE]
> `~/.llama` should be the path containing downloaded weights of Llama models.
This will download and start running a pre-built docker container. Alternatively, you may use the following commands:
```
docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-quantized-gpu --yaml_config /root/my-run.yaml
```
#### (Option 2) Start with Conda
1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html)
2. Build the `meta-reference-quantized-gpu` distribution
```
$ llama stack build --template meta-reference-quantized-gpu --image-type conda
```
3. Start running distribution
```
$ cd distributions/meta-reference-quantized-gpu
$ llama stack run ./run.yaml
```

View file

@ -1,194 +1,208 @@
# Getting Started
# Getting Started with Llama Stack
```{toctree}
:maxdepth: 2
:hidden:
distributions/self_hosted_distro/index
distributions/remote_hosted_distro/index
distributions/ondevice_distro/index
In this guide, we'll walk through using ollama as the inference provider and build a simple python application that uses the Llama Stack Client SDK
Llama stack consists of a distribution server and an accompanying client SDK. The distribution server can be configured for different providers for inference, memory, agents, evals etc. This configuration is defined in a yaml file called `run.yaml`.
Running inference on the underlying Llama model is one of the most critical requirements. Depending on what hardware you have available, you have various options. Note that each option have different necessary prerequisites. We will use ollama as the inference provider as it is the easiest to get started with.
### Step 1. Start the inference server
```bash
export LLAMA_STACK_PORT=5001
export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
# ollama names this model differently, and we must use the ollama name when loading the model
export OLLAMA_INFERENCE_MODEL="llama3.2:3b-instruct-fp16"
ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m
```
At the end of the guide, you will have learned how to:
- get a Llama Stack server up and running
- set up an agent (with tool-calling and vector stores) that works with the above server
To see more example apps built using Llama Stack, see [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main).
## Step 1. Starting Up Llama Stack Server
### Decide Your Build Type
There are two ways to start a Llama Stack:
- **Docker**: we provide a number of pre-built Docker containers allowing you to get started instantly. If you are focused on application development, we recommend this option.
- **Conda**: the `llama` CLI provides a simple set of commands to build, configure and run a Llama Stack server containing the exact combination of providers you wish. We have provided various templates to make getting started easier.
Both of these provide options to run model inference using our reference implementations, Ollama, TGI, vLLM or even remote providers like Fireworks, Together, Bedrock, etc.
### Decide Your Inference Provider
Running inference on the underlying Llama model is one of the most critical requirements. Depending on what hardware you have available, you have various options. Note that each option have different necessary prerequisites.
- **Do you have access to a machine with powerful GPUs?**
If so, we suggest:
- [distribution-meta-reference-gpu](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html)
- [distribution-tgi](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/tgi.html)
- **Are you running on a "regular" desktop machine?**
If so, we suggest:
- [distribution-ollama](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html)
- **Do you have an API key for a remote inference provider like Fireworks, Together, etc.?** If so, we suggest:
- [distribution-together](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/remote_hosted_distro/together.html)
- [distribution-fireworks](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/remote_hosted_distro/fireworks.html)
- **Do you want to run Llama Stack inference on your iOS / Android device** If so, we suggest:
- [iOS](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/ondevice_distro/ios_sdk.html)
- [Android](https://github.com/meta-llama/llama-stack-client-kotlin) (coming soon)
Please see our pages in detail for the types of distributions we offer:
1. [Self-Hosted Distribution](./distributions/self_hosted_distro/index.md): If you want to run Llama Stack inference on your local machine.
2. [Remote-Hosted Distribution](./distributions/remote_hosted_distro/index.md): If you want to connect to a remote hosted inference provider.
3. [On-device Distribution](./distributions/ondevice_distro/index.md): If you want to run Llama Stack inference on your iOS / Android device.
### Table of Contents
Once you have decided on the inference provider and distribution to use, use the following guides to get started.
##### 1.0 Prerequisite
```
$ git clone git@github.com:meta-llama/llama-stack.git
```
::::{tab-set}
:::{tab-item} meta-reference-gpu
##### System Requirements
Access to Single-Node GPU to start a local server.
##### Downloading Models
Please make sure you have Llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models.
```
$ ls ~/.llama/checkpoints
Llama3.1-8B Llama3.2-11B-Vision-Instruct Llama3.2-1B-Instruct Llama3.2-90B-Vision-Instruct Llama-Guard-3-8B
Llama3.1-8B-Instruct Llama3.2-1B Llama3.2-3B-Instruct Llama-Guard-3-1B Prompt-Guard-86M
```
:::
:::{tab-item} vLLM
##### System Requirements
Access to Single-Node GPU to start a vLLM server.
:::
:::{tab-item} tgi
##### System Requirements
Access to Single-Node GPU to start a TGI server.
:::
:::{tab-item} ollama
##### System Requirements
Access to Single-Node CPU/GPU able to run ollama.
:::
:::{tab-item} together
##### System Requirements
Access to Single-Node CPU with Together hosted endpoint via API_KEY from [together.ai](https://api.together.xyz/signin).
:::
:::{tab-item} fireworks
##### System Requirements
Access to Single-Node CPU with Fireworks hosted endpoint via API_KEY from [fireworks.ai](https://fireworks.ai/).
:::
::::
##### 1.1. Start the distribution
::::{tab-set}
:::{tab-item} meta-reference-gpu
- [Start Meta Reference GPU Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html)
:::
:::{tab-item} vLLM
- [Start vLLM Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/remote-vllm.html)
:::
:::{tab-item} tgi
- [Start TGI Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html)
:::
:::{tab-item} ollama
- [Start Ollama Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html)
:::
:::{tab-item} together
- [Start Together Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/together.html)
:::
:::{tab-item} fireworks
- [Start Fireworks Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/fireworks.html)
:::
::::
##### Troubleshooting
- If you encounter any issues, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file an new issue.
- Use `--port <PORT>` flag to use a different port number. For docker run, update the `-p <PORT>:<PORT>` flag.
## Step 2. Run Llama Stack App
### Chat Completion Test
Once the server is set up, we can test it with a client to verify it's working correctly. The following command will send a chat completion request to the server's `/inference/chat_completion` API:
### Step 2. Start the Llama Stack server
```bash
$ curl http://localhost:5000/alpha/inference/chat-completion \
-H "Content-Type: application/json" \
-d '{
"model_id": "meta-llama/Llama-3.1-8B-Instruct",
"messages": [
export LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
llamastack/distribution-ollama \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434
```
### Step 3. Use the Llama Stack client SDK
```bash
pip install llama-stack-client
```
We will use the `llama-stack-client` CLI to check the connectivity to the server. This should be installed in your environment if you installed the SDK.
```bash
llama-stack-client --endpoint http://localhost:5001 models list
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ identifier ┃ provider_id ┃ provider_resource_id ┃ metadata ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
│ meta-llama/Llama-3.2-3B-Instruct │ ollama │ llama3.2:3b-instruct-fp16 │ {} │
└──────────────────────────────────┴─────────────┴───────────────────────────┴──────────┘
```
Chat completion using the CLI
```bash
llama-stack-client --endpoint http://localhost:5001 inference chat_completion --message "hello, what model are you?"
```
Simple python example using the client SDK
```python
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url="http://localhost:5001")
# List available models
models = client.models.list()
print(models)
# Simple chat completion
response = client.inference.chat_completion(
model_id="meta-llama/Llama-3.2-3B-Instruct",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Write me a 2 sentence poem about the moon"}
],
"sampling_params": {"temperature": 0.7, "seed": 42, "max_tokens": 512}
}'
Output:
{'completion_message': {'role': 'assistant',
'content': 'The moon glows softly in the midnight sky, \nA beacon of wonder, as it catches the eye.',
'stop_reason': 'out_of_tokens',
'tool_calls': []},
'logprobs': null}
{"role": "user", "content": "Write a haiku about coding"}
]
)
print(response.completion_message.content)
```
### Run Agent App
### Step 4. Your first RAG agent
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
To run an agent app, check out examples demo scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo. To run a simple agent app:
import asyncio
```bash
$ git clone git@github.com:meta-llama/llama-stack-apps.git
$ cd llama-stack-apps
$ pip install -r requirements.txt
import fire
$ python -m examples.agents.client <host> <port>
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client.types import Attachment
from llama_stack_client.types.agent_create_params import AgentConfig
async def run_main(host: str, port: int, disable_safety: bool = False):
urls = [
"memory_optimizations.rst",
"chat.rst",
"llama3.rst",
"datasets.rst",
"qat_finetune.rst",
"lora_finetune.rst",
]
attachments = [
Attachment(
content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
mime_type="text/plain",
)
for i, url in enumerate(urls)
]
client = LlamaStackClient(
base_url=f"http://{host}:{port}",
)
available_shields = [shield.identifier for shield in client.shields.list()]
if not available_shields:
print("No available shields. Disable safety.")
else:
print(f"Available shields found: {available_shields}")
available_models = [model.identifier for model in client.models.list()]
if not available_models:
raise ValueError("No available models")
else:
selected_model = available_models[0]
print(f"Using model: {selected_model}")
agent_config = AgentConfig(
model=selected_model,
instructions="You are a helpful assistant",
sampling_params={
"strategy": "greedy",
"temperature": 1.0,
"top_p": 0.9,
},
tools=[
{
"type": "memory",
"memory_bank_configs": [],
"query_generator_config": {"type": "default", "sep": " "},
"max_tokens_in_context": 4096,
"max_chunks": 10,
},
],
tool_choice="auto",
tool_prompt_format="json",
input_shields=available_shields if available_shields else [],
output_shields=available_shields if available_shields else [],
enable_session_persistence=False,
)
agent = Agent(client, agent_config)
session_id = agent.create_session("test-session")
print(f"Created session_id={session_id} for Agent({agent.agent_id})")
user_prompts = [
(
"I am attaching some documentation for Torchtune. Help me answer questions I will ask next.",
attachments,
),
(
"What are the top 5 topics that were explained? Only list succinct bullet points.",
None,
),
(
"Was anything related to 'Llama3' discussed, if so what?",
None,
),
(
"Tell me how to use LoRA",
None,
),
(
"What about Quantization?",
None,
),
]
for prompt in user_prompts:
response = agent.create_turn(
messages=[
{
"role": "user",
"content": prompt[0],
}
],
attachments=prompt[1],
session_id=session_id,
)
async for log in EventLogger().log(response):
log.print()
def main(host: str, port: int):
asyncio.run(run_main(host, port))
if __name__ == "__main__":
fire.Fire(main)
```
You will see outputs of the form --
```
User> I am planning a trip to Switzerland, what are the top 3 places to visit?
inference> Switzerland is a beautiful country with a rich history, stunning landscapes, and vibrant culture. Here are three must-visit places to add to your itinerary:
...
## Next Steps
User> What is so special about #1?
inference> Jungfraujoch, also known as the "Top of Europe," is a unique and special place for several reasons:
...
- You can mix and match different providers for inference, memory, agents, evals etc. See [Building custom distributions](../distributions/index.md)
- [Developer Cookbook](developer_cookbook.md)
User> What other countries should I consider to club?
inference> Considering your interest in Switzerland, here are some neighboring countries that you may want to consider visiting:
```
For example applications and more detailed tutorials, visit our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository.

View file

@ -7,8 +7,7 @@ The Stack APIs are rapidly improving but still a work-in-progress. We invite fee
```{image} ../_static/llama-stack.png
:alt: Llama Stack
:width: 600px
:align: center
:width: 400px
```
## APIs
@ -86,8 +85,10 @@ You can find more example scripts with client SDKs to talk with the Llama Stack
:maxdepth: 3
getting_started/index
cli_reference/index
cli_reference/download_models
distributions/index
llama_cli_reference/index
llama_cli_reference/download_models
llama_stack_client_cli_reference/index
api_providers/index
distribution_dev/index
```

View file

@ -1,4 +1,4 @@
# CLI Reference
# llama CLI Reference
The `llama` CLI tool helps you setup and use the Llama Stack. It should be available on your path after installing the `llama-stack` package.
@ -119,7 +119,7 @@ You should see a table like this:
To download models, you can use the llama download command.
#### Downloading from [Meta](https://llama.meta.com/llama-downloads/)
### Downloading from [Meta](https://llama.meta.com/llama-downloads/)
Here is an example download command to get the 3B-Instruct/11B-Vision-Instruct model. You will need META_URL which can be obtained from [here](https://llama.meta.com/docs/getting_the_models/meta/)
@ -137,7 +137,7 @@ llama download --source meta --model-id Prompt-Guard-86M --meta-url META_URL
llama download --source meta --model-id Llama-Guard-3-1B --meta-url META_URL
```
#### Downloading from [Hugging Face](https://huggingface.co/meta-llama)
### Downloading from [Hugging Face](https://huggingface.co/meta-llama)
Essentially, the same commands above work, just replace `--source meta` with `--source huggingface`.

View file

@ -0,0 +1,162 @@
# llama-stack-client CLI Reference
You may use the `llama-stack-client` to query information about the distribution.
## Basic Commands
### `llama-stack-client`
```bash
$ llama-stack-client -h
usage: llama-stack-client [-h] {models,memory_banks,shields} ...
Welcome to the LlamaStackClient CLI
options:
-h, --help show this help message and exit
subcommands:
{models,memory_banks,shields}
```
### `llama-stack-client configure`
```bash
$ llama-stack-client configure
> Enter the host name of the Llama Stack distribution server: localhost
> Enter the port number of the Llama Stack distribution server: 5000
Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:5000
```
## Provider Commands
### `llama-stack-client providers list`
```bash
$ llama-stack-client providers list
```
```
+-----------+----------------+-----------------+
| API | Provider ID | Provider Type |
+===========+================+=================+
| scoring | meta0 | meta-reference |
+-----------+----------------+-----------------+
| datasetio | meta0 | meta-reference |
+-----------+----------------+-----------------+
| inference | tgi0 | remote::tgi |
+-----------+----------------+-----------------+
| memory | meta-reference | meta-reference |
+-----------+----------------+-----------------+
| agents | meta-reference | meta-reference |
+-----------+----------------+-----------------+
| telemetry | meta-reference | meta-reference |
+-----------+----------------+-----------------+
| safety | meta-reference | meta-reference |
+-----------+----------------+-----------------+
```
## Model Management
### `llama-stack-client models list`
```bash
$ llama-stack-client models list
```
```
+----------------------+----------------------+---------------+----------------------------------------------------------+
| identifier | llama_model | provider_id | metadata |
+======================+======================+===============+==========================================================+
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | tgi0 | {'huggingface_repo': 'meta-llama/Llama-3.1-8B-Instruct'} |
+----------------------+----------------------+---------------+----------------------------------------------------------+
```
### `llama-stack-client models get`
```bash
$ llama-stack-client models get Llama3.1-8B-Instruct
```
```
+----------------------+----------------------+----------------------------------------------------------+---------------+
| identifier | llama_model | metadata | provider_id |
+======================+======================+==========================================================+===============+
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | {'huggingface_repo': 'meta-llama/Llama-3.1-8B-Instruct'} | tgi0 |
+----------------------+----------------------+----------------------------------------------------------+---------------+
```
```bash
$ llama-stack-client models get Random-Model
Model RandomModel is not found at distribution endpoint host:port. Please ensure endpoint is serving specified model.
```
### `llama-stack-client models register`
```bash
$ llama-stack-client models register <model_id> [--provider-id <provider_id>] [--provider-model-id <provider_model_id>] [--metadata <metadata>]
```
### `llama-stack-client models update`
```bash
$ llama-stack-client models update <model_id> [--provider-id <provider_id>] [--provider-model-id <provider_model_id>] [--metadata <metadata>]
```
### `llama-stack-client models delete`
```bash
$ llama-stack-client models delete <model_id>
```
## Memory Bank Management
### `llama-stack-client memory_banks list`
```bash
$ llama-stack-client memory_banks list
```
```
+--------------+----------------+--------+-------------------+------------------------+--------------------------+
| identifier | provider_id | type | embedding_model | chunk_size_in_tokens | overlap_size_in_tokens |
+==============+================+========+===================+========================+==========================+
| test_bank | meta-reference | vector | all-MiniLM-L6-v2 | 512 | 64 |
+--------------+----------------+--------+-------------------+------------------------+--------------------------+
```
## Shield Management
### `llama-stack-client shields list`
```bash
$ llama-stack-client shields list
```
```
+--------------+----------+----------------+-------------+
| identifier | params | provider_id | type |
+==============+==========+================+=============+
| llama_guard | {} | meta-reference | llama_guard |
+--------------+----------+----------------+-------------+
```
## Evaluation Tasks
### `llama-stack-client eval_tasks list`
```bash
$ llama-stack-client eval run_benchmark <task_id1> <task_id2> --num-examples 10 --output-dir ./ --eval-task-config ~/eval_task_config.json
```
where `eval_task_config.json` is the path to the eval task config file in JSON format. An example eval_task_config
```
$ cat ~/eval_task_config.json
{
"type": "benchmark",
"eval_candidate": {
"type": "model",
"model": "Llama3.1-405B-Instruct",
"sampling_params": {
"strategy": "greedy",
"temperature": 0,
"top_p": 0.95,
"top_k": 0,
"max_tokens": 0,
"repetition_penalty": 1.0
}
}
}
```

View file

@ -8,7 +8,6 @@ import argparse
from llama_stack.cli.subcommand import Subcommand
from llama_stack.distribution.datatypes import * # noqa: F403
import importlib
import os
import shutil
from functools import lru_cache
@ -258,6 +257,7 @@ class StackBuild(Subcommand):
) -> None:
import json
import os
import re
import yaml
from termcolor import cprint
@ -286,17 +286,19 @@ class StackBuild(Subcommand):
os.makedirs(build_dir, exist_ok=True)
run_config_file = build_dir / f"{build_config.name}-run.yaml"
shutil.copy(template_path, run_config_file)
module_name = f"llama_stack.templates.{template_name}"
module = importlib.import_module(module_name)
distribution_template = module.get_distribution_template()
with open(template_path, "r") as f:
yaml_content = f.read()
# Find all ${env.VARIABLE} patterns
env_vars = set(re.findall(r"\${env\.([A-Za-z0-9_]+)}", yaml_content))
cprint("Build Successful! Next steps: ", color="green")
env_vars = ", ".join(distribution_template.run_config_env_vars.keys())
cprint(
f" 1. Set the environment variables: {env_vars}",
f" 1. Set the environment variables: {list(env_vars)}",
color="green",
)
cprint(
f" 2. `llama stack run {run_config_file}`",
f" 2. Run: `llama stack run {template_name}`",
color="green",
)
else:

View file

@ -5,9 +5,12 @@
# the root directory of this source tree.
import argparse
from pathlib import Path
from llama_stack.cli.subcommand import Subcommand
REPO_ROOT = Path(__file__).parent.parent.parent.parent
class StackRun(Subcommand):
def __init__(self, subparsers: argparse._SubParsersAction):
@ -48,8 +51,6 @@ class StackRun(Subcommand):
)
def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
from pathlib import Path
import pkg_resources
import yaml
@ -66,19 +67,27 @@ class StackRun(Subcommand):
return
config_file = Path(args.config)
if not config_file.exists() and not args.config.endswith(".yaml"):
has_yaml_suffix = args.config.endswith(".yaml")
if not config_file.exists() and not has_yaml_suffix:
# check if this is a template
config_file = (
Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
)
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to conda dir
config_file = Path(
BUILDS_BASE_DIR / ImageType.conda.value / f"{args.config}-run.yaml"
)
if not config_file.exists() and not args.config.endswith(".yaml"):
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to docker dir
config_file = Path(
BUILDS_BASE_DIR / ImageType.docker.value / f"{args.config}-run.yaml"
)
if not config_file.exists() and not args.config.endswith(".yaml"):
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to ~/.llama dir
config_file = Path(
DISTRIBS_BASE_DIR
@ -92,6 +101,7 @@ class StackRun(Subcommand):
)
return
print(f"Using config file: {config_file}")
config_dict = yaml.safe_load(config_file.read_text())
config = parse_and_maybe_upgrade_config(config_dict)

View file

@ -122,7 +122,7 @@ add_to_docker <<EOF
# This would be good in production but for debugging flexibility lets not add it right now
# We need a more solid production ready entrypoint.sh anyway
#
ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server"]
ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server", "--template", "$build_name"]
EOF

View file

@ -170,13 +170,6 @@ class CommonRoutingTableImpl(RoutingTable):
# Get existing objects from registry
existing_obj = await self.dist_registry.get(obj.type, obj.identifier)
# Check for existing registration
if existing_obj and existing_obj.provider_id == obj.provider_id:
print(
f"`{obj.identifier}` already registered with `{existing_obj.provider_id}`"
)
return existing_obj
# if provider_id is not specified, pick an arbitrary one from existing entries
if not obj.provider_id and len(self.impls_by_provider_id) > 0:
obj.provider_id = list(self.impls_by_provider_id.keys())[0]

View file

@ -16,6 +16,7 @@ import traceback
import warnings
from contextlib import asynccontextmanager
from pathlib import Path
from ssl import SSLError
from typing import Any, Dict, Optional
@ -49,6 +50,9 @@ from llama_stack.distribution.stack import (
from .endpoints import get_all_api_endpoints
REPO_ROOT = Path(__file__).parent.parent.parent.parent
def warn_with_traceback(message, category, filename, lineno, file=None, line=None):
log = file if hasattr(file, "write") else sys.stderr
traceback.print_stack(file=log)
@ -279,9 +283,12 @@ def main():
parser = argparse.ArgumentParser(description="Start the LlamaStack server.")
parser.add_argument(
"--yaml-config",
default="llamastack-run.yaml",
help="Path to YAML configuration file",
)
parser.add_argument(
"--template",
help="One of the template names in llama_stack/templates (e.g., tgi, fireworks, remote-vllm, etc.)",
)
parser.add_argument("--port", type=int, default=5000, help="Port to listen on")
parser.add_argument(
"--disable-ipv6", action="store_true", help="Whether to disable IPv6 support"
@ -303,10 +310,29 @@ def main():
print(f"Error: {str(e)}")
sys.exit(1)
with open(args.yaml_config, "r") as fp:
if args.yaml_config:
# if the user provided a config file, use it, even if template was specified
config_file = Path(args.yaml_config)
if not config_file.exists():
raise ValueError(f"Config file {config_file} does not exist")
print(f"Using config file: {config_file}")
elif args.template:
config_file = (
Path(REPO_ROOT) / "llama_stack" / "templates" / args.template / "run.yaml"
)
if not config_file.exists():
raise ValueError(f"Template {args.template} does not exist")
print(f"Using template {args.template} config file: {config_file}")
else:
raise ValueError("Either --yaml-config or --template must be provided")
with open(config_file, "r") as fp:
config = replace_env_vars(yaml.safe_load(fp))
config = StackRunConfig(**config)
print("Run configuration:")
print(yaml.dump(config.model_dump(), indent=2))
app = FastAPI()
try:

View file

@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Optional
from typing import Any, Dict, Optional
from llama_models.datatypes import * # noqa: F403
from llama_models.sku_list import resolve_model
@ -37,8 +37,10 @@ class MetaReferenceInferenceConfig(BaseModel):
@classmethod
def validate_model(cls, model: str) -> str:
permitted_models = supported_inference_models()
if model not in permitted_models:
model_list = "\n\t".join(permitted_models)
descriptors = [m.descriptor() for m in permitted_models]
repos = [m.huggingface_repo for m in permitted_models]
if model not in (descriptors + repos):
model_list = "\n\t".join(repos)
raise ValueError(
f"Unknown model: `{model}`. Choose from [\n\t{model_list}\n]"
)
@ -54,6 +56,7 @@ class MetaReferenceInferenceConfig(BaseModel):
cls,
model: str = "Llama3.2-3B-Instruct",
checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}",
**kwargs,
) -> Dict[str, Any]:
return {
"model": model,
@ -64,3 +67,16 @@ class MetaReferenceInferenceConfig(BaseModel):
class MetaReferenceQuantizedInferenceConfig(MetaReferenceInferenceConfig):
quantization: QuantizationConfig
@classmethod
def sample_run_config(
cls,
model: str = "Llama3.2-3B-Instruct",
checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}",
**kwargs,
) -> Dict[str, Any]:
config = super().sample_run_config(model, checkpoint_dir, **kwargs)
config["quantization"] = {
"type": "fp8",
}
return config

View file

@ -37,19 +37,22 @@ class VLLMConfig(BaseModel):
@classmethod
def sample_run_config(cls):
return {
"model": "${env.VLLM_INFERENCE_MODEL:Llama3.2-3B-Instruct}",
"tensor_parallel_size": "${env.VLLM_TENSOR_PARALLEL_SIZE:1}",
"max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
"enforce_eager": "${env.VLLM_ENFORCE_EAGER:False}",
"gpu_memory_utilization": "${env.VLLM_GPU_MEMORY_UTILIZATION:0.3}",
"model": "${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
"tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}",
"max_tokens": "${env.MAX_TOKENS:4096}",
"enforce_eager": "${env.ENFORCE_EAGER:False}",
"gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.7}",
}
@field_validator("model")
@classmethod
def validate_model(cls, model: str) -> str:
permitted_models = supported_inference_models()
if model not in permitted_models:
model_list = "\n\t".join(permitted_models)
descriptors = [m.descriptor() for m in permitted_models]
repos = [m.huggingface_repo for m in permitted_models]
if model not in (descriptors + repos):
model_list = "\n\t".join(repos)
raise ValueError(
f"Unknown model: `{model}`. Choose from [\n\t{model_list}\n]"
)

View file

@ -4,11 +4,8 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_models.schema_utils import json_schema_type
from llama_stack.providers.utils.bedrock.config import BedrockBaseConfig
@json_schema_type
class BedrockConfig(BedrockBaseConfig):
pass

View file

@ -37,6 +37,18 @@ class InferenceEndpointImplConfig(BaseModel):
description="Your Hugging Face user access token (will default to locally saved token if not provided)",
)
@classmethod
def sample_run_config(
cls,
endpoint_name: str = "${env.INFERENCE_ENDPOINT_NAME}",
api_token: str = "${env.HF_API_TOKEN}",
**kwargs,
):
return {
"endpoint_name": endpoint_name,
"api_token": api_token,
}
@json_schema_type
class InferenceAPIImplConfig(BaseModel):
@ -47,3 +59,15 @@ class InferenceAPIImplConfig(BaseModel):
default=None,
description="Your Hugging Face user access token (will default to locally saved token if not provided)",
)
@classmethod
def sample_run_config(
cls,
repo: str = "${env.INFERENCE_MODEL}",
api_token: str = "${env.HF_API_TOKEN}",
**kwargs,
):
return {
"huggingface_repo": repo,
"api_token": api_token,
}

View file

@ -147,9 +147,7 @@ class ChromaMemoryAdapter(Memory, MemoryBanksProtocolPrivate):
documents: List[MemoryBankDocument],
ttl_seconds: Optional[int] = None,
) -> None:
index = self.cache.get(bank_id, None)
if not index:
raise ValueError(f"Bank {bank_id} not found")
index = await self._get_and_cache_bank_index(bank_id)
await index.insert_documents(documents)
@ -159,8 +157,20 @@ class ChromaMemoryAdapter(Memory, MemoryBanksProtocolPrivate):
query: InterleavedTextMedia,
params: Optional[Dict[str, Any]] = None,
) -> QueryDocumentsResponse:
index = self.cache.get(bank_id, None)
if not index:
raise ValueError(f"Bank {bank_id} not found")
index = await self._get_and_cache_bank_index(bank_id)
return await index.query_documents(query, params)
async def _get_and_cache_bank_index(self, bank_id: str) -> BankWithIndex:
if bank_id in self.cache:
return self.cache[bank_id]
bank = await self.memory_bank_store.get_memory_bank(bank_id)
if not bank:
raise ValueError(f"Bank {bank_id} not found in Llama Stack")
collection = await self.client.get_collection(bank_id)
if not collection:
raise ValueError(f"Bank {bank_id} not found in Chroma")
index = BankWithIndex(bank=bank, index=ChromaIndex(self.client, collection))
self.cache[bank_id] = index
return index

View file

@ -201,10 +201,7 @@ class PGVectorMemoryAdapter(Memory, MemoryBanksProtocolPrivate):
documents: List[MemoryBankDocument],
ttl_seconds: Optional[int] = None,
) -> None:
index = self.cache.get(bank_id, None)
if not index:
raise ValueError(f"Bank {bank_id} not found")
index = await self._get_and_cache_bank_index(bank_id)
await index.insert_documents(documents)
async def query_documents(
@ -213,8 +210,17 @@ class PGVectorMemoryAdapter(Memory, MemoryBanksProtocolPrivate):
query: InterleavedTextMedia,
params: Optional[Dict[str, Any]] = None,
) -> QueryDocumentsResponse:
index = self.cache.get(bank_id, None)
if not index:
raise ValueError(f"Bank {bank_id} not found")
index = await self._get_and_cache_bank_index(bank_id)
return await index.query_documents(query, params)
async def _get_and_cache_bank_index(self, bank_id: str) -> BankWithIndex:
if bank_id in self.cache:
return self.cache[bank_id]
bank = await self.memory_bank_store.get_memory_bank(bank_id)
index = BankWithIndex(
bank=bank,
index=PGVectorIndex(bank, ALL_MINILM_L6_V2_DIMENSION, self.cursor),
)
self.cache[bank_id] = index
return index

View file

@ -11,7 +11,6 @@ import pytest
#
# pytest -v -s llama_stack/providers/tests/inference/test_model_registration.py
# -m "meta_reference"
# --env TOGETHER_API_KEY=<your_api_key>
class TestModelRegistration:

View file

@ -5,11 +5,9 @@
# the root directory of this source tree.
from typing import Optional
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field
@json_schema_type
class BedrockBaseConfig(BaseModel):
aws_access_key_id: Optional[str] = Field(
default=None,
@ -57,3 +55,7 @@ class BedrockBaseConfig(BaseModel):
default=3600,
description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).",
)
@classmethod
def sample_run_config(cls, **kwargs):
return {}

View file

@ -22,9 +22,9 @@ def is_supported_safety_model(model: Model) -> bool:
]
def supported_inference_models() -> List[str]:
def supported_inference_models() -> List[Model]:
return [
m.descriptor()
m
for m in all_registered_models()
if (
m.model_family in {ModelFamily.llama3_1, ModelFamily.llama3_2}

View file

@ -178,7 +178,9 @@ def chat_completion_request_to_messages(
cprint(f"Could not resolve model {llama_model}", color="red")
return request.messages
if model.descriptor() not in supported_inference_models():
allowed_models = supported_inference_models()
descriptors = [m.descriptor() for m in allowed_models]
if model.descriptor() not in descriptors:
cprint(f"Unsupported inference model? {model.descriptor()}", color="red")
return request.messages

View file

@ -50,7 +50,7 @@ def process_template(template_dir: Path, progress) -> None:
template.save_distribution(
yaml_output_dir=REPO_ROOT / "llama_stack" / "templates" / template.name,
doc_output_dir=REPO_ROOT
/ "docs/source/getting_started/distributions"
/ "docs/source/distributions"
/ f"{template.distro_type}_distro",
)
else:
@ -103,7 +103,7 @@ def generate_dependencies_file():
deps_file = REPO_ROOT / "distributions" / "dependencies.json"
with open(deps_file, "w") as f:
json.dump(distribution_deps, f, indent=2)
f.write(json.dumps(distribution_deps, indent=2) + "\n")
def main():

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .bedrock import get_distribution_template # noqa: F401

View file

@ -0,0 +1,38 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::bedrock"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["remote::bedrock"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
return DistributionTemplate(
name="bedrock",
distro_type="self_hosted",
description="Use AWS Bedrock for running LLM inference and safety",
docker_image=None,
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=[],
run_configs={
"run.yaml": RunConfigSettings(),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
},
)

View file

@ -1,9 +1,19 @@
version: '2'
name: bedrock
distribution_spec:
description: Use Amazon Bedrock APIs.
description: Use AWS Bedrock for running LLM inference and safety
docker_image: null
providers:
inference: remote::bedrock
memory: inline::faiss
safety: inline::llama-guard
agents: inline::meta-reference
telemetry: inline::meta-reference
inference:
- remote::bedrock
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety:
- remote::bedrock
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
image_type: conda

View file

@ -0,0 +1,70 @@
# Bedrock Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
{{ providers_table }}
{% if run_config_env_vars %}
### Environment Variables
The following environment variables can be configured:
{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}
{% if default_models %}
### Models
The following models are available by default:
{% for model in default_models %}
- `{{ model.model_id }} ({{ model.provider_model_id }})`
{% endfor %}
{% endif %}
### Prerequisite: API Keys
Make sure you have access to a AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/).
## Running Llama Stack with AWS Bedrock
You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
llamastack/distribution-{{ name }} \
--port $LLAMA_STACK_PORT \
--env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
--env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
--env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```
### Via Conda
```bash
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
--port $LLAMA_STACK_PORT \
--env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
--env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
--env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```

View file

@ -0,0 +1,49 @@
version: '2'
image_name: bedrock
docker_image: null
conda_env: bedrock
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: bedrock
provider_type: remote::bedrock
config: {}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/faiss_store.db
safety:
- provider_id: bedrock
provider_type: remote::bedrock
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/registry.db
models: []
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -1,9 +0,0 @@
name: databricks
distribution_spec:
description: Use Databricks for running LLM inference
providers:
inference: remote::databricks
memory: inline::faiss
safety: inline::llama-guard
agents: meta-reference
telemetry: meta-reference

View file

@ -1,5 +1,12 @@
# Fireworks Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
{{ providers_table }}
@ -43,9 +50,7 @@ LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```
@ -55,6 +60,6 @@ docker run \
```bash
llama stack build --template fireworks --image-type conda
llama stack run ./run.yaml \
--port 5001 \
--port $LLAMA_STACK_PORT \
--env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .hf_endpoint import get_distribution_template # noqa: F401

View file

@ -1,9 +1,19 @@
version: '2'
name: hf-endpoint
distribution_spec:
description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints."
description: Use (an external) Hugging Face Inference Endpoint for running LLM inference
docker_image: null
providers:
inference: remote::hf::endpoint
memory: inline::faiss
safety: inline::llama-guard
agents: inline::meta-reference
telemetry: inline::meta-reference
inference:
- remote::hf::endpoint
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety:
- inline::llama-guard
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
image_type: conda

View file

@ -0,0 +1,97 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::hf::endpoint"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="hf-endpoint",
provider_type="remote::hf::endpoint",
config=InferenceEndpointImplConfig.sample_run_config(),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="hf-endpoint",
)
safety_model = ModelInput(
model_id="${env.SAFETY_MODEL}",
provider_id="hf-endpoint-safety",
)
return DistributionTemplate(
name="hf-endpoint",
distro_type="self_hosted",
description="Use (an external) Hugging Face Inference Endpoint for running LLM inference",
docker_image=None,
template_path=None,
providers=providers,
default_models=[inference_model, safety_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
"run-with-safety.yaml": RunConfigSettings(
provider_overrides={
"inference": [
inference_provider,
Provider(
provider_id="hf-endpoint-safety",
provider_type="remote::hf::endpoint",
config=InferenceEndpointImplConfig.sample_run_config(
endpoint_name="${env.SAFETY_INFERENCE_ENDPOINT_NAME}",
),
),
]
},
default_models=[
inference_model,
safety_model,
],
default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"HF_API_TOKEN": (
"hf_...",
"Hugging Face API token",
),
"INFERENCE_ENDPOINT_NAME": (
"",
"HF Inference endpoint name for the main inference model",
),
"SAFETY_INFERENCE_ENDPOINT_NAME": (
"",
"HF Inference endpoint for the safety model",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model served by the HF Inference Endpoint",
),
"SAFETY_MODEL": (
"meta-llama/Llama-Guard-3-1B",
"Safety model served by the HF Inference Endpoint",
),
},
)

View file

@ -0,0 +1,68 @@
version: '2'
image_name: hf-endpoint
docker_image: null
conda_env: hf-endpoint
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: hf-endpoint
provider_type: remote::hf::endpoint
config:
endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
api_token: ${env.HF_API_TOKEN}
- provider_id: hf-endpoint-safety
provider_type: remote::hf::endpoint
config:
endpoint_name: ${env.SAFETY_INFERENCE_ENDPOINT_NAME}
api_token: ${env.HF_API_TOKEN}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: hf-endpoint
provider_model_id: null
- metadata: {}
model_id: ${env.SAFETY_MODEL}
provider_id: hf-endpoint-safety
provider_model_id: null
shields:
- params: null
shield_id: ${env.SAFETY_MODEL}
provider_id: null
provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -0,0 +1,55 @@
version: '2'
image_name: hf-endpoint
docker_image: null
conda_env: hf-endpoint
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: hf-endpoint
provider_type: remote::hf::endpoint
config:
endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
api_token: ${env.HF_API_TOKEN}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: hf-endpoint
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .hf_serverless import get_distribution_template # noqa: F401

View file

@ -1,9 +1,19 @@
version: '2'
name: hf-serverless
distribution_spec:
description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference."
description: Use (an external) Hugging Face Inference Endpoint for running LLM inference
docker_image: null
providers:
inference: remote::hf::serverless
memory: inline::faiss
safety: inline::llama-guard
agents: inline::meta-reference
telemetry: inline::meta-reference
inference:
- remote::hf::serverless
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety:
- inline::llama-guard
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
image_type: conda

View file

@ -0,0 +1,89 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::hf::serverless"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="hf-serverless",
provider_type="remote::hf::serverless",
config=InferenceAPIImplConfig.sample_run_config(),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="hf-serverless",
)
safety_model = ModelInput(
model_id="${env.SAFETY_MODEL}",
provider_id="hf-serverless-safety",
)
return DistributionTemplate(
name="hf-serverless",
distro_type="self_hosted",
description="Use (an external) Hugging Face Inference Endpoint for running LLM inference",
docker_image=None,
template_path=None,
providers=providers,
default_models=[inference_model, safety_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
"run-with-safety.yaml": RunConfigSettings(
provider_overrides={
"inference": [
inference_provider,
Provider(
provider_id="hf-serverless-safety",
provider_type="remote::hf::serverless",
config=InferenceAPIImplConfig.sample_run_config(
repo="${env.SAFETY_MODEL}",
),
),
]
},
default_models=[
inference_model,
safety_model,
],
default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"HF_API_TOKEN": (
"hf_...",
"Hugging Face API token",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model to be served by the HF Serverless endpoint",
),
"SAFETY_MODEL": (
"meta-llama/Llama-Guard-3-1B",
"Safety model to be served by the HF Serverless endpoint",
),
},
)

View file

@ -0,0 +1,68 @@
version: '2'
image_name: hf-serverless
docker_image: null
conda_env: hf-serverless
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: hf-serverless
provider_type: remote::hf::serverless
config:
huggingface_repo: ${env.INFERENCE_MODEL}
api_token: ${env.HF_API_TOKEN}
- provider_id: hf-serverless-safety
provider_type: remote::hf::serverless
config:
huggingface_repo: ${env.SAFETY_MODEL}
api_token: ${env.HF_API_TOKEN}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: hf-serverless
provider_model_id: null
- metadata: {}
model_id: ${env.SAFETY_MODEL}
provider_id: hf-serverless-safety
provider_model_id: null
shields:
- params: null
shield_id: ${env.SAFETY_MODEL}
provider_id: null
provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -0,0 +1,55 @@
version: '2'
image_name: hf-serverless
docker_image: null
conda_env: hf-serverless
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: hf-serverless
provider_type: remote::hf::serverless
config:
huggingface_repo: ${env.INFERENCE_MODEL}
api_token: ${env.HF_API_TOKEN}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: hf-serverless
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -1,13 +0,0 @@
name: meta-reference-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: inline::meta-reference
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety: inline::llama-guard
agents: inline::meta-reference
telemetry: inline::meta-reference

View file

@ -1,5 +1,12 @@
# Meta Reference Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
{{ providers_table }}
@ -40,9 +47,7 @@ LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
@ -53,9 +58,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run-with-safety.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
@ -66,8 +69,8 @@ docker run \
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template meta-reference-gpu --image-type conda
llama stack run ./run.yaml \
llama stack build --template {{ name }} --image-type conda
llama stack run distributions/{{ name }}/run.yaml \
--port 5001 \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
@ -75,7 +78,7 @@ llama stack run ./run.yaml \
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run ./run-with-safety.yaml \
llama stack run distributions/{{ name }}/run-with-safety.yaml \
--port 5001 \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .meta_reference import get_distribution_template # noqa: F401

View file

@ -1,13 +1,19 @@
version: '2'
name: meta-reference-quantized-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
description: Use Meta Reference with fp8, int4 quantization for running LLM inference
docker_image: null
providers:
inference: meta-reference-quantized
inference:
- inline::meta-reference-quantized
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety: inline::llama-guard
agents: inline::meta-reference
telemetry: inline::meta-reference
safety:
- inline::llama-guard
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
image_type: conda

View file

@ -0,0 +1,87 @@
# Meta Reference Quantized Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
{{ providers_table }}
The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc.
Note that you need access to nvidia GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.
{% if run_config_env_vars %}
### Environment Variables
The following environment variables can be configured:
{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}
## Prerequisite: Downloading Models
Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
```
$ ls ~/.llama/checkpoints
Llama3.1-8B Llama3.2-11B-Vision-Instruct Llama3.2-1B-Instruct Llama3.2-90B-Vision-Instruct Llama-Guard-3-8B
Llama3.1-8B-Instruct Llama3.2-1B Llama3.2-3B-Instruct Llama-Guard-3-1B Prompt-Guard-86M
```
## Running the Distribution
You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
llamastack/distribution-{{ name }} \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
llamastack/distribution-{{ name }} \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
```
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template {{ name }} --image-type conda
llama stack run distributions/{{ name }}/run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run distributions/{{ name }}/run-with-safety.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
```

View file

@ -0,0 +1,67 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from llama_stack.distribution.datatypes import ModelInput, Provider
from llama_stack.providers.inline.inference.meta_reference import (
MetaReferenceQuantizedInferenceConfig,
)
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["inline::meta-reference-quantized"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="meta-reference-inference",
provider_type="inline::meta-reference-quantized",
config=MetaReferenceQuantizedInferenceConfig.sample_run_config(
model="${env.INFERENCE_MODEL}",
checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}",
),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="meta-reference-inference",
)
return DistributionTemplate(
name="meta-reference-quantized-gpu",
distro_type="self_hosted",
description="Use Meta Reference with fp8, int4 quantization for running LLM inference",
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=[inference_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model loaded into the Meta Reference server",
),
"INFERENCE_CHECKPOINT_DIR": (
"null",
"Directory containing the Meta Reference model checkpoint",
),
},
)

View file

@ -0,0 +1,58 @@
version: '2'
image_name: meta-reference-quantized-gpu
docker_image: null
conda_env: meta-reference-quantized-gpu
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: meta-reference-inference
provider_type: inline::meta-reference-quantized
config:
model: ${env.INFERENCE_MODEL}
max_seq_len: 4096
checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
quantization:
type: fp8
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: meta-reference-inference
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -1,5 +1,12 @@
# Ollama Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
{{ providers_table }}
@ -55,9 +62,7 @@ docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434
@ -86,7 +91,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
```bash
export LLAMA_STACK_PORT=5001
llama stack build --template ollama --image-type conda
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \

View file

@ -1,4 +1,10 @@
# Remote vLLM Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:

View file

@ -27,7 +27,7 @@ from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
class RunConfigSettings(BaseModel):
provider_overrides: Dict[str, List[Provider]] = Field(default_factory=dict)
default_models: List[ModelInput]
default_models: Optional[List[ModelInput]] = None
default_shields: Optional[List[ShieldInput]] = None
def run_config(
@ -87,7 +87,7 @@ class RunConfigSettings(BaseModel):
__distro_dir__=f"distributions/{name}",
db_name="registry.db",
),
models=self.default_models,
models=self.default_models or [],
shields=self.default_shields or [],
)
@ -104,7 +104,7 @@ class DistributionTemplate(BaseModel):
providers: Dict[str, List[str]]
run_configs: Dict[str, RunConfigSettings]
template_path: Path
template_path: Optional[Path] = None
# Optional configuration
run_config_env_vars: Optional[Dict[str, Tuple[str, str]]] = None
@ -159,6 +159,7 @@ class DistributionTemplate(BaseModel):
with open(yaml_output_dir / yaml_pth, "w") as f:
yaml.safe_dump(run_config.model_dump(), f, sort_keys=False)
docs = self.generate_markdown_docs()
with open(doc_output_dir / f"{self.name}.md", "w") as f:
f.write(docs)
if self.template_path:
docs = self.generate_markdown_docs()
with open(doc_output_dir / f"{self.name}.md", "w") as f:
f.write(docs if docs.endswith("\n") else docs + "\n")

View file

@ -1,5 +1,12 @@
# TGI Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
{{ providers_table }}
@ -71,9 +78,7 @@ LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://host.docker.internal:$INFERENCE_PORT
@ -102,18 +107,18 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
```bash
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml
--port 5001
--env INFERENCE_MODEL=$INFERENCE_MODEL
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run ./run-with-safety.yaml
--port 5001
--env INFERENCE_MODEL=$INFERENCE_MODEL
--env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
--env SAFETY_MODEL=$SAFETY_MODEL
llama stack run ./run-with-safety.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
```

View file

@ -1,4 +1,11 @@
# Fireworks Distribution
# Together Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
@ -43,9 +50,7 @@ LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env TOGETHER_API_KEY=$TOGETHER_API_KEY
```
@ -53,8 +58,8 @@ docker run \
### Via Conda
```bash
llama stack build --template together --image-type conda
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
--port 5001 \
--port $LLAMA_STACK_PORT \
--env TOGETHER_API_KEY=$TOGETHER_API_KEY
```

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .vllm import get_distribution_template # noqa: F401

View file

@ -0,0 +1,19 @@
version: '2'
name: vllm-gpu
distribution_spec:
description: Use a built-in vLLM engine for running LLM inference
docker_image: null
providers:
inference:
- inline::vllm
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety:
- inline::llama-guard
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
image_type: conda

View file

@ -0,0 +1,58 @@
version: '2'
image_name: vllm-gpu
docker_image: null
conda_env: vllm-gpu
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: vllm
provider_type: inline::vllm
config:
model: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:1}
max_tokens: ${env.MAX_TOKENS:4096}
enforce_eager: ${env.ENFORCE_EAGER:False}
gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -0,0 +1,74 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.distribution.datatypes import ModelInput, Provider
from llama_stack.providers.inline.inference.vllm import VLLMConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["inline::vllm"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="vllm",
provider_type="inline::vllm",
config=VLLMConfig.sample_run_config(),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="vllm",
)
return DistributionTemplate(
name="vllm-gpu",
distro_type="self_hosted",
description="Use a built-in vLLM engine for running LLM inference",
docker_image=None,
template_path=None,
providers=providers,
default_models=[inference_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model loaded into the vLLM engine",
),
"TENSOR_PARALLEL_SIZE": (
"1",
"Number of tensor parallel replicas (number of GPUs to use).",
),
"MAX_TOKENS": (
"4096",
"Maximum number of tokens to generate.",
),
"ENFORCE_EAGER": (
"False",
"Whether to use eager mode for inference (otherwise cuda graphs are used).",
),
"GPU_MEMORY_UTILIZATION": (
"0.7",
"GPU memory utilization for the vLLM engine.",
),
},
)

View file

@ -1,13 +1,5 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "5af4f44e",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/zero_to_hero_guide/00_Inference101.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"id": "c1e7571c",

View file

@ -1,13 +1,5 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "785bd3ff",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/zero_to_hero_guide/01_Local_Cloud_Inference101.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"id": "a0ed972d",

View file

@ -1,13 +1,5 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "d2bf5275",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/zero_to_hero_guide/02_Prompt_Engineering101.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"id": "cd96f85a",

View file

@ -1,13 +1,5 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6323a6be",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/zero_to_hero_guide/03_Image_Chat101.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"id": "923343b0-d4bd-4361-b8d4-dd29f86a0fbd",

View file

@ -1,12 +1,5 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/zero_to_hero_guide/04_Tool_Calling101.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},

View file

@ -1,12 +1,5 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/zero_to_hero_guide/05_Memory101.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},

View file

@ -1,12 +1,5 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/zero_to_hero_guide/06_Safety101.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -18,7 +11,7 @@
"As outlined in our [Responsible Use Guide](https://www.llama.com/docs/how-to-guides/responsible-use-guide-resources/), LLM apps should deploy appropriate system level safeguards to mitigate safety and security risks of LLM system, similar to the following diagram:\n",
"\n",
"<div>\n",
"<img src=\"../_static/safety_system.webp\" alt=\"Figure 1: Safety System\" width=\"1000\"/>\n",
"<img src=\"/docs/_static/safety_system.webp\" alt=\"Figure 1: Safety System\" width=\"1000\"/>\n",
"</div>\n",
"To that goal, Llama Stack uses **Prompt Guard** and **Llama Guard 3** to secure our system. Here are the quick introduction about them.\n"
]

View file

@ -1,12 +1,5 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/zero_to_hero_guide/07_Agents101.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},