Merge branch 'meta-llama:main' into qdrant

Anush 2024-10-22 21:45:31 +05:30 committed by GitHub
commit 1575578446
101 changed files with 3310 additions and 722 deletions


@@ -1,6 +1,9 @@
 name: Pre-commit
-on: [pull_request]
+on:
+  pull_request:
+  push:
+    branches: [main]
 jobs:
   pre-commit:
@@ -19,27 +22,4 @@ jobs:
             **/requirements*.txt
             .pre-commit-config.yaml
-      - name: Install pre-commit
-        run: |
-          python -m pip install --upgrade pip
-          pip install pre-commit
-      - name: Fetch base branch
-        run: git fetch origin ${{ github.event.pull_request.base.ref }}:refs/remotes/origin/${{ github.event.pull_request.base.ref }}
-      - name: Fetch head commit from PR
-        run: git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }}
-      - name: Get changed files
-        id: changed-files
-        run: |
-          git diff --name-only origin/${{ github.event.pull_request.base.ref }} pr-${{ github.event.pull_request.number }} > changed_files.txt
-          cat changed_files.txt
-      - name: Run pre-commit
-        run: |
-          if [ -s changed_files.txt ]; then
-            pre-commit run --files $(cat changed_files.txt | tr '\n' ' ')
-          else
-            echo "No changed files to run pre-commit on."
-          fi
+      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd #v3.0.1
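For orientation, the resulting workflow after this change would look roughly like the sketch below. Only the cache paths and the pinned `pre-commit/action` step are visible in this diff; the runner, checkout, and Python setup steps are assumptions carried over from the unchanged context.

```yaml
name: Pre-commit
on:
  pull_request:
  push:
    branches: [main]

jobs:
  pre-commit:
    runs-on: ubuntu-latest              # assumed runner
    steps:
      - uses: actions/checkout@v4       # assumed checkout step
      - uses: actions/setup-python@v5   # assumed Python setup with pip caching
        with:
          cache: pip
          cache-dependency-path: |
            **/requirements*.txt
            .pre-commit-config.yaml
      # Pinned third-party action that installs pre-commit and runs all configured hooks
      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd #v3.0.1
```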

.gitignore (1 line changed)

@@ -15,3 +15,4 @@ Package.resolved
 *.ipynb_checkpoints*
 .venv/
 .idea
+_build

.readthedocs.yaml (new file, 32 lines)

@@ -0,0 +1,32 @@
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Set the OS, Python version and other tools you might need
build:
  os: ubuntu-22.04
  tools:
    python: "3.12"
    # You can also specify other tool versions:
    # nodejs: "19"
    # rust: "1.64"
    # golang: "1.19"

# Build documentation in the "docs/" directory with Sphinx
sphinx:
  configuration: docs/source/conf.py

# Optionally build your docs in additional formats such as PDF and ePub
# formats:
#   - pdf
#   - epub

# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
  install:
    - requirements: docs/requirements.txt


@@ -92,9 +92,16 @@ The `llama` CLI makes it easy to work with the Llama Stack set of tools. Please
 * [CLI reference](docs/cli_reference.md)
   * Guide using `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
 * [Getting Started](docs/getting_started.md)
-  * Guide to build and run a Llama Stack server.
+  * Quick guide to start a Llama Stack server.
+  * [Jupyter notebook](./docs/getting_started.ipynb) to walk-through how to use simple text and vision inference llama_stack_client APIs
+* [Building a Llama Stack Distribution](docs/building_distro.md)
+  * Guide to build a Llama Stack distribution
+* [Distributions](./distributions/)
+  * References to start Llama Stack distributions backed with different API providers.
+* [Developer Cookbook](./docs/developer_cookbook.md)
+  * References to guides to help you get started based on your developer needs.
 * [Contributing](CONTRIBUTING.md)
+  * [Adding a new API Provider](./docs/new_api_provider.md) to walk-through how to add a new API provider.
 ## Llama Stack Client SDK
@@ -106,3 +113,5 @@ The `llama` CLI makes it easy to work with the Llama Stack set of tools. Please
 | Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) |
 Check out our client SDKs for connecting to Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [node](https://github.com/meta-llama/llama-stack-client-node), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.
+You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.

distributions/README.md (new file, 13 lines)

@@ -0,0 +1,13 @@
# Llama Stack Distribution

A Distribution is where APIs and Providers are assembled together to provide a consistent whole to the end application developer. You can mix-and-match providers -- some could be backed by local code and some could be remote. As a hobbyist, you can serve a small model locally, but can choose a cloud provider for a large model. Regardless, the higher-level APIs your app needs to work with don't need to change at all. You can even imagine moving across the server / mobile-device boundary as well, always using the same uniform set of APIs for developing Generative AI applications.

## Quick Start Llama Stack Distributions Guide
| **Distribution** | **Llama Stack Docker** | Start This Distribution | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|:----------------: |:------------------------------------------: |:-----------------------: |:------------------: |:------------------: |:------------------: |:------------------: |:------------------: |
| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](./meta-reference-gpu/) | meta-reference | meta-reference | meta-reference; remote::pgvector; remote::chromadb | meta-reference | meta-reference |
| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](./ollama/) | remote::ollama | meta-reference | remote::pgvector; remote::chromadb | remote::ollama | meta-reference |
| TGI | [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | [Guide](./tgi/) | remote::tgi | meta-reference | meta-reference; remote::pgvector; remote::chromadb | meta-reference | meta-reference |
| Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](./together/) | remote::together | meta-reference | remote::weaviate | meta-reference | meta-reference |
| Fireworks | [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general) | [Guide](./fireworks/) | remote::fireworks | meta-reference | remote::weaviate | meta-reference | meta-reference |


@@ -1,4 +1,4 @@
-name: local-bedrock-conda-example
+name: bedrock
 distribution_spec:
   description: Use Amazon Bedrock APIs.
   providers:


@@ -1,4 +1,4 @@
-name: local-databricks
+name: databricks
 distribution_spec:
   description: Use Databricks for running LLM inference
   providers:
@@ -7,4 +7,4 @@ distribution_spec:
     safety: meta-reference
     agents: meta-reference
     telemetry: meta-reference
 image_type: conda


@@ -0,0 +1,55 @@
# Fireworks Distribution

The `llamastack/distribution-fireworks` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | remote::fireworks | meta-reference | meta-reference | meta-reference | meta-reference |
### Start the Distribution (Single Node CPU)
> [!NOTE]
> This assumes you have a hosted endpoint at Fireworks with an API key.
```
$ cd llama-stack/distribution/fireworks
$ ls
compose.yaml run.yaml
$ docker compose up
```
Make sure the inference provider in your `run.yaml` file points to the correct Fireworks server endpoint, e.g.
```
inference:
  - provider_id: fireworks
    provider_type: remote::fireworks
    config:
      url: https://api.fireworks.ai/inference
      api_key: <optional api key>
```
### (Alternative) Fireworks + llama stack run (Single Node GPU)
```
docker run --network host -it -p 5000:5000 -v ./run.yaml:/root/my-run.yaml --gpus=all llamastack/distribution-fireworks --yaml_config /root/my-run.yaml
```
Make sure the inference provider in your `run.yaml` file points to the correct Fireworks server endpoint, e.g.
```
inference:
  - provider_id: fireworks
    provider_type: remote::fireworks
    config:
      url: https://api.fireworks.ai/inference
      api_key: <optional api key>
```
**Via Conda**
```bash
llama stack build --config ./build.yaml
# -- modify run.yaml to a valid Fireworks server endpoint
llama stack run ./run.yaml
```


@@ -1,4 +1,4 @@
-name: local-fireworks
+name: fireworks
 distribution_spec:
   description: Use Fireworks.ai for running LLM inference
   providers:
@@ -7,4 +7,4 @@ distribution_spec:
     safety: meta-reference
     agents: meta-reference
     telemetry: meta-reference
-image_type: conda
+image_type: docker


@@ -0,0 +1,18 @@
services:
  llamastack:
    image: llamastack/distribution-fireworks
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to the fireworks run.yaml file
      - ./run.yaml:/root/llamastack-run-fireworks.yaml
    ports:
      - "5000:5000"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-fireworks.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s


@@ -0,0 +1,46 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: fireworks0
    provider_type: remote::fireworks
    config:
      url: https://api.fireworks.ai/inference
  safety:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      llama_guard_shield:
        model: Llama-Guard-3-1B
        excluded_categories: []
        disable_input_check: false
        disable_output_check: false
      prompt_guard_shield:
        model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}
  agents:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}


@@ -1,4 +1,4 @@
-name: local-hf-endpoint
+name: hf-endpoint
 distribution_spec:
   description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints."
   providers:


@@ -1,4 +1,4 @@
-name: local-hf-serverless
+name: hf-serverless
 distribution_spec:
   description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference."
   providers:


@@ -0,0 +1,28 @@
# Meta Reference Distribution
The `llamastack/distribution-meta-reference-gpu` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | meta-reference | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference |
### Start the Distribution (Single Node GPU)
> [!NOTE]
> This assumes you have access to a GPU to start a local server.
> [!NOTE]
> `~/.llama` should be the path containing downloaded weights of Llama models.
To download and start running a pre-built docker container, you may use the following commands:
```
docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-gpu --yaml_config /root/my-run.yaml
```
### Alternative (Build and start distribution locally via conda)
- You may check out the [Getting Started](../../docs/getting_started.md) guide for more details on building locally via conda and starting up a meta-reference distribution.


@@ -1,10 +1,13 @@
-name: local
+name: meta-reference-gpu
 distribution_spec:
   description: Use code from `llama_stack` itself to serve all llama stack APIs
   providers:
     inference: meta-reference
-    memory: meta-reference
+    memory:
+    - meta-reference
+    - remote::chromadb
+    - remote::pgvector
     safety: meta-reference
     agents: meta-reference
     telemetry: meta-reference
-image_type: conda
+image_type: docker


@@ -1,19 +1,19 @@
 version: '2'
-built_at: '2024-10-08T17:42:33.690666'
+built_at: '2024-10-08T17:40:45.325529'
-image_name: local-gpu
+image_name: local
-docker_image: local-gpu
+docker_image: null
-conda_env: null
+conda_env: local
 apis:
-- memory
-- inference
-- agents
 - shields
-- safety
+- agents
 - models
+- memory
 - memory_banks
+- inference
+- safety
 providers:
   inference:
-  - provider_id: meta-reference
+  - provider_id: meta0
     provider_type: meta-reference
     config:
       model: Llama3.1-8B-Instruct
@@ -22,17 +22,22 @@ providers:
       max_seq_len: 4096
      max_batch_size: 1
   safety:
-  - provider_id: meta-reference
+  - provider_id: meta0
     provider_type: meta-reference
     config:
-      llama_guard_shield: null
-      prompt_guard_shield: null
+      llama_guard_shield:
+        model: Llama-Guard-3-1B
+        excluded_categories: []
+        disable_input_check: false
+        disable_output_check: false
+      prompt_guard_shield:
+        model: Prompt-Guard-86M
   memory:
-  - provider_id: meta-reference
+  - provider_id: meta0
     provider_type: meta-reference
     config: {}
   agents:
-  - provider_id: meta-reference
+  - provider_id: meta0
     provider_type: meta-reference
     config:
       persistence_store:
@@ -40,6 +45,6 @@ providers:
         type: sqlite
         db_path: ~/.llama/runtime/kvstore.db
   telemetry:
-  - provider_id: meta-reference
+  - provider_id: meta0
     provider_type: meta-reference
     config: {}


@@ -0,0 +1,91 @@
# Ollama Distribution
The `llamastack/distribution-ollama` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |---------------- |---------------- |---------------------------------- |---------------- |---------------- |
| **Provider(s)** | remote::ollama | meta-reference | remote::pgvector, remote::chroma | remote::ollama | meta-reference |
### Start a Distribution (Single Node GPU)
> [!NOTE]
> This assumes you have access to a GPU to start an Ollama server.
```
$ cd llama-stack/distribution/ollama/gpu
$ ls
compose.yaml run.yaml
$ docker compose up
```
You will see output similar to the following:
```
[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps"
[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 | 60.908µs | ::1 | GET "/api/ps"
INFO: Started server process [1]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
[llamastack] | Resolved 12 providers
[llamastack] | inner-inference => ollama0
[llamastack] | models => __routing_table__
[llamastack] | inference => __autorouted__
```
To kill the server
```
docker compose down
```
### Start the Distribution (Single Node CPU)
> [!NOTE]
> This will start an Ollama server with CPU only. Please see the [Ollama documentation](https://github.com/ollama/ollama) for serving models on CPU only.
```
$ cd llama-stack/distribution/ollama/cpu
$ ls
compose.yaml run.yaml
$ docker compose up
```
### (Alternative) ollama run + llama stack run
If you wish to separately spin up an Ollama server and connect it to Llama Stack, you may use the following commands.
#### Start Ollama server
- Please check the [Ollama documentation](https://github.com/ollama/ollama) for more details.
**Via Docker**
```
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
```
**Via CLI**
```
ollama run <model_id>
```
#### Start Llama Stack server pointing to Ollama server
**Via Docker**
```
docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./gpu/run.yaml:/root/llamastack-run-ollama.yaml --gpus=all distribution-ollama --yaml_config /root/llamastack-run-ollama.yaml
```
Make sure the inference provider in your `run.yaml` file points to the correct Ollama endpoint, e.g.
```
inference:
  - provider_id: ollama0
    provider_type: remote::ollama
    config:
      url: http://127.0.0.1:14343
```
**Via Conda**
```
llama stack build --config ./build.yaml
llama stack run ./gpu/run.yaml
```


@@ -0,0 +1,13 @@
name: ollama
distribution_spec:
  description: Use ollama for running LLM inference
  providers:
    inference: remote::ollama
    memory:
    - meta-reference
    - remote::chromadb
    - remote::pgvector
    safety: meta-reference
    agents: meta-reference
    telemetry: meta-reference
image_type: docker


@@ -0,0 +1,30 @@
services:
  ollama:
    image: ollama/ollama:latest
    network_mode: "host"
    volumes:
      - ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast
    ports:
      - "11434:11434"
    command: []
  llamastack:
    depends_on:
      - ollama
    image: llamastack/llamastack-local-cpu
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to ollama run.yaml file
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "5000:5000"
    # Hack: wait for ollama server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
volumes:
  ollama:


@@ -1,35 +1,39 @@
 version: '2'
-built_at: '2024-10-08T17:42:07.505267'
+built_at: '2024-10-08T17:40:45.325529'
-image_name: local-cpu
+image_name: local
-docker_image: local-cpu
+docker_image: null
-conda_env: null
+conda_env: local
 apis:
+- shields
 - agents
-- inference
 - models
 - memory
-- safety
-- shields
 - memory_banks
+- inference
+- safety
 providers:
   inference:
-  - provider_id: remote::ollama
+  - provider_id: ollama0
     provider_type: remote::ollama
     config:
-      host: localhost
-      port: 6000
+      url: http://127.0.0.1:14343
   safety:
-  - provider_id: meta-reference
+  - provider_id: meta0
     provider_type: meta-reference
     config:
-      llama_guard_shield: null
-      prompt_guard_shield: null
+      llama_guard_shield:
+        model: Llama-Guard-3-1B
+        excluded_categories: []
+        disable_input_check: false
+        disable_output_check: false
+      prompt_guard_shield:
+        model: Prompt-Guard-86M
   memory:
-  - provider_id: meta-reference
+  - provider_id: meta0
     provider_type: meta-reference
     config: {}
   agents:
-  - provider_id: meta-reference
+  - provider_id: meta0
     provider_type: meta-reference
     config:
       persistence_store:
@@ -37,6 +41,6 @@ providers:
         type: sqlite
         db_path: ~/.llama/runtime/kvstore.db
   telemetry:
-  - provider_id: meta-reference
+  - provider_id: meta0
     provider_type: meta-reference
     config: {}


@@ -0,0 +1,48 @@
services:
  ollama:
    image: ollama/ollama:latest
    network_mode: "host"
    volumes:
      - ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast
    ports:
      - "11434:11434"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia
  llamastack-local-cpu:
    depends_on:
      - ollama
    image: llamastack/llamastack-local-cpu
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to ollama run.yaml file
      - ./run.yaml:/root/llamastack-run-ollama.yaml
    ports:
      - "5000:5000"
    # Hack: wait for ollama server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-ollama.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
volumes:
  ollama:


@@ -0,0 +1,46 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: ollama0
    provider_type: remote::ollama
    config:
      url: http://127.0.0.1:14343
  safety:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      llama_guard_shield:
        model: Llama-Guard-3-1B
        excluded_categories: []
        disable_input_check: false
        disable_output_check: false
      prompt_guard_shield:
        model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}
  agents:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}


@@ -0,0 +1,94 @@
# TGI Distribution
The `llamastack/distribution-tgi` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | remote::tgi | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference |
### Start the Distribution (Single Node GPU)
> [!NOTE]
> This assumes you have access to a GPU to start a TGI server.
```
$ cd llama_stack/distribution/docker/tgi
$ ls
compose.yaml tgi-run.yaml
$ docker compose up
```
The script will first start up the TGI server, then start up the Llama Stack distribution server, hooking up to the remote TGI provider for inference. You should see output like the following:
```
[text-generation-inference] | 2024-10-15T18:56:33.810397Z INFO text_generation_router::server: router/src/server.rs:1813: Using config Some(Llama)
[text-generation-inference] | 2024-10-15T18:56:33.810448Z WARN text_generation_router::server: router/src/server.rs:1960: Invalid hostname, defaulting to 0.0.0.0
[text-generation-inference] | 2024-10-15T18:56:33.864143Z INFO text_generation_router::server: router/src/server.rs:2353: Connected
INFO: Started server process [1]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
```
To kill the server
```
docker compose down
```
### Start the Distribution (Single Node CPU)
> [!NOTE]
> This assumes you have a hosted endpoint compatible with the TGI server.
```
$ cd llama-stack/distribution/tgi/cpu
$ ls
compose.yaml run.yaml
$ docker compose up
```
Replace `<ENTER_YOUR_TGI_HOSTED_ENDPOINT>` in the `run.yaml` file with your TGI endpoint.
```
inference:
  - provider_id: tgi0
    provider_type: remote::tgi
    config:
      url: <ENTER_YOUR_TGI_HOSTED_ENDPOINT>
```
### (Alternative) TGI server + llama stack run (Single Node GPU)
If you wish to separately spin up a TGI server and connect it to Llama Stack, you may use the following commands.
#### (optional) Start TGI server locally
- Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint.
```
docker run --rm -it -v $HOME/.cache/huggingface:/data -p 5009:5009 --gpus all ghcr.io/huggingface/text-generation-inference:latest --dtype bfloat16 --usage-stats on --sharded false --model-id meta-llama/Llama-3.1-8B-Instruct --port 5009
```
#### Start Llama Stack server pointing to TGI server
```
docker run --network host -it -p 5000:5000 -v ./run.yaml:/root/my-run.yaml --gpus=all llamastack-local-cpu --yaml_config /root/my-run.yaml
```
Make sure the inference provider in your `run.yaml` file points to the correct TGI server endpoint, e.g.
```
inference:
  - provider_id: tgi0
    provider_type: remote::tgi
    config:
      url: http://127.0.0.1:5009
```
**Via Conda**
```bash
llama stack build --config ./build.yaml
# -- start a TGI server endpoint
llama stack run ./gpu/run.yaml
```


@@ -0,0 +1,13 @@
name: tgi
distribution_spec:
  description: Use TGI for running LLM inference
  providers:
    inference: remote::tgi
    memory:
    - meta-reference
    - remote::chromadb
    - remote::pgvector
    safety: meta-reference
    agents: meta-reference
    telemetry: meta-reference
image_type: docker


@@ -0,0 +1,33 @@
services:
  text-generation-inference:
    image: ghcr.io/huggingface/text-generation-inference:latest
    network_mode: "host"
    volumes:
      - $HOME/.cache/huggingface:/data
    ports:
      - "5009:5009"
    command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
    runtime: nvidia
    healthcheck:
      test: ["CMD", "curl", "-f", "http://text-generation-inference:5009/health"]
      interval: 5s
      timeout: 5s
      retries: 30
  llamastack:
    depends_on:
      text-generation-inference:
        condition: service_healthy
    image: llamastack/llamastack-local-cpu
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to run.yaml file
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "5000:5000"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    restart_policy:
      condition: on-failure
      delay: 3s
      max_attempts: 5
      window: 60s


@@ -0,0 +1,46 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: tgi0
    provider_type: remote::tgi
    config:
      url: <ENTER_YOUR_TGI_HOSTED_ENDPOINT>
  safety:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      llama_guard_shield:
        model: Llama-Guard-3-1B
        excluded_categories: []
        disable_input_check: false
        disable_output_check: false
      prompt_guard_shield:
        model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}
  agents:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}


@@ -0,0 +1,55 @@
services:
  text-generation-inference:
    image: ghcr.io/huggingface/text-generation-inference:latest
    network_mode: "host"
    volumes:
      - $HOME/.cache/huggingface:/data
    ports:
      - "5009:5009"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia
    healthcheck:
      test: ["CMD", "curl", "-f", "http://text-generation-inference:5009/health"]
      interval: 5s
      timeout: 5s
      retries: 30
  llamastack:
    depends_on:
      text-generation-inference:
        condition: service_healthy
    image: llamastack/llamastack-local-cpu
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to TGI run.yaml file
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "5000:5000"
    # Hack: wait for TGI server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    restart_policy:
      condition: on-failure
      delay: 3s
      max_attempts: 5
      window: 60s


@@ -0,0 +1,46 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: tgi0
    provider_type: remote::tgi
    config:
      url: http://127.0.0.1:5009
  safety:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      llama_guard_shield:
        model: Llama-Guard-3-1B
        excluded_categories: []
        disable_input_check: false
        disable_output_check: false
      prompt_guard_shield:
        model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}
  agents:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}


@@ -0,0 +1,68 @@
# Together Distribution
### Connect to a Llama Stack Together Endpoint
- You may connect to a hosted endpoint `https://llama-stack.together.ai`, serving a Llama Stack distribution
The `llamastack/distribution-together` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | remote::together | meta-reference | remote::weaviate | meta-reference | meta-reference |
### Start the Distribution (Single Node CPU)
> [!NOTE]
> This assumes you have a hosted endpoint at Together with an API key.
```
$ cd llama-stack/distribution/together
$ ls
compose.yaml run.yaml
$ docker compose up
```
Make sure the inference provider in your `run.yaml` file points to the correct Together server endpoint, e.g.
```
inference:
  - provider_id: together
    provider_type: remote::together
    config:
      url: https://api.together.xyz/v1
      api_key: <optional api key>
```
### (Alternative) Together + llama stack run (Single Node GPU)
```
docker run --network host -it -p 5000:5000 -v ./run.yaml:/root/my-run.yaml --gpus=all llamastack/distribution-together --yaml_config /root/my-run.yaml
```
Make sure the inference provider in your `run.yaml` file points to the correct Together server endpoint, e.g.
```
inference:
  - provider_id: together
    provider_type: remote::together
    config:
      url: https://api.together.xyz/v1
      api_key: <optional api key>
```
The Together distribution comes with Weaviate as the Memory provider. You also need to configure the remote Weaviate API key and URL in `run.yaml` to get the Memory API working.
```
memory:
  - provider_id: meta0
    provider_type: remote::weaviate
    config:
      weaviate_api_key: <ENTER_WEAVIATE_API_KEY>
      weaviate_cluster_url: <ENTER_WEAVIATE_CLUSTER_URL>
```
**Via Conda**
```bash
llama stack build --config ./build.yaml
# -- modify run.yaml to a valid Together server endpoint
llama stack run ./run.yaml
```


@@ -1,10 +1,10 @@
-name: local-together
+name: together
 distribution_spec:
   description: Use Together.ai for running LLM inference
   providers:
     inference: remote::together
-    memory: meta-reference
+    memory: remote::weaviate
     safety: remote::together
     agents: meta-reference
     telemetry: meta-reference
-image_type: conda
+image_type: docker


@@ -0,0 +1,18 @@
services:
  llamastack:
    image: llamastack/distribution-together
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to the together run.yaml file
      - ./run.yaml:/root/llamastack-run-together.yaml
    ports:
      - "5000:5000"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-together.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s


@@ -0,0 +1,42 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: together0
    provider_type: remote::together
    config:
      url: https://api.together.xyz/v1
  safety:
  - provider_id: together0
    provider_type: remote::together
    config:
      url: https://api.together.xyz/v1
  memory:
  - provider_id: meta0
    provider_type: remote::weaviate
    config:
      weaviate_api_key: <ENTER_WEAVIATE_API_KEY>
      weaviate_cluster_url: <ENTER_WEAVIATE_CLUSTER_URL>
  agents:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}


@@ -1,4 +1,4 @@
-name: local-vllm
+name: vllm
 distribution_spec:
   description: Like local, but use vLLM for running LLM inference
   providers:
@@ -7,4 +7,4 @@ distribution_spec:
     safety: meta-reference
     agents: meta-reference
     telemetry: meta-reference
 image_type: conda

docs/Makefile (new file, 20 lines)

@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
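For reference, a typical local docs build with this Makefile would look like the sketch below; it assumes Sphinx and the packages listed in `docs/requirements.txt` (the file referenced by `.readthedocs.yaml` above) can be installed in your environment.

```bash
# install the documentation build dependencies (path taken from .readthedocs.yaml)
pip install -r docs/requirements.txt

# any unknown target (e.g. "html") is routed to sphinx-build via the catch-all rule,
# writing the rendered site under docs/_build/html
cd docs && make html
```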

docs/_static/llama-stack-logo.png (new binary file, 70 KiB)

docs/_static/llama-stack.png (new binary file, 71 KiB)

docs/building_distro.md (new file, 270 lines)

@@ -0,0 +1,270 @@
# Building a Llama Stack Distribution

This guide will walk you through the steps to get started with building a Llama Stack distribution from scratch with your choice of API providers. Please see the [Getting Started Guide](./getting_started.md) if you just want the basic steps to start a Llama Stack distribution.
## Step 1. Build
In the following steps, imagine we'll be working with a `Meta-Llama3.1-8B-Instruct` model. We will name our build `8b-instruct` to help us remember the config. We will start building our distribution (in the form of a Conda environment or Docker image). In this step, we will specify:
- `name`: the name for our distribution (e.g. `8b-instruct`)
- `image_type`: our build image type (`conda | docker`)
- `distribution_spec`: our distribution specs for specifying API providers
  - `description`: a short description of the configurations for the distribution
  - `providers`: specifies the underlying implementation for serving each API endpoint
  - `image_type`: `conda` | `docker` to specify whether to build the distribution in the form of a Docker image or a Conda environment.

After this step is complete, a file named `<name>-build.yaml` storing the build configurations will be generated and saved at the output file path printed at the end of the command.
#### Building from scratch
- For a new user, you can start by running `llama stack build`, which launches an interactive wizard where you will be prompted to enter the build configurations.
```
llama stack build
```
Running the command above lets you fill in the configuration to build your Llama Stack distribution; you will see output like the following.
```
> Enter an unique name for identifying your Llama Stack build distribution (e.g. my-local-stack): 8b-instruct
> Enter the image type you want your distribution to be built with (docker or conda): conda
Llama Stack is composed of several APIs working together. Let's configure the providers (implementations) you want to use for these APIs.
> Enter the API provider for the inference API: (default=meta-reference): meta-reference
> Enter the API provider for the safety API: (default=meta-reference): meta-reference
> Enter the API provider for the agents API: (default=meta-reference): meta-reference
> Enter the API provider for the memory API: (default=meta-reference): meta-reference
> Enter the API provider for the telemetry API: (default=meta-reference): meta-reference
> (Optional) Enter a short description for your Llama Stack distribution:
Build spec configuration saved at ~/.conda/envs/llamastack-my-local-llama-stack/8b-instruct-build.yaml
```
**Ollama (optional)**
If you plan to use Ollama for inference, you'll need to install the server [via these instructions](https://ollama.com/download).
#### Building from templates
- To build from alternative API providers, we provide distribution templates for users to get started building a distribution backed by different providers.
The following command will allow you to see the available templates and their corresponding providers.
```
llama stack build --list-templates
```
![alt text](resources/list-templates.png)
You may then pick a template to build your distribution with providers fitted to your liking.
```
llama stack build --template local-tgi --name my-tgi-stack
```
```
$ llama stack build --template local-tgi --name my-tgi-stack
...
...
Build spec configuration saved at ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml
You may now run `llama stack configure my-tgi-stack` or `llama stack configure ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml`
```
#### Building from config file
- In addition to templates, you may customize the build to your liking by editing config files and building from a config file with the following command.
- The config file will have contents like the ones in `llama_stack/distributions/templates/`.
```
$ cat llama_stack/distribution/templates/local-ollama-build.yaml
name: local-ollama
distribution_spec:
  description: Like local, but use ollama for running LLM inference
  providers:
    inference: remote::ollama
    memory: meta-reference
    safety: meta-reference
    agents: meta-reference
    telemetry: meta-reference
image_type: conda
```
```
llama stack build --config llama_stack/distribution/templates/local-ollama-build.yaml
```
#### How to build a distribution with a Docker image
> [!TIP]
> Podman is supported as an alternative to Docker. Set `DOCKER_BINARY` to `podman` in your environment to use Podman.
To build a docker image, you may start off from a template and use the `--image-type docker` flag to specify `docker` as the build image type.
```
llama stack build --template local --image-type docker --name docker-0
```
Alternatively, you may use a config file: set `image_type` to `docker` in your `<name>-build.yaml` file and run `llama stack build --config <name>-build.yaml`. The `<name>-build.yaml` will have contents like:
```
name: local-docker-example
distribution_spec:
  description: Use code from `llama_stack` itself to serve all llama stack APIs
  docker_image: null
  providers:
    inference: meta-reference
    memory: meta-reference-faiss
    safety: meta-reference
    agentic_system: meta-reference
    telemetry: console
image_type: docker
```
The following command allows you to build a Docker image with the name `<name>`
```
llama stack build --config <name>-build.yaml
Dockerfile created successfully in /tmp/tmp.I0ifS2c46A/Dockerfile
FROM python:3.10-slim
WORKDIR /app
...
...
You can run it with: podman run -p 8000:8000 llamastack-docker-local
Build spec configuration saved at ~/.llama/distributions/docker/docker-local-build.yaml
```
## Step 2. Configure
After our distribution is built (either in the form of a Docker image or a Conda environment), we will run the following command to configure it:
```
llama stack configure [ <name> | <docker-image-name> | <path/to/name.build.yaml>]
```
- For `conda` environments: `<path/to/name.build.yaml>` would be the generated build spec saved from Step 1.
- For `docker` images downloaded from Docker Hub, you could also use `<docker-image-name>` as the argument.
  - Run `docker images` to check the list of available images on your machine.
```
$ llama stack configure 8b-instruct
Configuring API: inference (meta-reference)
Enter value for model (existing: Meta-Llama3.1-8B-Instruct) (required):
Enter value for quantization (optional):
Enter value for torch_seed (optional):
Enter value for max_seq_len (existing: 4096) (required):
Enter value for max_batch_size (existing: 1) (required):
Configuring API: memory (meta-reference-faiss)
Configuring API: safety (meta-reference)
Do you want to configure llama_guard_shield? (y/n): y
Entering sub-configuration for llama_guard_shield:
Enter value for model (default: Llama-Guard-3-1B) (required):
Enter value for excluded_categories (default: []) (required):
Enter value for disable_input_check (default: False) (required):
Enter value for disable_output_check (default: False) (required):
Do you want to configure prompt_guard_shield? (y/n): y
Entering sub-configuration for prompt_guard_shield:
Enter value for model (default: Prompt-Guard-86M) (required):
Configuring API: agentic_system (meta-reference)
Enter value for brave_search_api_key (optional):
Enter value for bing_search_api_key (optional):
Enter value for wolfram_api_key (optional):
Configuring API: telemetry (console)
YAML configuration has been written to ~/.llama/builds/conda/8b-instruct-run.yaml
```
After this step is successful, you should be able to find a run configuration spec in `~/.llama/builds/conda/8b-instruct-run.yaml` with the following contents. You may edit this file to change the settings.
As you can see, we did basic configuration above and configured:
- inference to run on model `Meta-Llama3.1-8B-Instruct` (obtained from `llama model list`)
- Llama Guard safety shield with model `Llama-Guard-3-1B`
- Prompt Guard safety shield with model `Prompt-Guard-86M`
For how these configurations are stored as YAML, check out the file printed at the end of the configuration.

Note that all configurations as well as models are stored in `~/.llama`.
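For reference, a sketch of what such a generated run configuration typically looks like for this build; it is abridged and modeled on the `run.yaml` files shown earlier in this change, not the literal file that `llama stack configure` writes out:

```yaml
version: '2'
image_name: 8b-instruct
apis:
- inference
- safety
- memory
- agents
- telemetry
providers:
  inference:
  - provider_id: meta-reference
    provider_type: meta-reference
    config:
      model: Meta-Llama3.1-8B-Instruct   # from `llama model list`
      max_seq_len: 4096
      max_batch_size: 1
  safety:
  - provider_id: meta-reference
    provider_type: meta-reference
    config:
      llama_guard_shield:
        model: Llama-Guard-3-1B
      prompt_guard_shield:
        model: Prompt-Guard-86M
```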
## Step 3. Run
Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end by the `llama stack configure` step.
```
llama stack run 8b-instruct
```
You should see the Llama Stack server start and print the APIs that it is supporting
```
$ llama stack run 8b-instruct
> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1
Loaded in 19.28 seconds
NCCL version 2.20.5+cuda12.4
Finished model load YES READY
Serving POST /inference/batch_chat_completion
Serving POST /inference/batch_completion
Serving POST /inference/chat_completion
Serving POST /inference/completion
Serving POST /safety/run_shield
Serving POST /agentic_system/memory_bank/attach
Serving POST /agentic_system/create
Serving POST /agentic_system/session/create
Serving POST /agentic_system/turn/create
Serving POST /agentic_system/delete
Serving POST /agentic_system/session/delete
Serving POST /agentic_system/memory_bank/detach
Serving POST /agentic_system/session/get
Serving POST /agentic_system/step/get
Serving POST /agentic_system/turn/get
Listening on :::5000
INFO: Started server process [453333]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
```
> [!NOTE]
> Configuration is in `~/.llama/builds/local/conda/8b-instruct-run.yaml`. Feel free to increase `max_seq_len`.
> [!IMPORTANT]
> The "local" distribution inference server currently only supports CUDA. It will not work on Apple Silicon machines.
> [!TIP]
> You might need to use the flag `--disable-ipv6` to disable IPv6 support.
This server is running a Llama model locally.
## Step 4. Test with Client
Once the server is set up, we can test it with a client to see example outputs.
```
cd /path/to/llama-stack
conda activate <env> # any environment containing the llama-stack pip package will work
python -m llama_stack.apis.inference.client localhost 5000
```
This will run the chat completion client and query the distribution's `/inference/chat_completion` API.
Here is an example output:
```
User>hello world, write me a 2 sentence poem about the moon
Assistant> Here's a 2-sentence poem about the moon:
The moon glows softly in the midnight sky,
A beacon of wonder, as it passes by.
```
Similarly, you can test safety (if you configured llama-guard and/or prompt-guard shields) by:
```
python -m llama_stack.apis.safety.client localhost 5000
```
Check out our client SDKs for connecting to Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [node](https://github.com/meta-llama/llama-stack-client-node), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.
You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.
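As an illustration, here is a minimal Python sketch that talks to the same server through the `llama_stack_client` SDK; the port, model name, and exact client surface are assumptions and may differ by SDK version:

```python
from llama_stack_client import LlamaStackClient
from llama_stack_client.types import UserMessage

# Point the client at the locally running Llama Stack server (port assumed from the steps above).
client = LlamaStackClient(base_url="http://localhost:5000")

# Query the inference API for a chat completion; the model name is assumed to match
# the model configured in your run.yaml.
response = client.inference.chat_completion(
    model="Meta-Llama3.1-8B-Instruct",
    messages=[UserMessage(role="user", content="Write me a 2 sentence poem about the moon.")],
)
print(response.completion_message.content)
```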


@@ -0,0 +1,41 @@
# Llama Stack Developer Cookbook
Based on your developer needs, below are references to guides to help you get started.
### Hosted Llama Stack Endpoint
* Developer Need: I want to connect to a Llama Stack endpoint to build my applications.
* Effort: 1min
* Guide:
- Check out our [DeepLearning.ai course](https://www.deeplearning.ai/short-courses/introducing-multimodal-llama-3-2) on building Llama Stack apps against a pre-hosted Llama Stack endpoint.
### Local meta-reference Llama Stack Server
* Developer Need: I want to start a local Llama Stack server with my GPU using meta-reference implementations.
* Effort: 5min
* Guide:
- Please see our [Getting Started Guide](./getting_started.md) on starting up a meta-reference Llama Stack server.
### Llama Stack Server with Remote Providers
* Developer need: I want a Llama Stack distribution with a remote provider.
* Effort: 10min
* Guide
- Please see our [Distributions Guide](../distributions/) on starting up distributions with remote providers.
### On-Device (iOS) Llama Stack
* Developer Need: I want to use Llama Stack on-Device
* Effort: 1.5hr
* Guide:
- Please see our [iOS Llama Stack SDK](../llama_stack/providers/impls/ios/inference) implementation.
### Assemble your own Llama Stack Distribution
* Developer Need: I want to assemble my own distribution with API providers to my liking
* Effort: 30min
* Guide
- Please see our [Building Distribution](./building_distro.md) guide for assembling your own Llama Stack distribution with your choice of API providers.
### Adding a New API Provider
* Developer Need: I want to add a new API provider to Llama Stack.
* Effort: 3hr
* Guide
- Please see our [Adding a New API Provider](./new_api_provider.md) guide for adding a new API provider.


@@ -43,11 +43,9 @@
     "For this purpose, we will directly work with pre-built docker containers and use the python SDK\n",
     "```\n",
     "$ git clone https://github.com/meta-llama/llama-stack-apps.git\n",
-    "\n",
     "$ cd llama-stack-apps\n",
     "$ yes | conda create -n stack-test python=3.10 \n",
     "$ conda activate stack-test\n",
-    "\n",
     "$ pip install llama_stack llama_stack_client\n",
     "```\n",
     "This will install `llama_stack` and `llama_stack_client` packages. \n",


@@ -1,45 +1,9 @@
-# llama-stack
-[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/)
-[![Discord](https://img.shields.io/discord/1257833999603335178)](https://discord.gg/llama-stack)
-This repository contains the specifications and implementations of the APIs which are part of the Llama Stack.
-The Llama Stack defines and standardizes the building blocks needed to bring generative AI applications to market. These blocks span the entire development lifecycle: from model training and fine-tuning, through product evaluation, to invoking AI agents in production. Beyond definition, we're developing open-source versions and partnering with cloud providers, ensuring developers can assemble AI solutions using consistent, interlocking pieces across platforms. The ultimate goal is to accelerate innovation in the AI space.
-The Stack APIs are rapidly improving, but still very much work in progress and we invite feedback as well as direct contributions.
-## APIs
-The Llama Stack consists of the following set of APIs:
-- Inference
-- Safety
-- Memory
-- Agentic System
-- Evaluation
-- Post Training
-- Synthetic Data Generation
-- Reward Scoring
-Each of the APIs themselves is a collection of REST endpoints.
-## API Providers
-A Provider is what makes the API real -- they provide the actual implementation backing the API.
-As an example, for Inference, we could have the implementation be backed by open source libraries like `[ torch | vLLM | TensorRT ]` as possible options.
-A provider can also be just a pointer to a remote REST service -- for example, cloud providers or dedicated inference providers could serve these APIs.
-## Llama Stack Distribution
-A Distribution is where APIs and Providers are assembled together to provide a consistent whole to the end application developer. You can mix-and-match providers -- some could be backed by local code and some could be remote. As a hobbyist, you can serve a small model locally, but can choose a cloud provider for a large model. Regardless, the higher level APIs your app needs to work with don't need to change at all. You can even imagine moving across the server / mobile-device boundary as well always using the same uniform set of APIs for developing Generative AI applications.
+# Getting Started with Llama Stack
+This guide will walk you through the steps to get started on an end-to-end flow for Llama Stack. It mainly focuses on building a Llama Stack distribution and starting up a Llama Stack server. Please see our [documentation](../README.md) on what you can do with Llama Stack, and [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main) for example apps built with Llama Stack.
 ## Installation
-The `llama` CLI tool helps you setup and use the Llama toolchain & agentic systems. It should be available on your path after installing the `llama-stack` package.
 You can install this repository as a [package](https://pypi.org/project/llama-stack/) with `pip install llama-stack`
@@ -57,26 +21,39 @@ cd llama-stack
 $CONDA_PREFIX/bin/pip install -e .
 ```
-# Getting Started
-The `llama` CLI tool helps you setup and use the Llama toolchain & agentic systems. It should be available on your path after installing the `llama-stack` package.
+For what you can do with the Llama CLI, please refer to the [CLI Reference](./cli_reference.md).
+## Starting Up Llama Stack Server
+#### Starting up server via docker
+This guide allows you to quickly get started with building and running a Llama Stack server in < 5 minutes! We provide 2 pre-built Docker images of Llama Stack distributions, which can be found at the following links.
+- [llamastack-local-gpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-gpu/general)
+  - This is a packaged version with our local meta-reference implementations, where you will be running inference locally with downloaded Llama model checkpoints.
+- [llamastack-local-cpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-cpu/general)
+  - This is a lite version with remote inference where you can hook up to your favourite remote inference framework (e.g. ollama, fireworks, together, tgi) for running inference without a GPU.
-You may also checkout this [notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) for trying out out demo scripts.
-## Quick Cheatsheet
-#### Via docker
+> [!NOTE]
+> For GPU inference, you need to set these environment variables to specify the local directory containing your model checkpoints, and enable GPU inference to start running the docker container.
 ```
-docker run -it -p 5000:5000 -v ~/.llama:/root/.llama --gpus=all llamastack-local-gpu
+export LLAMA_CHECKPOINT_DIR=~/.llama
 ```
 > [!NOTE]
 > `~/.llama` should be the path containing downloaded weights of Llama models.
-#### Via conda
+To download and start running a pre-built docker container, you may use the following commands:
+```
+docker run -it -p 5000:5000 -v ~/.llama:/root/.llama --gpus=all llamastack/llamastack-local-gpu
+```
+> [!TIP]
+> Pro Tip: We may use `docker compose up` for starting up a distribution with remote providers (e.g. TGI) using [llamastack-local-cpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-cpu/general). You can check out [these scripts](../distributions/) to help you get started.
+#### Build->Configure->Run Llama Stack server via conda
+You may also build a LlamaStack distribution from scratch, configure it, and start running the distribution. This is useful for developing on LlamaStack.
 **`llama stack build`**
 - You'll be prompted to enter build information interactively.
 ```
@@ -183,243 +160,7 @@ INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
 ```
-## Step 1. Build
+## Testing with client
In the following steps, imagine we'll be working with a `Meta-Llama3.1-8B-Instruct` model. We will name our build `8b-instruct` to help us remember the config. We will start build our distribution (in the form of a Conda environment, or Docker image). In this step, we will specify:
- `name`: the name for our distribution (e.g. `8b-instruct`)
- `image_type`: our build image type (`conda | docker`)
- `distribution_spec`: our distribution specs for specifying API providers
- `description`: a short description of the configurations for the distribution
- `providers`: specifies the underlying implementation for serving each API endpoint
- `image_type`: `conda` | `docker` to specify whether to build the distribution in the form of Docker image or Conda environment.
At the end of the build command, a file named `<name>-build.yaml` storing the build configurations will be generated and saved at the output file path specified at the end of the command.
#### Building from scratch
- For a new user, you can start by running `llama stack build`, which launches an interactive wizard that prompts you for the build configurations.
```
llama stack build
```
Running the command above lets you fill in the configuration for your Llama Stack distribution; you will see output like the following.
```
> Enter an unique name for identifying your Llama Stack build distribution (e.g. my-local-stack): 8b-instruct
> Enter the image type you want your distribution to be built with (docker or conda): conda
Llama Stack is composed of several APIs working together. Let's configure the providers (implementations) you want to use for these APIs.
> Enter the API provider for the inference API: (default=meta-reference): meta-reference
> Enter the API provider for the safety API: (default=meta-reference): meta-reference
> Enter the API provider for the agents API: (default=meta-reference): meta-reference
> Enter the API provider for the memory API: (default=meta-reference): meta-reference
> Enter the API provider for the telemetry API: (default=meta-reference): meta-reference
> (Optional) Enter a short description for your Llama Stack distribution:
Build spec configuration saved at ~/.conda/envs/llamastack-my-local-llama-stack/8b-instruct-build.yaml
```
**Ollama (optional)**
If you plan to use Ollama for inference, you'll need to install the server [via these instructions](https://ollama.com/download).
#### Building from templates
- To build a distribution backed by alternative API providers, we provide distribution templates to help you get started.
The following command will allow you to see the available templates and their corresponding providers.
```
llama stack build --list-templates
```
![alt text](resources/list-templates.png)
You may then pick a template to build your distribution with providers suited to your needs.
```
llama stack build --template local-tgi --name my-tgi-stack
```
```
$ llama stack build --template local-tgi --name my-tgi-stack
...
...
Build spec configuration saved at ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml
You may now run `llama stack configure my-tgi-stack` or `llama stack configure ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml`
```
#### Building from config file
- In addition to templates, you may customize the build by editing a config file and building from it with the following command.
- The config file will have contents similar to those in `llama_stack/distributions/templates/`.
```
$ cat llama_stack/distribution/templates/local-ollama-build.yaml
name: local-ollama
distribution_spec:
description: Like local, but use ollama for running LLM inference
providers:
inference: remote::ollama
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda
```
```
llama stack build --config llama_stack/distribution/templates/local-ollama-build.yaml
```
#### How to build distribution with Docker image
> [!TIP]
> Podman is supported as an alternative to Docker. Set `DOCKER_BINARY` to `podman` in your environment to use Podman.
To build a docker image, you may start off from a template and use the `--image-type docker` flag to specify `docker` as the build image type.
```
llama stack build --template local --image-type docker --name docker-0
```
Alternatively, you may use a config file and set `image_type` to `docker` in your `<name>-build.yaml` file, then run `llama stack build --config <name>-build.yaml`. The `<name>-build.yaml` will have contents like:
```
name: local-docker-example
distribution_spec:
description: Use code from `llama_stack` itself to serve all llama stack APIs
docker_image: null
providers:
inference: meta-reference
memory: meta-reference-faiss
safety: meta-reference
agentic_system: meta-reference
telemetry: console
image_type: docker
```
The following command allows you to build a Docker image with the name `<name>`
```
llama stack build --config <name>-build.yaml
Dockerfile created successfully in /tmp/tmp.I0ifS2c46A/Dockerfile
FROM python:3.10-slim
WORKDIR /app
...
...
You can run it with: podman run -p 8000:8000 llamastack-docker-local
Build spec configuration saved at ~/.llama/distributions/docker/docker-local-build.yaml
```
## Step 2. Configure
After our distribution is built (either as a Docker image or a Conda environment), we will run the following command to configure it:
```
llama stack configure [ <name> | <docker-image-name> | <path/to/name.build.yaml>]
```
- For `conda` environments: <path/to/name.build.yaml> would be the generated build spec saved from Step 1.
- For `docker` images downloaded from Dockerhub, you could also use <docker-image-name> as the argument.
- Run `docker images` to check the list of available images on your machine.
```
$ llama stack configure 8b-instruct
Configuring API: inference (meta-reference)
Enter value for model (existing: Meta-Llama3.1-8B-Instruct) (required):
Enter value for quantization (optional):
Enter value for torch_seed (optional):
Enter value for max_seq_len (existing: 4096) (required):
Enter value for max_batch_size (existing: 1) (required):
Configuring API: memory (meta-reference-faiss)
Configuring API: safety (meta-reference)
Do you want to configure llama_guard_shield? (y/n): y
Entering sub-configuration for llama_guard_shield:
Enter value for model (default: Llama-Guard-3-1B) (required):
Enter value for excluded_categories (default: []) (required):
Enter value for disable_input_check (default: False) (required):
Enter value for disable_output_check (default: False) (required):
Do you want to configure prompt_guard_shield? (y/n): y
Entering sub-configuration for prompt_guard_shield:
Enter value for model (default: Prompt-Guard-86M) (required):
Configuring API: agentic_system (meta-reference)
Enter value for brave_search_api_key (optional):
Enter value for bing_search_api_key (optional):
Enter value for wolfram_api_key (optional):
Configuring API: telemetry (console)
YAML configuration has been written to ~/.llama/builds/conda/8b-instruct-run.yaml
```
After this step is successful, you should be able to find the run configuration spec in `~/.llama/builds/conda/8b-instruct-run.yaml`. You may edit this file to change the settings.
As you can see, with the basic configuration above we configured:
- inference to run on model `Meta-Llama3.1-8B-Instruct` (obtained from `llama model list`)
- Llama Guard safety shield with model `Llama-Guard-3-1B`
- Prompt Guard safety shield with model `Prompt-Guard-86M`
To see how these configurations are stored as YAML, check out the file printed at the end of the configuration step.
Note that all configurations as well as models are stored in `~/.llama`.
## Step 3. Run
Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end of the `llama stack configure` step.
```
llama stack run 8b-instruct
```
You should see the Llama Stack server start and print the APIs that it supports:
```
$ llama stack run 8b-instruct
> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1
Loaded in 19.28 seconds
NCCL version 2.20.5+cuda12.4
Finished model load YES READY
Serving POST /inference/batch_chat_completion
Serving POST /inference/batch_completion
Serving POST /inference/chat_completion
Serving POST /inference/completion
Serving POST /safety/run_shield
Serving POST /agentic_system/memory_bank/attach
Serving POST /agentic_system/create
Serving POST /agentic_system/session/create
Serving POST /agentic_system/turn/create
Serving POST /agentic_system/delete
Serving POST /agentic_system/session/delete
Serving POST /agentic_system/memory_bank/detach
Serving POST /agentic_system/session/get
Serving POST /agentic_system/step/get
Serving POST /agentic_system/turn/get
Listening on :::5000
INFO: Started server process [453333]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
```
> [!NOTE]
> Configuration is in `~/.llama/builds/local/conda/8b-instruct-run.yaml`. Feel free to increase `max_seq_len`.
> [!IMPORTANT]
> The "local" distribution inference server currently only supports CUDA. It will not work on Apple Silicon machines.
> [!TIP]
> You might need to use the flag `--disable-ipv6` to disable IPv6 support.
This server is running a Llama model locally.
## Step 4. Test with Client
Once the server is set up, we can test it with a client to see the example outputs.
```
cd /path/to/llama-stack
@ -445,4 +186,11 @@ Similarly you can test safety (if you configured llama-guard and/or prompt-guard
python -m llama_stack.apis.safety.client localhost 5000
```
Check out our client SDKs for connecting to the Llama Stack server in your preferred language: you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [node](https://github.com/meta-llama/llama-stack-client-node), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) to quickly build your applications.
You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.
## Advanced Guides
Please see our [Building a Llama Stack Distribution](./building_distro.md) guide for more details on how to assemble your own Llama Stack Distribution.
35
docs/make.bat Normal file
View file
@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)
if "%1" == "" goto help
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd
26
docs/new_api_provider.md Normal file
View file
@ -0,0 +1,26 @@
# Developer Guide: Adding a New API Provider
This guide contains references to walk you through adding a new API provider.
### Adding a new API provider
1. First, decide which API your provider falls into (e.g. Inference, Safety, Agents, Memory).
2. Decide whether your provider is a remote provider or an inline implementation. A remote provider makes a remote request to an external service, while an inline provider's implementation is executed locally. Check out the examples and follow their structure to add your own API provider. Please find the following code pointers:
- [Inference Remote Adapter](../llama_stack/providers/adapters/inference/)
- [Inference Inline Provider](../llama_stack/providers/impls/)
3. [Build a Llama Stack distribution](./building_distro.md) with your API provider.
4. Test your code!
### Testing your newly added API providers
1. Start with an _integration test_ for your provider. That means we will instantiate the real provider, pass it real configuration and if it is a remote service, we will actually hit the remote service. We **strongly** discourage mocking for these tests at the provider level. Llama Stack is first and foremost about integration so we need to make sure stuff works end-to-end. See [llama_stack/providers/tests/inference/test_inference.py](../llama_stack/providers/tests/inference/test_inference.py) for an example.
2. In addition, if you want to unit test functionality within your provider, feel free to do so. You can find some tests in `tests/` but they aren't well supported so far.
3. Test with a client-server Llama Stack setup. (a) Start a Llama Stack server with your own distribution which includes the new provider. (b) Send a client request to the server. See `llama_stack/apis/<api>/client.py` for how this is done. These client scripts can serve as lightweight tests.
You can find more complex client scripts in the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main) repo. Note down which scripts work and which do not work with your distribution. A minimal test invocation is sketched below.
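For example, a minimal invocation of the integration tests referenced above might look like the following (a sketch assuming a standard pytest setup; provider selection depends on the test's own fixtures and configuration):

```bash
# Run the inference integration tests end-to-end against your provider.
# (Assumes pytest is installed in your development environment.)
pytest -v llama_stack/providers/tests/inference/test_inference.py
```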
### Submit your PR
After you have fully tested your newly added API provider, submit a PR with the attached test plan. You must have a Test Plan in the summary section of your PR.
3
docs/requirements.txt Normal file
View file
@ -0,0 +1,3 @@
sphinx
myst-parser
linkify
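With these requirements installed, a local docs build might look like the following sketch (paths assume the `docs/source` layout introduced in this commit; on Windows, `make.bat html` from the docs directory does the equivalent):

```bash
# Build the Sphinx docs into docs/_build (a sketch, not an official target).
pip install -r docs/requirements.txt
sphinx-build -M html docs/source docs/_build
```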
View file
@ -21,7 +21,7 @@
"info": { "info": {
"title": "[DRAFT] Llama Stack Specification", "title": "[DRAFT] Llama Stack Specification",
"version": "0.0.1", "version": "0.0.1",
"description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-10 15:29:56.831109" "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-10-18 20:48:17.730988"
}, },
"servers": [ "servers": [
{ {
@ -2830,8 +2830,11 @@
"CompletionResponse": { "CompletionResponse": {
"type": "object", "type": "object",
"properties": { "properties": {
"completion_message": { "content": {
"$ref": "#/components/schemas/CompletionMessage" "type": "string"
},
"stop_reason": {
"$ref": "#/components/schemas/StopReason"
}, },
"logprobs": { "logprobs": {
"type": "array", "type": "array",
@ -2842,7 +2845,8 @@
}, },
"additionalProperties": false, "additionalProperties": false,
"required": [ "required": [
"completion_message" "content",
"stop_reason"
], ],
"title": "Completion response." "title": "Completion response."
}, },
@ -6075,49 +6079,49 @@
], ],
"tags": [ "tags": [
{ {
"name": "Evaluations" "name": "Models"
},
{
"name": "Inspect"
}, },
{ {
"name": "RewardScoring" "name": "RewardScoring"
}, },
{ {
"name": "Datasets" "name": "MemoryBanks"
},
{
"name": "Models"
},
{
"name": "Telemetry"
},
{
"name": "PostTraining"
},
{
"name": "SyntheticDataGeneration"
},
{
"name": "BatchInference"
},
{
"name": "Inference"
},
{
"name": "Agents"
},
{
"name": "Memory"
},
{
"name": "Safety"
}, },
{ {
"name": "Shields" "name": "Shields"
}, },
{ {
"name": "MemoryBanks" "name": "SyntheticDataGeneration"
},
{
"name": "Inference"
},
{
"name": "Inspect"
},
{
"name": "BatchInference"
},
{
"name": "Memory"
},
{
"name": "Datasets"
},
{
"name": "Agents"
},
{
"name": "PostTraining"
},
{
"name": "Telemetry"
},
{
"name": "Safety"
},
{
"name": "Evaluations"
}, },
{ {
"name": "BuiltinTool", "name": "BuiltinTool",
View file
@ -501,14 +501,17 @@ components:
CompletionResponse: CompletionResponse:
additionalProperties: false additionalProperties: false
properties: properties:
completion_message: content:
$ref: '#/components/schemas/CompletionMessage' type: string
logprobs: logprobs:
items: items:
$ref: '#/components/schemas/TokenLogProbs' $ref: '#/components/schemas/TokenLogProbs'
type: array type: array
stop_reason:
$ref: '#/components/schemas/StopReason'
required: required:
- completion_message - content
- stop_reason
title: Completion response. title: Completion response.
type: object type: object
CompletionResponseStreamChunk: CompletionResponseStreamChunk:
@ -2507,7 +2510,7 @@ info:
description: "This is the specification of the llama stack that provides\n \ description: "This is the specification of the llama stack that provides\n \
\ a set of endpoints and their corresponding interfaces that are tailored\ \ a set of endpoints and their corresponding interfaces that are tailored\
\ to\n best leverage Llama Models. The specification is still in\ \ to\n best leverage Llama Models. The specification is still in\
\ draft and subject to change.\n Generated at 2024-10-10 15:29:56.831109" \ draft and subject to change.\n Generated at 2024-10-18 20:48:17.730988"
title: '[DRAFT] Llama Stack Specification' title: '[DRAFT] Llama Stack Specification'
version: 0.0.1 version: 0.0.1
jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
@ -3712,21 +3715,21 @@ security:
servers: servers:
- url: http://any-hosted-llama-stack.com - url: http://any-hosted-llama-stack.com
tags: tags:
- name: Evaluations
- name: Inspect
- name: RewardScoring
- name: Datasets
- name: Models - name: Models
- name: Telemetry - name: RewardScoring
- name: PostTraining
- name: SyntheticDataGeneration
- name: BatchInference
- name: Inference
- name: Agents
- name: Memory
- name: Safety
- name: Shields
- name: MemoryBanks - name: MemoryBanks
- name: Shields
- name: SyntheticDataGeneration
- name: Inference
- name: Inspect
- name: BatchInference
- name: Memory
- name: Datasets
- name: Agents
- name: PostTraining
- name: Telemetry
- name: Safety
- name: Evaluations
- description: <SchemaDefinition schemaRef="#/components/schemas/BuiltinTool" /> - description: <SchemaDefinition schemaRef="#/components/schemas/BuiltinTool" />
name: BuiltinTool name: BuiltinTool
- description: <SchemaDefinition schemaRef="#/components/schemas/CompletionMessage" - description: <SchemaDefinition schemaRef="#/components/schemas/CompletionMessage"
View file
@ -0,0 +1,485 @@
# Llama CLI Reference
The `llama` CLI tool helps you set up and use the Llama Stack & agentic systems. It should be available on your path after installing the `llama-stack` package.
## Subcommands
1. `download`: The `llama` CLI supports downloading models from Meta or Hugging Face.
2. `model`: Lists available models and their properties.
3. `stack`: Allows you to build and run a Llama Stack server. You can read more about this in Step 3 below.
## Sample Usage
```
llama --help
```
<pre style="font-family: monospace;">
usage: llama [-h] {download,model,stack} ...
Welcome to the Llama CLI
options:
-h, --help show this help message and exit
subcommands:
{download,model,stack}
</pre>
## Step 1. Get the models
You first need to have models downloaded locally.
To download any model you need the **Model Descriptor**.
This can be obtained by running the command
```
llama model list
```
You should see a table like this:
<pre style="font-family: monospace;">
+----------------------------------+------------------------------------------+----------------+
| Model Descriptor | Hugging Face Repo | Context Length |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-8B | meta-llama/Llama-3.1-8B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-70B | meta-llama/Llama-3.1-70B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B:bf16-mp8 | meta-llama/Llama-3.1-405B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B | meta-llama/Llama-3.1-405B-FP8 | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B:bf16-mp16 | meta-llama/Llama-3.1-405B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-8B-Instruct | meta-llama/Llama-3.1-8B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-70B-Instruct | meta-llama/Llama-3.1-70B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B-Instruct:bf16-mp8 | meta-llama/Llama-3.1-405B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B-Instruct | meta-llama/Llama-3.1-405B-Instruct-FP8 | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.1-405B-Instruct:bf16-mp16 | meta-llama/Llama-3.1-405B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-1B | meta-llama/Llama-3.2-1B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-3B | meta-llama/Llama-3.2-3B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-11B-Vision | meta-llama/Llama-3.2-11B-Vision | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-90B-Vision | meta-llama/Llama-3.2-90B-Vision | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-1B-Instruct | meta-llama/Llama-3.2-1B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-3B-Instruct | meta-llama/Llama-3.2-3B-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-11B-Vision-Instruct | meta-llama/Llama-3.2-11B-Vision-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama3.2-90B-Vision-Instruct | meta-llama/Llama-3.2-90B-Vision-Instruct | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-3-11B-Vision | meta-llama/Llama-Guard-3-11B-Vision | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-3-1B:int4-mp1 | meta-llama/Llama-Guard-3-1B-INT4 | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-3-1B | meta-llama/Llama-Guard-3-1B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-3-8B | meta-llama/Llama-Guard-3-8B | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-3-8B:int8-mp1 | meta-llama/Llama-Guard-3-8B-INT8 | 128K |
+----------------------------------+------------------------------------------+----------------+
| Prompt-Guard-86M | meta-llama/Prompt-Guard-86M | 128K |
+----------------------------------+------------------------------------------+----------------+
| Llama-Guard-2-8B | meta-llama/Llama-Guard-2-8B | 4K |
+----------------------------------+------------------------------------------+----------------+
</pre>
To download models, you can use the `llama download` command.
### Downloading from [Meta](https://llama.meta.com/llama-downloads/)
Here is an example download command to get the 3B-Instruct and 11B-Vision-Instruct models. You will need the META_URL, which can be obtained from [here](https://llama.meta.com/docs/getting_the_models/meta/).
Download the required checkpoints using the following commands:
```bash
# download the 3B model, this can be run on a single GPU
llama download --source meta --model-id Llama3.2-3B-Instruct --meta-url META_URL
# you can also get the larger 11B vision model
llama download --source meta --model-id Llama3.2-11B-Vision-Instruct --meta-url META_URL
# llama-agents have safety enabled by default. For this, you will need
# safety models -- Llama-Guard and Prompt-Guard
llama download --source meta --model-id Prompt-Guard-86M --meta-url META_URL
llama download --source meta --model-id Llama-Guard-3-1B --meta-url META_URL
```
### Downloading from [Hugging Face](https://huggingface.co/meta-llama)
Essentially, the same commands above work, just replace `--source meta` with `--source huggingface`.
```bash
llama download --source huggingface --model-id Llama3.1-8B-Instruct --hf-token <HF_TOKEN>
llama download --source huggingface --model-id Llama3.1-70B-Instruct --hf-token <HF_TOKEN>
llama download --source huggingface --model-id Llama-Guard-3-1B --ignore-patterns *original*
llama download --source huggingface --model-id Prompt-Guard-86M --ignore-patterns *original*
```
**Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).
> **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
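For example, you can export the token once and then omit the `--hf-token` flag (the token value below is a placeholder):

```bash
# Authenticate via the environment instead of passing --hf-token each time.
export HF_TOKEN=<your-token>   # from https://huggingface.co/settings/tokens
llama download --source huggingface --model-id Llama3.1-8B-Instruct
```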
### Downloading via Ollama
If you're already using ollama, we also have a supported Llama Stack distribution `local-ollama` and you can continue to use ollama for managing model downloads.
```
ollama pull llama3.1:8b-instruct-fp16
ollama pull llama3.1:70b-instruct-fp16
```
> [!NOTE]
> Only the above two models are currently supported by Ollama.
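For example, after pulling a model you can build the ollama-backed distribution from its template config (the build flow is covered in Step 3 below):

```bash
# Pull a supported model, then build the local-ollama distribution.
ollama pull llama3.1:8b-instruct-fp16
llama stack build --config llama_stack/distribution/templates/local-ollama-build.yaml
```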
## Step 2: Understand the models
The `llama model` command helps you explore the models interface.
### 2.1 Subcommands
1. `download`: Download the model from different sources. (meta, huggingface)
2. `list`: Lists all the models available for download with hardware requirements to deploy the models.
3. `prompt-format`: Show llama model message formats.
4. `describe`: Describes all the properties of the model.
### 2.2 Sample Usage
`llama model <subcommand> <options>`
```
llama model --help
```
<pre style="font-family: monospace;">
usage: llama model [-h] {download,list,prompt-format,describe} ...
Work with llama models
options:
-h, --help show this help message and exit
model_subcommands:
{download,list,prompt-format,describe}
</pre>
You can use the describe command to know more about a model:
```
llama model describe -m Llama3.2-3B-Instruct
```
### 2.3 Describe
<pre style="font-family: monospace;">
+-----------------------------+----------------------------------+
| Model | Llama3.2-3B-Instruct |
+-----------------------------+----------------------------------+
| Hugging Face ID | meta-llama/Llama-3.2-3B-Instruct |
+-----------------------------+----------------------------------+
| Description | Llama 3.2 3b instruct model |
+-----------------------------+----------------------------------+
| Context Length | 128K tokens |
+-----------------------------+----------------------------------+
| Weights format | bf16 |
+-----------------------------+----------------------------------+
| Model params.json | { |
| | "dim": 3072, |
| | "n_layers": 28, |
| | "n_heads": 24, |
| | "n_kv_heads": 8, |
| | "vocab_size": 128256, |
| | "ffn_dim_multiplier": 1.0, |
| | "multiple_of": 256, |
| | "norm_eps": 1e-05, |
| | "rope_theta": 500000.0, |
| | "use_scaled_rope": true |
| | } |
+-----------------------------+----------------------------------+
| Recommended sampling params | { |
| | "strategy": "top_p", |
| | "temperature": 1.0, |
| | "top_p": 0.9, |
| | "top_k": 0 |
| | } |
+-----------------------------+----------------------------------+
</pre>
### 2.4 Prompt Format
You can even run `llama model prompt-format` to see all of the templates and their tokens:
```
llama model prompt-format -m Llama3.2-3B-Instruct
```
![alt text](https://github.com/meta-llama/llama-stack/blob/main/docs/resources/prompt-format.png)
You will be shown a Markdown formatted description of the model interface and how prompts / messages are formatted for various scenarios.
**NOTE**: Outputs in the terminal are color-printed to show special tokens.
## Step 3: Building and Configuring Llama Stack Distributions
- Please see our [Getting Started](getting_started.md) guide for more details on how to build and start a Llama Stack distribution.
### Step 3.1 Build
In the following steps, imagine we'll be working with a `Llama3.1-8B-Instruct` model. We will name our build `8b-instruct` to help us remember the config. We will then build our distribution (as a Conda environment or a Docker image). In this step, we will specify:
- `name`: the name for our distribution (e.g. `8b-instruct`)
- `image_type`: our build image type (`conda | docker`)
- `distribution_spec`: our distribution specs for specifying API providers
- `description`: a short description of the configurations for the distribution
- `providers`: specifies the underlying implementation for serving each API endpoint
- `image_type`: `conda` | `docker` to specify whether to build the distribution in the form of Docker image or Conda environment.
At the end of the build command, a file named `<name>-build.yaml` storing the build configurations will be generated and saved at the output file path specified at the end of the command.
#### Building from scratch
- For a new user, you can start by running `llama stack build`, which launches an interactive wizard that prompts you for the build configurations.
```
llama stack build
```
Running the command above lets you fill in the configuration for your Llama Stack distribution; you will see output like the following.
```
> Enter an unique name for identifying your Llama Stack build distribution (e.g. my-local-stack): my-local-llama-stack
> Enter the image type you want your distribution to be built with (docker or conda): conda
Llama Stack is composed of several APIs working together. Let's configure the providers (implementations) you want to use for these APIs.
> Enter the API provider for the inference API: (default=meta-reference): meta-reference
> Enter the API provider for the safety API: (default=meta-reference): meta-reference
> Enter the API provider for the agents API: (default=meta-reference): meta-reference
> Enter the API provider for the memory API: (default=meta-reference): meta-reference
> Enter the API provider for the telemetry API: (default=meta-reference): meta-reference
> (Optional) Enter a short description for your Llama Stack distribution:
Build spec configuration saved at ~/.conda/envs/llamastack-my-local-llama-stack/my-local-llama-stack-build.yaml
```
#### Building from templates
- To build a distribution backed by alternative API providers, we provide distribution templates to help you get started.
The following command will allow you to see the available templates and their corresponding providers.
```
llama stack build --list-templates
```
![alt text](https://github.com/meta-llama/llama-stack/blob/main/docs/resources/list-templates.png)
You may then pick a template to build your distribution with providers suited to your needs.
```
llama stack build --template local-tgi --name my-tgi-stack
```
```
$ llama stack build --template local-tgi --name my-tgi-stack
...
...
Build spec configuration saved at ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml
You may now run `llama stack configure my-tgi-stack` or `llama stack configure ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml`
```
#### Building from config file
- In addition to templates, you may customize the build by editing a config file and building from it with the following command.
- The config file will have contents similar to those in `llama_stack/distributions/templates/`.
```
$ cat llama_stack/distribution/templates/local-ollama-build.yaml
name: local-ollama
distribution_spec:
description: Like local, but use ollama for running LLM inference
providers:
inference: remote::ollama
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda
```
```
llama stack build --config llama_stack/distribution/templates/local-ollama-build.yaml
```
#### How to build distribution with Docker image
To build a docker image, you may start off from a template and use the `--image-type docker` flag to specify `docker` as the build image type.
```
llama stack build --template local --image-type docker --name docker-0
```
Alternatively, you may use a config file and set `image_type` to `docker` in your `<name>-build.yaml` file, then run `llama stack build --config <name>-build.yaml`. The `<name>-build.yaml` will have contents like:
```
name: local-docker-example
distribution_spec:
description: Use code from `llama_stack` itself to serve all llama stack APIs
docker_image: null
providers:
inference: meta-reference
memory: meta-reference-faiss
safety: meta-reference
agentic_system: meta-reference
telemetry: console
image_type: docker
```
The following command allows you to build a Docker image with the name `<name>`
```
llama stack build --config <name>-build.yaml
Dockerfile created successfully in /tmp/tmp.I0ifS2c46A/Dockerfile
FROM python:3.10-slim
WORKDIR /app
...
...
You can run it with: podman run -p 8000:8000 llamastack-docker-local
Build spec configuration saved at ~/.llama/distributions/docker/docker-local-build.yaml
```
### Step 3.2 Configure
After our distribution is built (either as a Docker image or a Conda environment), we will run the following command to configure it:
```
llama stack configure [ <name> | <docker-image-name> | <path/to/name.build.yaml>]
```
- For `conda` environments: <path/to/name.build.yaml> would be the generated build spec saved from Step 1.
- For `docker` images downloaded from Dockerhub, you could also use <docker-image-name> as the argument.
- Run `docker images` to check the list of available images on your machine.
```
$ llama stack configure ~/.llama/distributions/conda/8b-instruct-build.yaml
Configuring API: inference (meta-reference)
Enter value for model (existing: Llama3.1-8B-Instruct) (required):
Enter value for quantization (optional):
Enter value for torch_seed (optional):
Enter value for max_seq_len (existing: 4096) (required):
Enter value for max_batch_size (existing: 1) (required):
Configuring API: memory (meta-reference-faiss)
Configuring API: safety (meta-reference)
Do you want to configure llama_guard_shield? (y/n): y
Entering sub-configuration for llama_guard_shield:
Enter value for model (default: Llama-Guard-3-1B) (required):
Enter value for excluded_categories (default: []) (required):
Enter value for disable_input_check (default: False) (required):
Enter value for disable_output_check (default: False) (required):
Do you want to configure prompt_guard_shield? (y/n): y
Entering sub-configuration for prompt_guard_shield:
Enter value for model (default: Prompt-Guard-86M) (required):
Configuring API: agentic_system (meta-reference)
Enter value for brave_search_api_key (optional):
Enter value for bing_search_api_key (optional):
Enter value for wolfram_api_key (optional):
Configuring API: telemetry (console)
YAML configuration has been written to ~/.llama/builds/conda/8b-instruct-run.yaml
```
After this step is successful, you should be able to find the run configuration spec in `~/.llama/builds/conda/8b-instruct-run.yaml`. You may edit this file to change the settings.
As you can see, with the basic configuration above we configured:
- inference to run on model `Llama3.1-8B-Instruct` (obtained from `llama model list`)
- Llama Guard safety shield with model `Llama-Guard-3-1B`
- Prompt Guard safety shield with model `Prompt-Guard-86M`
To see how these configurations are stored as YAML, check out the file printed at the end of the configuration step.
Note that all configurations as well as models are stored in `~/.llama`.
### Step 3.3 Run
Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end of the `llama stack configure` step.
```
llama stack run ~/.llama/builds/conda/8b-instruct-run.yaml
```
You should see the Llama Stack server start and print the APIs that it supports:
```
$ llama stack run ~/.llama/builds/local/conda/8b-instruct.yaml
> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1
Loaded in 19.28 seconds
NCCL version 2.20.5+cuda12.4
Finished model load YES READY
Serving POST /inference/batch_chat_completion
Serving POST /inference/batch_completion
Serving POST /inference/chat_completion
Serving POST /inference/completion
Serving POST /safety/run_shield
Serving POST /agentic_system/memory_bank/attach
Serving POST /agentic_system/create
Serving POST /agentic_system/session/create
Serving POST /agentic_system/turn/create
Serving POST /agentic_system/delete
Serving POST /agentic_system/session/delete
Serving POST /agentic_system/memory_bank/detach
Serving POST /agentic_system/session/get
Serving POST /agentic_system/step/get
Serving POST /agentic_system/turn/get
Listening on :::5000
INFO: Started server process [453333]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
```
> [!NOTE]
> Configuration is in `~/.llama/builds/local/conda/8b-instruct-run.yaml`. Feel free to increase `max_seq_len`.
> [!IMPORTANT]
> The "local" distribution inference server currently only supports CUDA. It will not work on Apple Silicon machines.
> [!TIP]
> You might need to use the flag `--disable-ipv6` to disable IPv6 support.
This server is running a Llama model locally.
### Step 3.4 Test with Client
Once the server is set up, we can test it with a client to see the example outputs.
```
cd /path/to/llama-stack
conda activate <env> # any environment containing the llama-stack pip package will work
python -m llama_stack.apis.inference.client localhost 5000
```
This will run the chat completion client and query the distribution's `/inference/chat_completion` API.
Here is an example output:
```
User>hello world, write me a 2 sentence poem about the moon
Assistant> Here's a 2-sentence poem about the moon:
The moon glows softly in the midnight sky,
A beacon of wonder, as it passes by.
```
Similarly you can test safety (if you configured llama-guard and/or prompt-guard shields) by:
```
python -m llama_stack.apis.safety.client localhost 5000
```
You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.
53
docs/source/conf.py Normal file
View file
@ -0,0 +1,53 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
project = "llama-stack"
copyright = "2024, Meta"
author = "Meta"
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
extensions = ["myst_parser"]
templates_path = ["_templates"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
myst_enable_extensions = [
"amsmath",
"attrs_inline",
"colon_fence",
"deflist",
"dollarmath",
"fieldlist",
"html_admonition",
"html_image",
# "linkify",
"replacements",
"smartquotes",
"strikethrough",
"substitution",
"tasklist",
]
# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
html_theme = "alabaster"
html_theme_options = {
"canonical_url": "https://github.com/meta-llama/llama-stack",
}
html_static_path = ["../_static"]
html_logo = "../_static/llama-stack-logo.png"
View file
@ -0,0 +1,430 @@
# Getting Started
This guide will walk you through the steps to get started with an end-to-end flow for Llama Stack. It mainly focuses on building a LlamaStack distribution and starting up a LlamaStack server. Please see our [documentation](https://github.com/meta-llama/llama-stack/README.md) on what you can do with Llama Stack, and [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main) for example apps built with Llama Stack.
## Installation
The `llama` CLI tool helps you set up and use the Llama toolchain & agentic systems. It should be available on your path after installing the `llama-stack` package.
You can install this repository as a [package](https://pypi.org/project/llama-stack/) with `pip install llama-stack`
If you want to install from source:
```bash
mkdir -p ~/local
cd ~/local
git clone git@github.com:meta-llama/llama-stack.git
conda create -n stack python=3.10
conda activate stack
cd llama-stack
$CONDA_PREFIX/bin/pip install -e .
```
For what you can do with the Llama CLI, please refer to [CLI Reference](./cli_reference.md).
## Quickly Starting a Llama Stack Server
### Starting up server via docker
We provide two pre-built Docker images of the Llama Stack distribution, which can be found at the following links.
- [llamastack-local-gpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-gpu/general)
- This is a packaged version with our local meta-reference implementations, where you will be running inference locally with downloaded Llama model checkpoints.
- [llamastack-local-cpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-cpu/general)
- This is a lite version with remote inference where you can hook up to your favourite remote inference framework (e.g. ollama, fireworks, together, tgi) for running inference without a GPU.
> [!NOTE]
> For GPU inference, you need to set the following environment variable to specify the local directory containing your model checkpoints and to enable GPU inference when starting the docker container.
```
export LLAMA_CHECKPOINT_DIR=~/.llama
```
> [!NOTE]
> `~/.llama` should be the path containing downloaded weights of Llama models.
To download and start running a pre-built docker container, you may use the following commands:
```
docker run -it -p 5000:5000 -v ~/.llama:/root/.llama --gpus=all llamastack/llamastack-local-gpu
```
> [!TIP]
> Pro Tip: You may use `docker compose up` for starting up a distribution with remote providers (e.g. TGI) using [llamastack-local-cpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-cpu/general). You can check out [these scripts](https://github.com/meta-llama/llama-stack/llama_stack/distribution/docker/README.md) to help you get started; a rough sketch follows below.
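As a rough sketch, such a setup might look like the compose file below. This is illustrative only; the service name and options are assumptions, so use the linked scripts for the real configuration.

```bash
# Write a hypothetical compose file for the CPU image and start it.
cat > docker-compose.yaml <<'EOF'
services:
  llamastack:
    image: llamastack/llamastack-local-cpu
    ports:
      - "5000:5000"
EOF
docker compose up
```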
### Build->Configure->Run Llama Stack server via conda
You may also build a LlamaStack distribution from scratch, configure it, and start running the distribution. This is useful for developing on LlamaStack.
**`llama stack build`**
- You'll be prompted to enter build information interactively.
```
llama stack build
> Enter an unique name for identifying your Llama Stack build distribution (e.g. my-local-stack): my-local-stack
> Enter the image type you want your distribution to be built with (docker or conda): conda
Llama Stack is composed of several APIs working together. Let's configure the providers (implementations) you want to use for these APIs.
> Enter the API provider for the inference API: (default=meta-reference): meta-reference
> Enter the API provider for the safety API: (default=meta-reference): meta-reference
> Enter the API provider for the agents API: (default=meta-reference): meta-reference
> Enter the API provider for the memory API: (default=meta-reference): meta-reference
> Enter the API provider for the telemetry API: (default=meta-reference): meta-reference
> (Optional) Enter a short description for your Llama Stack distribution:
Build spec configuration saved at ~/.conda/envs/llamastack-my-local-stack/my-local-stack-build.yaml
You can now run `llama stack configure my-local-stack`
```
**`llama stack configure`**
- Run `llama stack configure <name>` with the name you have previously defined in `build` step.
```
llama stack configure <name>
```
- You will be prompted to enter configurations for your Llama Stack
```
$ llama stack configure my-local-stack
Could not find my-local-stack. Trying conda build name instead...
Configuring API `inference`...
=== Configuring provider `meta-reference` for API inference...
Enter value for model (default: Llama3.1-8B-Instruct) (required):
Do you want to configure quantization? (y/n): n
Enter value for torch_seed (optional):
Enter value for max_seq_len (default: 4096) (required):
Enter value for max_batch_size (default: 1) (required):
Configuring API `safety`...
=== Configuring provider `meta-reference` for API safety...
Do you want to configure llama_guard_shield? (y/n): n
Do you want to configure prompt_guard_shield? (y/n): n
Configuring API `agents`...
=== Configuring provider `meta-reference` for API agents...
Enter `type` for persistence_store (options: redis, sqlite, postgres) (default: sqlite):
Configuring SqliteKVStoreConfig:
Enter value for namespace (optional):
Enter value for db_path (default: /home/xiyan/.llama/runtime/kvstore.db) (required):
Configuring API `memory`...
=== Configuring provider `meta-reference` for API memory...
> Please enter the supported memory bank type your provider has for memory: vector
Configuring API `telemetry`...
=== Configuring provider `meta-reference` for API telemetry...
> YAML configuration has been written to ~/.llama/builds/conda/my-local-stack-run.yaml.
You can now run `llama stack run my-local-stack --port PORT`
```
**`llama stack run`**
- Run `llama stack run <name>` with the name you have previously defined.
```
llama stack run my-local-stack
...
> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1
...
Finished model load YES READY
Serving POST /inference/chat_completion
Serving POST /inference/completion
Serving POST /inference/embeddings
Serving POST /memory_banks/create
Serving DELETE /memory_bank/documents/delete
Serving DELETE /memory_banks/drop
Serving GET /memory_bank/documents/get
Serving GET /memory_banks/get
Serving POST /memory_bank/insert
Serving GET /memory_banks/list
Serving POST /memory_bank/query
Serving POST /memory_bank/update
Serving POST /safety/run_shield
Serving POST /agentic_system/create
Serving POST /agentic_system/session/create
Serving POST /agentic_system/turn/create
Serving POST /agentic_system/delete
Serving POST /agentic_system/session/delete
Serving POST /agentic_system/session/get
Serving POST /agentic_system/step/get
Serving POST /agentic_system/turn/get
Serving GET /telemetry/get_trace
Serving POST /telemetry/log_event
Listening on :::5000
INFO: Started server process [587053]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
```
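A couple of useful variations when starting the server (both flags appear elsewhere in this guide):

```bash
# Serve on a custom port instead of the default 5000.
llama stack run my-local-stack --port 8080

# Disable IPv6 if binding on :: causes trouble on your machine.
llama stack run my-local-stack --disable-ipv6
```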
### End-to-end flow of building, configuring, running, and testing a Distribution
#### Step 1. Build
In the following steps, imagine we'll be working with a `Meta-Llama3.1-8B-Instruct` model. We will name our build `8b-instruct` to help us remember the config. We will then build our distribution (as a Conda environment or a Docker image). In this step, we will specify:
- `name`: the name for our distribution (e.g. `8b-instruct`)
- `image_type`: our build image type (`conda | docker`)
- `distribution_spec`: our distribution specs for specifying API providers
- `description`: a short description of the configurations for the distribution
- `providers`: specifies the underlying implementation for serving each API endpoint
- `image_type`: `conda` | `docker` to specify whether to build the distribution in the form of Docker image or Conda environment.
At the end of the build command, a file named `<name>-build.yaml` storing the build configurations will be generated and saved at the output file path specified at the end of the command.
#### Building from scratch
- For a new user, you can start by running `llama stack build`, which launches an interactive wizard that prompts you for the build configurations.
```
llama stack build
```
Running the command above lets you fill in the configuration for your Llama Stack distribution; you will see output like the following.
```
> Enter an unique name for identifying your Llama Stack build distribution (e.g. my-local-stack): 8b-instruct
> Enter the image type you want your distribution to be built with (docker or conda): conda
Llama Stack is composed of several APIs working together. Let's configure the providers (implementations) you want to use for these APIs.
> Enter the API provider for the inference API: (default=meta-reference): meta-reference
> Enter the API provider for the safety API: (default=meta-reference): meta-reference
> Enter the API provider for the agents API: (default=meta-reference): meta-reference
> Enter the API provider for the memory API: (default=meta-reference): meta-reference
> Enter the API provider for the telemetry API: (default=meta-reference): meta-reference
> (Optional) Enter a short description for your Llama Stack distribution:
Build spec configuration saved at ~/.conda/envs/llamastack-my-local-llama-stack/8b-instruct-build.yaml
```
**Ollama (optional)**
If you plan to use Ollama for inference, you'll need to install the server [via these instructions](https://ollama.com/download).
#### Building from templates
- To build a distribution backed by alternative API providers, we provide distribution templates to help you get started.
The following command will allow you to see the available templates and their corresponding providers.
```
llama stack build --list-templates
```
![alt text](https://github.com/meta-llama/llama-stack/blob/main/docs/resources/list-templates.png)
You may then pick a template to build your distribution with providers suited to your needs.
```
llama stack build --template local-tgi --name my-tgi-stack
```
```
$ llama stack build --template local-tgi --name my-tgi-stack
...
...
Build spec configuration saved at ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml
You may now run `llama stack configure my-tgi-stack` or `llama stack configure ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml`
```
#### Building from config file
- In addition to templates, you may customize the build by editing a config file and building from it with the following command.
- The config file will have contents similar to those in `llama_stack/distributions/templates/`.
```
$ cat llama_stack/distribution/templates/local-ollama-build.yaml
name: local-ollama
distribution_spec:
description: Like local, but use ollama for running LLM inference
providers:
inference: remote::ollama
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda
```
```
llama stack build --config llama_stack/distribution/templates/local-ollama-build.yaml
```
#### How to build distribution with Docker image
> [!TIP]
> Podman is supported as an alternative to Docker. Set `DOCKER_BINARY` to `podman` in your environment to use Podman.
To build a docker image, you may start off from a template and use the `--image-type docker` flag to specify `docker` as the build image type.
```
llama stack build --template local --image-type docker --name docker-0
```
Alternatively, you may use a config file and set `image_type` to `docker` in your `<name>-build.yaml` file, then run `llama stack build --config <name>-build.yaml`. The `<name>-build.yaml` will be of contents like:
```
name: local-docker-example
distribution_spec:
description: Use code from `llama_stack` itself to serve all llama stack APIs
docker_image: null
providers:
inference: meta-reference
memory: meta-reference-faiss
safety: meta-reference
agentic_system: meta-reference
telemetry: console
image_type: docker
```
The following command allows you to build a Docker image with the name `<name>`
```
llama stack build --config <name>-build.yaml
Dockerfile created successfully in /tmp/tmp.I0ifS2c46A/Dockerfile
FROM python:3.10-slim
WORKDIR /app
...
...
You can run it with: podman run -p 8000:8000 llamastack-docker-local
Build spec configuration saved at ~/.llama/distributions/docker/docker-local-build.yaml
```
### Step 2. Configure
After our distribution is built (either as a Docker image or a Conda environment), we will run the following command to configure it:
```
llama stack configure [ <name> | <docker-image-name> | <path/to/name.build.yaml>]
```
- For `conda` environments: <path/to/name.build.yaml> would be the generated build spec saved from Step 1.
- For `docker` images downloaded from Dockerhub, you could also use <docker-image-name> as the argument.
- Run `docker images` to check the list of available images on your machine.
```
$ llama stack configure 8b-instruct
Configuring API: inference (meta-reference)
Enter value for model (existing: Meta-Llama3.1-8B-Instruct) (required):
Enter value for quantization (optional):
Enter value for torch_seed (optional):
Enter value for max_seq_len (existing: 4096) (required):
Enter value for max_batch_size (existing: 1) (required):
Configuring API: memory (meta-reference-faiss)
Configuring API: safety (meta-reference)
Do you want to configure llama_guard_shield? (y/n): y
Entering sub-configuration for llama_guard_shield:
Enter value for model (default: Llama-Guard-3-1B) (required):
Enter value for excluded_categories (default: []) (required):
Enter value for disable_input_check (default: False) (required):
Enter value for disable_output_check (default: False) (required):
Do you want to configure prompt_guard_shield? (y/n): y
Entering sub-configuration for prompt_guard_shield:
Enter value for model (default: Prompt-Guard-86M) (required):
Configuring API: agentic_system (meta-reference)
Enter value for brave_search_api_key (optional):
Enter value for bing_search_api_key (optional):
Enter value for wolfram_api_key (optional):
Configuring API: telemetry (console)
YAML configuration has been written to ~/.llama/builds/conda/8b-instruct-run.yaml
```
After this step is successful, you should be able to find the run configuration spec in `~/.llama/builds/conda/8b-instruct-run.yaml`. You may edit this file to change the settings.
As you can see, with the basic configuration above we configured:
- inference to run on model `Meta-Llama3.1-8B-Instruct` (obtained from `llama model list`)
- Llama Guard safety shield with model `Llama-Guard-3-1B`
- Prompt Guard safety shield with model `Prompt-Guard-86M`
To see how these configurations are stored as YAML, check out the file printed at the end of the configuration step.
Note that all configurations as well as models are stored in `~/.llama`.
### Step 3. Run
Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end of the `llama stack configure` step.
```
llama stack run 8b-instruct
```
You should see the Llama Stack server start and print the APIs that it supports:
```
$ llama stack run 8b-instruct
> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1
Loaded in 19.28 seconds
NCCL version 2.20.5+cuda12.4
Finished model load YES READY
Serving POST /inference/batch_chat_completion
Serving POST /inference/batch_completion
Serving POST /inference/chat_completion
Serving POST /inference/completion
Serving POST /safety/run_shield
Serving POST /agentic_system/memory_bank/attach
Serving POST /agentic_system/create
Serving POST /agentic_system/session/create
Serving POST /agentic_system/turn/create
Serving POST /agentic_system/delete
Serving POST /agentic_system/session/delete
Serving POST /agentic_system/memory_bank/detach
Serving POST /agentic_system/session/get
Serving POST /agentic_system/step/get
Serving POST /agentic_system/turn/get
Listening on :::5000
INFO: Started server process [453333]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
```
> [!NOTE]
> Configuration is in `~/.llama/builds/local/conda/8b-instruct-run.yaml`. Feel free to increase `max_seq_len`.

> [!IMPORTANT]
> The "local" distribution inference server currently only supports CUDA. It will not work on Apple Silicon machines.

> [!TIP]
> You might need to use the flag `--disable-ipv6` to disable IPv6 support.
This server is running a Llama model locally.
### Step 4. Test with Client
Once the server is set up, we can test it with a client and see example outputs.
```
cd /path/to/llama-stack
conda activate <env> # any environment containing the llama-stack pip package will work
python -m llama_stack.apis.inference.client localhost 5000
```
This will run the chat completion client and query the distribution's `/inference/chat_completion` API.
Here is an example output:
```
User>hello world, write me a 2 sentence poem about the moon
Assistant> Here's a 2-sentence poem about the moon:
The moon glows softly in the midnight sky,
A beacon of wonder, as it passes by.
```
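Under the hood, the client is simply issuing an HTTP POST against the server. Here is a minimal sketch using `requests` (the request body shape is an assumption for illustration; the bundled client above is the authoritative reference):
```
# Minimal sketch: call the /inference/chat_completion endpoint directly.
# The body fields below are assumed for illustration, not a guaranteed schema.
import json

import requests

response = requests.post(
    "http://localhost:5000/inference/chat_completion",
    json={
        "model": "Meta-Llama3.1-8B-Instruct",
        "messages": [
            {"role": "user", "content": "hello world, write me a 2 sentence poem about the moon"}
        ],
        "stream": False,
    },
)
response.raise_for_status()
print(json.dumps(response.json(), indent=2))
```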
Similarly, you can test safety (if you configured the Llama Guard and/or Prompt Guard shields) by running:
```
python -m llama_stack.apis.safety.client localhost 5000
```
Check out our client SDKs for connecting to a Llama Stack server in your preferred language; you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [node](https://github.com/meta-llama/llama-stack-client-node), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) to quickly build your applications.
You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.
40 docs/source/index.md Normal file
View file
@@ -0,0 +1,40 @@
# llama-stack documentation
Llama Stack defines and standardizes the building blocks needed to bring generative AI applications to market. It empowers developers building agentic applications by giving them options to operate in various environments (on-prem, cloud, single-node, on-device) while relying on a standard API interface and the same DevEx that is certified by Meta.

These blocks span the entire development lifecycle: from model training and fine-tuning, through product evaluation, to building and running AI agents in production. Beyond definitions, we are building providers for the Llama Stack APIs: we are developing open-source versions and partnering with providers, ensuring developers can assemble AI solutions using consistent, interlocking pieces across platforms. The ultimate goal is to accelerate innovation in the AI space.

The Stack APIs are rapidly improving, but they are still very much a work in progress, and we invite feedback as well as direct contributions.
![Llama Stack](../_static/llama-stack.png)
## APIs
The Llama Stack consists of the following set of APIs:
- Inference
- Safety
- Memory
- Agentic System
- Evaluation
- Post Training
- Synthetic Data Generation
- Reward Scoring
Each of the APIs is itself a collection of REST endpoints.
## API Providers
A Provider is what makes an API real -- it provides the actual implementation backing the API.

As an example, for Inference, the implementation could be backed by open-source libraries like [ torch | vLLM | TensorRT ].
A provider can also be just a pointer to a remote REST service -- for example, cloud providers or dedicated inference providers could serve these APIs.
## Distribution
A Distribution is where APIs and Providers are assembled together to provide a consistent whole to the end application developer. You can mix-and-match providers -- some could be backed by local code and some could be remote. As a hobbyist, you can serve a small model locally, but choose a cloud provider for a large model. Regardless, the higher-level APIs your app needs to work with don't need to change at all. You can even imagine moving across the server / mobile-device boundary while always using the same uniform set of APIs for developing generative AI applications.
```{toctree}
cli_reference.md
getting_started.md
```
View file
@@ -421,10 +421,8 @@ class Agents(Protocol):
         agent_config: AgentConfig,
     ) -> AgentCreateResponse: ...

-    # This method is not `async def` because it can result in either an
-    # `AsyncGenerator` or a `AgentTurnCreateResponse` depending on the value of `stream`.
     @webmethod(route="/agents/turn/create")
-    def create_agent_turn(
+    async def create_agent_turn(
         self,
         agent_id: str,
         session_id: str,
View file
@@ -67,14 +67,14 @@ class AgentsClient(Agents):
         response.raise_for_status()
         return AgentSessionCreateResponse(**response.json())

-    def create_agent_turn(
+    async def create_agent_turn(
         self,
         request: AgentTurnCreateRequest,
     ) -> AsyncGenerator:
         if request.stream:
             return self._stream_agent_turn(request)
         else:
-            return self._nonstream_agent_turn(request)
+            return await self._nonstream_agent_turn(request)

     async def _stream_agent_turn(
         self, request: AgentTurnCreateRequest
@@ -126,7 +126,7 @@ async def _run_agent(
     for content in user_prompts:
         cprint(f"User> {content}", color="white", attrs=["bold"])
-        iterator = api.create_agent_turn(
+        iterator = await api.create_agent_turn(
             AgentTurnCreateRequest(
                 agent_id=create_response.agent_id,
                 session_id=session_response.session_id,
View file
@@ -180,5 +180,5 @@ class EventLogger:
                     color="cyan",
                 )
-            preivous_event_type = event_type
+            previous_event_type = event_type
             previous_step_type = step_type
View file
@@ -42,10 +42,10 @@ class InferenceClient(Inference):
     async def shutdown(self) -> None:
         pass

-    def completion(self, request: CompletionRequest) -> AsyncGenerator:
+    async def completion(self, request: CompletionRequest) -> AsyncGenerator:
         raise NotImplementedError()

-    def chat_completion(
+    async def chat_completion(
         self,
         model: str,
         messages: List[Message],
@@ -139,7 +139,8 @@ async def run_main(
     else:
         logprobs_config = None

-    iterator = client.chat_completion(
+    assert stream, "Non streaming not supported here"
+    iterator = await client.chat_completion(
         model=model,
         messages=[message],
         stream=stream,
View file
@@ -88,7 +88,8 @@ class CompletionRequest(BaseModel):
 class CompletionResponse(BaseModel):
     """Completion response."""

-    completion_message: CompletionMessage
+    content: str
+    stop_reason: StopReason
     logprobs: Optional[List[TokenLogProbs]] = None
@@ -113,7 +114,7 @@ class BatchCompletionRequest(BaseModel):
 class BatchCompletionResponse(BaseModel):
     """Batch completion response."""

-    completion_message_batch: List[CompletionMessage]
+    batch: List[CompletionResponse]

 @json_schema_type
@@ -165,7 +166,7 @@ class BatchChatCompletionRequest(BaseModel):
 @json_schema_type
 class BatchChatCompletionResponse(BaseModel):
-    completion_message_batch: List[CompletionMessage]
+    batch: List[ChatCompletionResponse]

 @json_schema_type
@@ -181,10 +182,8 @@ class ModelStore(Protocol):
 class Inference(Protocol):
     model_store: ModelStore

-    # This method is not `async def` because it can result in either an
-    # `AsyncGenerator` or a `CompletionResponse` depending on the value of `stream`.
     @webmethod(route="/inference/completion")
-    def completion(
+    async def completion(
         self,
         model: str,
         content: InterleavedTextMedia,
@@ -196,7 +195,7 @@ class Inference(Protocol):
     # This method is not `async def` because it can result in either an
     # `AsyncGenerator` or a `ChatCompletionResponse` depending on the value of `stream`.
     @webmethod(route="/inference/chat_completion")
-    def chat_completion(
+    async def chat_completion(
         self,
         model: str,
         messages: List[Message],
View file
@@ -92,6 +92,21 @@ async def run_main(host: str, port: int, stream: bool):
     response = await client.list_memory_banks()
     cprint(f"list_memory_banks response={response}", "green")

+    # register memory bank for the first time
+    response = await client.register_memory_bank(
+        VectorMemoryBankDef(
+            identifier="test_bank2",
+            embedding_model="all-MiniLM-L6-v2",
+            chunk_size_in_tokens=512,
+            overlap_size_in_tokens=64,
+        )
+    )
+    cprint(f"register_memory_bank response={response}", "blue")
+
+    # list again after registering
+    response = await client.list_memory_banks()
+    cprint(f"list_memory_banks response={response}", "green")

 def main(host: str, port: int, stream: bool = True):
     asyncio.run(run_main(host, port, stream))
View file
@@ -152,27 +152,29 @@ def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
         parser.error("Please provide a model id")
         return

-    prompt_guard = prompt_guard_model_sku()
-    if args.model_id == prompt_guard.model_id:
-        model = prompt_guard
-        info = prompt_guard_download_info()
-    else:
-        model = resolve_model(args.model_id)
-        if model is None:
-            parser.error(f"Model {args.model_id} not found")
-            return
-        info = llama_meta_net_info(model)
+    # Check if model_id is a comma-separated list
+    model_ids = [model_id.strip() for model_id in args.model_id.split(",")]

-    if args.source == "huggingface":
-        _hf_download(model, args.hf_token, args.ignore_patterns, parser)
-    else:
-        meta_url = args.meta_url
-        if not meta_url:
-            meta_url = input(
-                "Please provide the signed URL you received via email after visiting https://www.llama.com/llama-downloads/ (e.g., https://llama3-1.llamameta.net/*?Policy...): "
+    prompt_guard = prompt_guard_model_sku()
+    for model_id in model_ids:
+        if model_id == prompt_guard.model_id:
+            model = prompt_guard
+            info = prompt_guard_download_info()
+        else:
+            model = resolve_model(model_id)
+            if model is None:
+                parser.error(f"Model {model_id} not found")
+                continue
+            info = llama_meta_net_info(model)
+        if args.source == "huggingface":
+            _hf_download(model, args.hf_token, args.ignore_patterns, parser)
+        else:
+            meta_url = args.meta_url or input(
+                f"Please provide the signed URL for model {model_id} you received via email after visiting https://www.llama.com/llama-downloads/ (e.g., https://llama3-1.llamameta.net/*?Policy...): "
             )
-        assert meta_url is not None and "llamameta.net" in meta_url
+            assert "llamameta.net" in meta_url
         _meta_download(model, meta_url, info)

 class ModelEntry(BaseModel):
View file
@@ -13,7 +13,7 @@ from functools import lru_cache
 from pathlib import Path

 TEMPLATES_PATH = (
-    Path(os.path.relpath(__file__)).parent.parent.parent / "distribution" / "templates"
+    Path(os.path.relpath(__file__)).parent.parent.parent.parent / "distributions"
 )
View file
@@ -15,7 +15,7 @@ special_pip_deps="$6"
 set -euo pipefail

 build_name="$1"
-image_name="llamastack-$build_name"
+image_name="distribution-$build_name"
 docker_base=$2
 build_file_path=$3
 host_build_dir=$4
View file
@@ -55,7 +55,7 @@ class ProviderWithSpec(Provider):
 # TODO: this code is not very straightforward to follow and needs one more round of refactoring
-async def resolve_impls_with_routing(run_config: StackRunConfig) -> Dict[Api, Any]:
+async def resolve_impls(run_config: StackRunConfig) -> Dict[Api, Any]:
     """
     Does two things:
     - flatmaps, sorts and resolves the providers in dependency order
View file
@@ -70,7 +70,7 @@ class InferenceRouter(Inference):
     async def register_model(self, model: ModelDef) -> None:
         await self.routing_table.register_model(model)

-    def chat_completion(
+    async def chat_completion(
         self,
         model: str,
         messages: List[Message],
@@ -93,11 +93,11 @@ class InferenceRouter(Inference):
         )
         provider = self.routing_table.get_provider_impl(model)
         if stream:
-            return (chunk async for chunk in provider.chat_completion(**params))
+            return (chunk async for chunk in await provider.chat_completion(**params))
         else:
-            return provider.chat_completion(**params)
+            return await provider.chat_completion(**params)

-    def completion(
+    async def completion(
         self,
         model: str,
         content: InterleavedTextMedia,
@@ -114,9 +114,9 @@ class InferenceRouter(Inference):
             logprobs=logprobs,
         )
         if stream:
-            return (chunk async for chunk in provider.completion(**params))
+            return (chunk async for chunk in await provider.completion(**params))
         else:
-            return provider.completion(**params)
+            return await provider.completion(**params)

     async def embeddings(
         self,
View file
@@ -87,8 +87,21 @@ class CommonRoutingTableImpl(RoutingTable):
     def get_provider_impl(
         self, routing_key: str, provider_id: Optional[str] = None
     ) -> Any:
+        def apiname_object():
+            if isinstance(self, ModelsRoutingTable):
+                return ("Inference", "model")
+            elif isinstance(self, ShieldsRoutingTable):
+                return ("Safety", "shield")
+            elif isinstance(self, MemoryBanksRoutingTable):
+                return ("Memory", "memory_bank")
+            else:
+                raise ValueError("Unknown routing table type")
+
         if routing_key not in self.registry:
-            raise ValueError(f"`{routing_key}` not registered")
+            apiname, objname = apiname_object()
+            raise ValueError(
+                f"`{routing_key}` not registered. Make sure there is an {apiname} provider serving this {objname}."
+            )

         objs = self.registry[routing_key]
         for obj in objs:
@@ -110,10 +123,16 @@ class CommonRoutingTableImpl(RoutingTable):
     async def register_object(self, obj: RoutableObjectWithProvider):
         entries = self.registry.get(obj.identifier, [])
         for entry in entries:
-            if entry.provider_id == obj.provider_id:
-                print(f"`{obj.identifier}` already registered with `{obj.provider_id}`")
+            if entry.provider_id == obj.provider_id or not obj.provider_id:
+                print(
+                    f"`{obj.identifier}` already registered with `{entry.provider_id}`"
+                )
                 return

+        # if provider_id is not specified, we'll pick an arbitrary one from existing entries
+        if not obj.provider_id and len(self.impls_by_provider_id) > 0:
+            obj.provider_id = list(self.impls_by_provider_id.keys())[0]
+
         if obj.provider_id not in self.impls_by_provider_id:
             raise ValueError(f"Provider `{obj.provider_id}` not found")
View file
@@ -37,7 +37,7 @@ from llama_stack.providers.utils.telemetry.tracing import (
 from llama_stack.distribution.datatypes import *  # noqa: F403
 from llama_stack.distribution.request_headers import set_request_provider_data
-from llama_stack.distribution.resolver import resolve_impls_with_routing
+from llama_stack.distribution.resolver import resolve_impls

 from .endpoints import get_all_api_endpoints
@@ -203,7 +203,7 @@ async def maybe_await(value):
 async def sse_generator(event_gen):
     try:
-        async for item in event_gen:
+        async for item in await event_gen:
             yield create_sse_event(item)
             await asyncio.sleep(0.01)
     except asyncio.CancelledError:
@@ -276,7 +276,7 @@ def main(
     app = FastAPI()

-    impls = asyncio.run(resolve_impls_with_routing(config))
+    impls = asyncio.run(resolve_impls(config))
     if Api.telemetry in impls:
         setup_logger(impls[Api.telemetry])
View file
@@ -1,15 +0,0 @@
-name: local-cpu
-distribution_spec:
-  description: remote inference + local safety/agents/memory
-  docker_image: null
-  providers:
-    inference:
-      - remote::ollama
-      - remote::tgi
-      - remote::together
-      - remote::fireworks
-    safety: meta-reference
-    agents: meta-reference
-    memory: meta-reference
-    telemetry: meta-reference
-image_type: docker
View file
@@ -1,11 +0,0 @@
-name: local-gpu
-distribution_spec:
-  description: local meta reference
-  docker_image: null
-  providers:
-    inference: meta-reference
-    safety: meta-reference
-    agents: meta-reference
-    memory: meta-reference
-    telemetry: meta-reference
-image_type: docker
View file
@@ -1,10 +0,0 @@
-name: local-ollama
-distribution_spec:
-  description: Like local, but use ollama for running LLM inference
-  providers:
-    inference: remote::ollama
-    memory: meta-reference
-    safety: meta-reference
-    agents: meta-reference
-    telemetry: meta-reference
-image_type: conda
View file
@@ -1,10 +0,0 @@
-name: local-tgi
-distribution_spec:
-  description: Like local, but use a TGI server for running LLM inference.
-  providers:
-    inference: remote::tgi
-    memory: meta-reference
-    safety: meta-reference
-    agents: meta-reference
-    telemetry: meta-reference
-image_type: conda
View file
@@ -47,7 +47,7 @@ class BedrockInferenceAdapter(ModelRegistryHelper, Inference):
     async def shutdown(self) -> None:
         self.client.close()

-    def completion(
+    async def completion(
         self,
         model: str,
         content: InterleavedTextMedia,
@@ -283,7 +283,7 @@ class BedrockInferenceAdapter(ModelRegistryHelper, Inference):
         )
         return tool_config

-    def chat_completion(
+    async def chat_completion(
         self,
         model: str,
         messages: List[Message],
View file
@@ -7,10 +7,11 @@
 from .config import DatabricksImplConfig
 from .databricks import DatabricksInferenceAdapter

 async def get_adapter_impl(config: DatabricksImplConfig, _deps):
     assert isinstance(
         config, DatabricksImplConfig
     ), f"Unexpected config type: {type(config)}"
     impl = DatabricksInferenceAdapter(config)
     await impl.initialize()
     return impl
View file
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Optional

 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
@@ -19,4 +18,4 @@ class DatabricksImplConfig(BaseModel):
     api_token: str = Field(
         default=None,
         description="The Databricks API token",
     )
View file
@@ -48,10 +48,17 @@ class DatabricksInferenceAdapter(ModelRegistryHelper, Inference):
     async def shutdown(self) -> None:
         pass

-    def completion(self, request: CompletionRequest) -> AsyncGenerator:
+    async def completion(
+        self,
+        model: str,
+        content: InterleavedTextMedia,
+        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
+    ) -> AsyncGenerator:
         raise NotImplementedError()

-    def chat_completion(
+    async def chat_completion(
         self,
         model: str,
         messages: List[Message],
@@ -77,14 +84,14 @@ class DatabricksInferenceAdapter(ModelRegistryHelper, Inference):
         if stream:
             return self._stream_chat_completion(request, client)
         else:
-            return self._nonstream_chat_completion(request, client)
+            return await self._nonstream_chat_completion(request, client)

     async def _nonstream_chat_completion(
         self, request: ChatCompletionRequest, client: OpenAI
     ) -> ChatCompletionResponse:
         params = self._get_params(request)
         r = client.completions.create(**params)
-        return process_chat_completion_response(request, r, self.formatter)
+        return process_chat_completion_response(r, self.formatter)

     async def _stream_chat_completion(
         self, request: ChatCompletionRequest, client: OpenAI
@@ -98,7 +105,7 @@ class DatabricksInferenceAdapter(ModelRegistryHelper, Inference):
         stream = _to_async_generator()
         async for chunk in process_chat_completion_stream_response(
-            request, stream, self.formatter
+            stream, self.formatter
         ):
             yield chunk
View file
@@ -51,7 +51,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference):
     async def shutdown(self) -> None:
         pass

-    def completion(
+    async def completion(
         self,
         model: str,
         content: InterleavedTextMedia,
@@ -61,7 +61,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference):
     ) -> AsyncGenerator:
         raise NotImplementedError()

-    def chat_completion(
+    async def chat_completion(
         self,
         model: str,
         messages: List[Message],
@@ -87,14 +87,14 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference):
         if stream:
             return self._stream_chat_completion(request, client)
         else:
-            return self._nonstream_chat_completion(request, client)
+            return await self._nonstream_chat_completion(request, client)

     async def _nonstream_chat_completion(
         self, request: ChatCompletionRequest, client: Fireworks
     ) -> ChatCompletionResponse:
         params = self._get_params(request)
         r = await client.completion.acreate(**params)
-        return process_chat_completion_response(request, r, self.formatter)
+        return process_chat_completion_response(r, self.formatter)

     async def _stream_chat_completion(
         self, request: ChatCompletionRequest, client: Fireworks
@@ -103,7 +103,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference):
         stream = client.completion.acreate(**params)
         async for chunk in process_chat_completion_stream_response(
-            request, stream, self.formatter
+            stream, self.formatter
         ):
             yield chunk
View file
@@ -23,9 +23,12 @@ from llama_stack.providers.utils.inference.openai_compat import (
     OpenAICompatCompletionResponse,
     process_chat_completion_response,
     process_chat_completion_stream_response,
+    process_completion_response,
+    process_completion_stream_response,
 )
 from llama_stack.providers.utils.inference.prompt_adapter import (
     chat_completion_request_to_prompt,
+    completion_request_to_prompt,
 )

 OLLAMA_SUPPORTED_MODELS = {
@@ -33,7 +36,8 @@ OLLAMA_SUPPORTED_MODELS = {
     "Llama3.1-70B-Instruct": "llama3.1:70b-instruct-fp16",
     "Llama3.2-1B-Instruct": "llama3.2:1b-instruct-fp16",
     "Llama3.2-3B-Instruct": "llama3.2:3b-instruct-fp16",
-    "Llama-Guard-3-8B": "xe/llamaguard3:latest",
+    "Llama-Guard-3-8B": "llama-guard3:8b",
+    "Llama-Guard-3-1B": "llama-guard3:1b",
 }
@@ -84,7 +88,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
         return ret

-    def completion(
+    async def completion(
         self,
         model: str,
         content: InterleavedTextMedia,
@@ -92,9 +96,66 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
         stream: Optional[bool] = False,
         logprobs: Optional[LogProbConfig] = None,
     ) -> AsyncGenerator:
-        raise NotImplementedError()
+        request = CompletionRequest(
+            model=model,
+            content=content,
+            sampling_params=sampling_params,
+            stream=stream,
+            logprobs=logprobs,
+        )
+        if stream:
+            return self._stream_completion(request)
+        else:
+            return await self._nonstream_completion(request)

-    def chat_completion(
+    def _get_params_for_completion(self, request: CompletionRequest) -> dict:
+        sampling_options = get_sampling_options(request)
+        # This is needed since the Ollama API expects num_predict to be set
+        # for early truncation instead of max_tokens.
+        if sampling_options["max_tokens"] is not None:
+            sampling_options["num_predict"] = sampling_options["max_tokens"]
+        return {
+            "model": OLLAMA_SUPPORTED_MODELS[request.model],
+            "prompt": completion_request_to_prompt(request, self.formatter),
+            "options": sampling_options,
+            "raw": True,
+            "stream": request.stream,
+        }
+
+    async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator:
+        params = self._get_params_for_completion(request)
+
+        async def _generate_and_convert_to_openai_compat():
+            s = await self.client.generate(**params)
+            async for chunk in s:
+                choice = OpenAICompatCompletionChoice(
+                    finish_reason=chunk["done_reason"] if chunk["done"] else None,
+                    text=chunk["response"],
+                )
+                yield OpenAICompatCompletionResponse(
+                    choices=[choice],
+                )
+
+        stream = _generate_and_convert_to_openai_compat()
+        async for chunk in process_completion_stream_response(stream, self.formatter):
+            yield chunk
+
+    async def _nonstream_completion(self, request: CompletionRequest) -> AsyncGenerator:
+        params = self._get_params_for_completion(request)
+        r = await self.client.generate(**params)
+        assert isinstance(r, dict)
+
+        choice = OpenAICompatCompletionChoice(
+            finish_reason=r["done_reason"] if r["done"] else None,
+            text=r["response"],
+        )
+        response = OpenAICompatCompletionResponse(
+            choices=[choice],
+        )
+
+        return process_completion_response(response, self.formatter)
+
+    async def chat_completion(
         self,
         model: str,
         messages: List[Message],
@@ -118,7 +179,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
         if stream:
             return self._stream_chat_completion(request)
         else:
-            return self._nonstream_chat_completion(request)
+            return await self._nonstream_chat_completion(request)

     def _get_params(self, request: ChatCompletionRequest) -> dict:
         return {
@@ -143,7 +204,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
         response = OpenAICompatCompletionResponse(
             choices=[choice],
         )
-        return process_chat_completion_response(request, response, self.formatter)
+        return process_chat_completion_response(response, self.formatter)

     async def _stream_chat_completion(
         self, request: ChatCompletionRequest
@@ -163,7 +224,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
         stream = _generate_and_convert_to_openai_compat()
         async for chunk in process_chat_completion_stream_response(
-            request, stream, self.formatter
+            stream, self.formatter
         ):
             yield chunk
View file
@@ -66,7 +66,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
     async def shutdown(self) -> None:
         pass

-    def completion(
+    async def completion(
         self,
         model: str,
         content: InterleavedTextMedia,
@@ -76,7 +76,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
     ) -> AsyncGenerator:
         raise NotImplementedError()

-    def chat_completion(
+    async def chat_completion(
         self,
         model: str,
         messages: List[Message],
@@ -101,7 +101,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
         if stream:
             return self._stream_chat_completion(request)
         else:
-            return self._nonstream_chat_completion(request)
+            return await self._nonstream_chat_completion(request)

     async def _nonstream_chat_completion(
         self, request: ChatCompletionRequest
@@ -116,7 +116,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
         response = OpenAICompatCompletionResponse(
             choices=[choice],
         )
-        return process_chat_completion_response(request, response, self.formatter)
+        return process_chat_completion_response(response, self.formatter)

     async def _stream_chat_completion(
         self, request: ChatCompletionRequest
@@ -135,7 +135,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
         stream = _generate_and_convert_to_openai_compat()
         async for chunk in process_chat_completion_stream_response(
-            request, stream, self.formatter
+            stream, self.formatter
         ):
             yield chunk
View file
@@ -64,7 +64,7 @@ class TogetherInferenceAdapter(
     ) -> AsyncGenerator:
         raise NotImplementedError()

-    def chat_completion(
+    async def chat_completion(
         self,
         model: str,
         messages: List[Message],
@@ -101,14 +101,14 @@ class TogetherInferenceAdapter(
         if stream:
             return self._stream_chat_completion(request, client)
         else:
-            return self._nonstream_chat_completion(request, client)
+            return await self._nonstream_chat_completion(request, client)

     async def _nonstream_chat_completion(
         self, request: ChatCompletionRequest, client: Together
     ) -> ChatCompletionResponse:
         params = self._get_params(request)
         r = client.completions.create(**params)
-        return process_chat_completion_response(request, r, self.formatter)
+        return process_chat_completion_response(r, self.formatter)

     async def _stream_chat_completion(
         self, request: ChatCompletionRequest, client: Together
@@ -123,7 +123,7 @@ class TogetherInferenceAdapter(
         stream = _to_async_generator()
         async for chunk in process_chat_completion_stream_response(
-            request, stream, self.formatter
+            stream, self.formatter
         ):
             yield chunk
View file
@@ -0,0 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .config import VLLMImplConfig
from .vllm import VLLMInferenceAdapter

async def get_adapter_impl(config: VLLMImplConfig, _deps):
    assert isinstance(config, VLLMImplConfig), f"Unexpected config type: {type(config)}"
    impl = VLLMInferenceAdapter(config)
    await impl.initialize()
    return impl
View file
@@ -0,0 +1,22 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Optional

from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field

@json_schema_type
class VLLMImplConfig(BaseModel):
    url: Optional[str] = Field(
        default=None,
        description="The URL for the vLLM model serving endpoint",
    )
    api_token: Optional[str] = Field(
        default=None,
        description="The API token",
    )
View file
@@ -0,0 +1,152 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import AsyncGenerator

from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.datatypes import Message
from llama_models.llama3.api.tokenizer import Tokenizer
from openai import OpenAI

from llama_stack.apis.inference import *  # noqa: F403
from llama_stack.providers.datatypes import ModelsProtocolPrivate
from llama_stack.providers.utils.inference.openai_compat import (
    get_sampling_options,
    process_chat_completion_response,
    process_chat_completion_stream_response,
)
from llama_stack.providers.utils.inference.prompt_adapter import (
    chat_completion_request_to_prompt,
)

from .config import VLLMImplConfig

VLLM_SUPPORTED_MODELS = {
    "Llama3.1-8B": "meta-llama/Llama-3.1-8B",
    "Llama3.1-70B": "meta-llama/Llama-3.1-70B",
    "Llama3.1-405B:bf16-mp8": "meta-llama/Llama-3.1-405B",
    "Llama3.1-405B": "meta-llama/Llama-3.1-405B-FP8",
    "Llama3.1-405B:bf16-mp16": "meta-llama/Llama-3.1-405B",
    "Llama3.1-8B-Instruct": "meta-llama/Llama-3.1-8B-Instruct",
    "Llama3.1-70B-Instruct": "meta-llama/Llama-3.1-70B-Instruct",
    "Llama3.1-405B-Instruct:bf16-mp8": "meta-llama/Llama-3.1-405B-Instruct",
    "Llama3.1-405B-Instruct": "meta-llama/Llama-3.1-405B-Instruct-FP8",
    "Llama3.1-405B-Instruct:bf16-mp16": "meta-llama/Llama-3.1-405B-Instruct",
    "Llama3.2-1B": "meta-llama/Llama-3.2-1B",
    "Llama3.2-3B": "meta-llama/Llama-3.2-3B",
    "Llama3.2-11B-Vision": "meta-llama/Llama-3.2-11B-Vision",
    "Llama3.2-90B-Vision": "meta-llama/Llama-3.2-90B-Vision",
    "Llama3.2-1B-Instruct": "meta-llama/Llama-3.2-1B-Instruct",
    "Llama3.2-3B-Instruct": "meta-llama/Llama-3.2-3B-Instruct",
    "Llama3.2-11B-Vision-Instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "Llama3.2-90B-Vision-Instruct": "meta-llama/Llama-3.2-90B-Vision-Instruct",
    "Llama-Guard-3-11B-Vision": "meta-llama/Llama-Guard-3-11B-Vision",
    "Llama-Guard-3-1B:int4-mp1": "meta-llama/Llama-Guard-3-1B-INT4",
    "Llama-Guard-3-1B": "meta-llama/Llama-Guard-3-1B",
    "Llama-Guard-3-8B": "meta-llama/Llama-Guard-3-8B",
    "Llama-Guard-3-8B:int8-mp1": "meta-llama/Llama-Guard-3-8B-INT8",
    "Prompt-Guard-86M": "meta-llama/Prompt-Guard-86M",
    "Llama-Guard-2-8B": "meta-llama/Llama-Guard-2-8B",
}

class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
    def __init__(self, config: VLLMImplConfig) -> None:
        self.config = config
        self.formatter = ChatFormat(Tokenizer.get_instance())
        self.client = None

    async def initialize(self) -> None:
        self.client = OpenAI(base_url=self.config.url, api_key=self.config.api_token)

    async def register_model(self, model: ModelDef) -> None:
        raise ValueError("Model registration is not supported for vLLM models")

    async def shutdown(self) -> None:
        pass

    async def list_models(self) -> List[ModelDef]:
        return [
            ModelDef(identifier=model.id, llama_model=model.id)
            for model in self.client.models.list()
        ]

    def completion(
        self,
        model: str,
        content: InterleavedTextMedia,
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> Union[CompletionResponse, CompletionResponseStreamChunk]:
        raise NotImplementedError()

    def chat_completion(
        self,
        model: str,
        messages: List[Message],
        sampling_params: Optional[SamplingParams] = SamplingParams(),
        tools: Optional[List[ToolDefinition]] = None,
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
    ) -> AsyncGenerator:
        request = ChatCompletionRequest(
            model=model,
            messages=messages,
            sampling_params=sampling_params,
            tools=tools or [],
            tool_choice=tool_choice,
            tool_prompt_format=tool_prompt_format,
            stream=stream,
            logprobs=logprobs,
        )
        if stream:
            return self._stream_chat_completion(request, self.client)
        else:
            return self._nonstream_chat_completion(request, self.client)

    async def _nonstream_chat_completion(
        self, request: ChatCompletionRequest, client: OpenAI
    ) -> ChatCompletionResponse:
        params = self._get_params(request)
        r = client.completions.create(**params)
        return process_chat_completion_response(request, r, self.formatter)

    async def _stream_chat_completion(
        self, request: ChatCompletionRequest, client: OpenAI
    ) -> AsyncGenerator:
        params = self._get_params(request)

        # TODO: Can we use client.completions.acreate() or maybe there is another way to directly create an async
        # generator so this wrapper is not necessary?
        async def _to_async_generator():
            s = client.completions.create(**params)
            for chunk in s:
                yield chunk

        stream = _to_async_generator()
        async for chunk in process_chat_completion_stream_response(
            request, stream, self.formatter
        ):
            yield chunk

    def _get_params(self, request: ChatCompletionRequest) -> dict:
        return {
            "model": VLLM_SUPPORTED_MODELS[request.model],
            "prompt": chat_completion_request_to_prompt(request, self.formatter),
            "stream": request.stream,
            **get_sampling_options(request),
        }

    async def embeddings(
        self,
        model: str,
        contents: List[InterleavedTextMedia],
    ) -> EmbeddingsResponse:
        raise NotImplementedError()
View file
@@ -424,7 +424,7 @@ class ChatAgent(ShieldRunnerMixin):
         stop_reason = None

         with tracing.span("inference"):
-            async for chunk in self.inference_api.chat_completion(
+            async for chunk in await self.inference_api.chat_completion(
                 self.agent_config.model,
                 input_messages,
                 tools=self._get_tools(),
View file
@@ -105,7 +105,7 @@ class MetaReferenceAgentsImpl(Agents):
             session_id=session_id,
         )

-    def create_agent_turn(
+    async def create_agent_turn(
         self,
         agent_id: str,
         session_id: str,
View file
@@ -17,13 +17,22 @@ from llama_stack.providers.utils.inference import supported_inference_models

 class MetaReferenceInferenceConfig(BaseModel):
     model: str = Field(
-        default="Llama3.1-8B-Instruct",
+        default="Llama3.2-3B-Instruct",
         description="Model descriptor from `llama model list`",
     )
     torch_seed: Optional[int] = None
     max_seq_len: int = 4096
     max_batch_size: int = 1

+    # when this is False, we assume that the distributed process group is setup by someone
+    # outside of this code (e.g., when run inside `torchrun`). that is useful for clients
+    # (including our testing code) who might be using llama-stack as a library.
+    create_distributed_process_group: bool = True
+
+    # By default, the implementation will look at ~/.llama/checkpoints/<model> but you
+    # can override by specifying the directory explicitly
+    checkpoint_dir: Optional[str] = None
+
     @field_validator("model")
     @classmethod
     def validate_model(cls, model: str) -> str:
View file
@@ -23,11 +23,6 @@ from fairscale.nn.model_parallel.initialize import (
 )
 from llama_models.llama3.api.args import ModelArgs
 from llama_models.llama3.api.chat_format import ChatFormat, ModelInput
-from llama_models.llama3.api.datatypes import (
-    InterleavedTextMedia,
-    Message,
-    ToolPromptFormat,
-)
 from llama_models.llama3.api.tokenizer import Tokenizer
 from llama_models.llama3.reference_impl.model import Transformer
 from llama_models.llama3.reference_impl.multimodal.model import (
@@ -38,7 +33,11 @@ from llama_models.sku_list import resolve_model
 from pydantic import BaseModel
 from termcolor import cprint

+from llama_stack.apis.inference import *  # noqa: F403
 from llama_stack.distribution.utils.model_utils import model_local_dir
+from llama_stack.providers.utils.inference.prompt_adapter import (
+    chat_completion_request_to_messages,
+)

 from .config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig
@@ -98,7 +97,10 @@ class Llama:
             sys.stdout = open(os.devnull, "w")

         start_time = time.time()
-        ckpt_dir = model_checkpoint_dir(model)
+        if config.checkpoint_dir:
+            ckpt_dir = config.checkpoint_dir
+        else:
+            ckpt_dir = model_checkpoint_dir(model)

         checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
         assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
@@ -119,9 +121,7 @@ class Llama:
             **params,
         )

-        tokenizer_path = os.path.join(ckpt_dir, "tokenizer.model")
-        tokenizer = Tokenizer(model_path=tokenizer_path)
+        tokenizer = Tokenizer.get_instance()
         assert (
             model_args.vocab_size == tokenizer.n_words
         ), f"model_args vocab = {model_args.vocab_size} but tokenizer vocab = {tokenizer.n_words}"
@@ -138,7 +138,7 @@ class Llama:
             else:
                 model = Transformer(model_args)
                 model.load_state_dict(state_dict, strict=False)
-                model = convert_to_quantized_model(model, config)
+                model = convert_to_quantized_model(model, config, ckpt_dir)
         else:
             if torch.cuda.is_bf16_supported():
                 torch.set_default_tensor_type(torch.cuda.BFloat16Tensor)
@@ -170,14 +170,16 @@ class Llama:
         logprobs: bool = False,
         echo: bool = False,
         include_stop_token: bool = False,
+        print_input_tokens: bool = False,
     ) -> Generator:
         params = self.model.params

-        # input_tokens = [
-        #     self.formatter.vision_token if t == 128256 else t
-        #     for t in model_input.tokens
-        # ]
-        # cprint("Input to model -> " + self.tokenizer.decode(input_tokens), "red")
+        if print_input_tokens:
+            input_tokens = [
+                self.formatter.vision_token if t == 128256 else t
+                for t in model_input.tokens
+            ]
+            cprint("Input to model -> " + self.tokenizer.decode(input_tokens), "red")
         prompt_tokens = [model_input.tokens]

         bsz = 1
@@ -228,8 +230,7 @@ class Llama:
             ignore_index=pad_id,
         )

-        stop_tokens = torch.tensor(self.tokenizer.stop_tokens)
+        stop_tokens = torch.tensor(self.tokenizer.stop_tokens, device="cuda")
         for cur_pos in range(min_prompt_len, total_len):
             if is_vision:
                 position_ids = torch.arange(
@@ -295,15 +296,12 @@ class Llama:
             if all(eos_reached):
                 break

-    def text_completion(
+    def completion(
         self,
-        content: InterleavedTextMedia,
-        temperature: float = 0.6,
-        top_p: float = 0.9,
-        max_gen_len: Optional[int] = None,
-        logprobs: bool = False,
-        echo: bool = False,
+        request: CompletionRequest,
     ) -> Generator:
+        sampling_params = request.sampling_params
+        max_gen_len = sampling_params.max_tokens
         if (
             max_gen_len is None
             or max_gen_len == 0
@@ -311,26 +309,25 @@ class Llama:
         ):
             max_gen_len = self.model.params.max_seq_len - 1

-        model_input = self.formatter.encode_content(content)
+        model_input = self.formatter.encode_content(request.content)
         yield from self.generate(
             model_input=model_input,
             max_gen_len=max_gen_len,
-            temperature=temperature,
-            top_p=top_p,
-            logprobs=logprobs,
-            echo=echo,
+            temperature=sampling_params.temperature,
+            top_p=sampling_params.top_p,
+            logprobs=bool(request.logprobs),
+            include_stop_token=True,
+            echo=False,
         )

     def chat_completion(
         self,
-        messages: List[Message],
-        temperature: float = 0.6,
-        top_p: float = 0.9,
-        max_gen_len: Optional[int] = None,
-        logprobs: bool = False,
-        tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json,
+        request: ChatCompletionRequest,
     ) -> Generator:
+        messages = chat_completion_request_to_messages(request)
+        sampling_params = request.sampling_params
+        max_gen_len = sampling_params.max_tokens
         if (
             max_gen_len is None
             or max_gen_len == 0
@@ -341,12 +338,12 @@ class Llama:
         yield from self.generate(
             model_input=self.formatter.encode_dialog_prompt(
                 messages,
-                tool_prompt_format,
+                request.tool_prompt_format,
             ),
             max_gen_len=max_gen_len,
-            temperature=temperature,
-            top_p=top_p,
-            logprobs=logprobs,
+            temperature=sampling_params.temperature,
+            top_p=sampling_params.top_p,
+            logprobs=bool(request.logprobs),
             include_stop_token=True,
         )

View file

@ -13,11 +13,9 @@ from llama_models.sku_list import resolve_model
from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_stack.apis.inference import * # noqa: F403 from llama_stack.apis.inference import * # noqa: F403
from llama_stack.providers.datatypes import ModelDef, ModelsProtocolPrivate from llama_stack.providers.datatypes import ModelDef, ModelsProtocolPrivate
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_messages,
)
from .config import MetaReferenceInferenceConfig from .config import MetaReferenceInferenceConfig
from .generation import Llama
from .model_parallel import LlamaModelParallelGenerator from .model_parallel import LlamaModelParallelGenerator
# there's a single model parallel process running serving the model. for now, # there's a single model parallel process running serving the model. for now,
@ -36,8 +34,11 @@ class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate):
async def initialize(self) -> None: async def initialize(self) -> None:
print(f"Loading model `{self.model.descriptor()}`") print(f"Loading model `{self.model.descriptor()}`")
self.generator = LlamaModelParallelGenerator(self.config) if self.config.create_distributed_process_group:
self.generator.start() self.generator = LlamaModelParallelGenerator(self.config)
self.generator.start()
else:
self.generator = Llama.build(self.config)
async def register_model(self, model: ModelDef) -> None: async def register_model(self, model: ModelDef) -> None:
raise ValueError("Dynamic model registration is not supported") raise ValueError("Dynamic model registration is not supported")
@ -51,9 +52,21 @@ class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate):
] ]
async def shutdown(self) -> None: async def shutdown(self) -> None:
self.generator.stop() if self.config.create_distributed_process_group:
self.generator.stop()
def completion( def check_model(self, request) -> None:
model = resolve_model(request.model)
if model is None:
raise RuntimeError(
f"Unknown model: {request.model}, Run `llama model list`"
)
elif model.descriptor() != self.model.descriptor():
raise RuntimeError(
f"Model mismatch: {request.model} != {self.model.descriptor()}"
)
async def completion(
self, self,
model: str, model: str,
content: InterleavedTextMedia, content: InterleavedTextMedia,
@ -61,9 +74,114 @@ class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate):
stream: Optional[bool] = False, stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None, logprobs: Optional[LogProbConfig] = None,
) -> Union[CompletionResponse, CompletionResponseStreamChunk]: ) -> Union[CompletionResponse, CompletionResponseStreamChunk]:
raise NotImplementedError() if logprobs:
assert logprobs.top_k == 1, f"Unexpected top_k={logprobs.top_k}"
def chat_completion( request = CompletionRequest(
model=model,
content=content,
sampling_params=sampling_params,
stream=stream,
logprobs=logprobs,
)
self.check_model(request)
if request.stream:
return self._stream_completion(request)
else:
return await self._nonstream_completion(request)
async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator:
def impl():
stop_reason = None
for token_result in self.generator.completion(request):
if token_result.text == "<|eot_id|>":
stop_reason = StopReason.end_of_turn
text = ""
elif token_result.text == "<|eom_id|>":
stop_reason = StopReason.end_of_message
text = ""
else:
text = token_result.text
logprobs = None
if stop_reason is None:
if request.logprobs:
assert len(token_result.logprobs) == 1
logprobs = [
TokenLogProbs(
logprobs_by_token={
token_result.text: token_result.logprobs[0]
}
)
]
yield CompletionResponseStreamChunk(
delta=text,
stop_reason=stop_reason,
logprobs=logprobs if request.logprobs else None,
)
if stop_reason is None:
yield CompletionResponseStreamChunk(
delta="",
stop_reason=StopReason.out_of_tokens,
)
if self.config.create_distributed_process_group:
async with SEMAPHORE:
for x in impl():
yield x
else:
for x in impl():
yield x
async def _nonstream_completion(
self, request: CompletionRequest
) -> CompletionResponse:
def impl():
tokens = []
logprobs = []
stop_reason = None
tokenizer = self.generator.formatter.tokenizer
for token_result in self.generator.completion(request):
tokens.append(token_result.token)
if token_result.token in tokenizer.stop_tokens:
# not quite right semantically
stop_reason = StopReason.end_of_turn
if request.logprobs:
assert len(token_result.logprobs) == 1
logprobs.append(
TokenLogProbs(
logprobs_by_token={
token_result.text: token_result.logprobs[0]
}
)
)
if stop_reason is None:
stop_reason = StopReason.out_of_tokens
content = self.generator.formatter.tokenizer.decode(tokens)
return CompletionResponse(
content=content,
stop_reason=stop_reason,
logprobs=logprobs if request.logprobs else None,
)
if self.config.create_distributed_process_group:
async with SEMAPHORE:
return impl()
else:
return impl()
async def chat_completion(
self, self,
model: str, model: str,
messages: List[Message], messages: List[Message],
@ -88,43 +206,26 @@ class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate):
stream=stream, stream=stream,
logprobs=logprobs, logprobs=logprobs,
) )
self.check_model(request)
model = resolve_model(request.model) if self.config.create_distributed_process_group:
if model is None: if SEMAPHORE.locked():
raise RuntimeError( raise RuntimeError("Only one concurrent request is supported")
f"Unknown model: {request.model}, Run `llama model list`"
)
elif model.descriptor() != self.model.descriptor():
raise RuntimeError(
f"Model mismatch: {request.model} != {self.model.descriptor()}"
)
if SEMAPHORE.locked():
raise RuntimeError("Only one concurrent request is supported")
if request.stream: if request.stream:
return self._stream_chat_completion(request) return self._stream_chat_completion(request)
else: else:
return self._nonstream_chat_completion(request) return await self._nonstream_chat_completion(request)
async def _nonstream_chat_completion( async def _nonstream_chat_completion(
self, request: ChatCompletionRequest self, request: ChatCompletionRequest
) -> ChatCompletionResponse: ) -> ChatCompletionResponse:
async with SEMAPHORE: def impl():
messages = chat_completion_request_to_messages(request)
tokens = [] tokens = []
logprobs = [] logprobs = []
stop_reason = None stop_reason = None
for token_result in self.generator.chat_completion( for token_result in self.generator.chat_completion(request):
messages=messages,
temperature=request.sampling_params.temperature,
top_p=request.sampling_params.top_p,
max_gen_len=request.sampling_params.max_tokens,
logprobs=request.logprobs,
tool_prompt_format=request.tool_prompt_format,
):
tokens.append(token_result.token) tokens.append(token_result.token)
if token_result.text == "<|eot_id|>": if token_result.text == "<|eot_id|>":
@ -154,12 +255,16 @@ class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate):
logprobs=logprobs if request.logprobs else None, logprobs=logprobs if request.logprobs else None,
) )
if self.config.create_distributed_process_group:
async with SEMAPHORE:
return impl()
else:
return impl()
async def _stream_chat_completion( async def _stream_chat_completion(
self, request: ChatCompletionRequest self, request: ChatCompletionRequest
) -> AsyncGenerator: ) -> AsyncGenerator:
async with SEMAPHORE: def impl():
messages = chat_completion_request_to_messages(request)
yield ChatCompletionResponseStreamChunk( yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent( event=ChatCompletionResponseEvent(
event_type=ChatCompletionResponseEventType.start, event_type=ChatCompletionResponseEventType.start,
@ -172,14 +277,7 @@ class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate):
stop_reason = None stop_reason = None
ipython = False ipython = False
for token_result in self.generator.chat_completion( for token_result in self.generator.chat_completion(request):
messages=messages,
temperature=request.sampling_params.temperature,
top_p=request.sampling_params.top_p,
max_gen_len=request.sampling_params.max_tokens,
logprobs=request.logprobs,
tool_prompt_format=request.tool_prompt_format,
):
tokens.append(token_result.token) tokens.append(token_result.token)
if not ipython and token_result.text.startswith("<|python_tag|>"): if not ipython and token_result.text.startswith("<|python_tag|>"):
@ -272,6 +370,14 @@ class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate):
) )
) )
if self.config.create_distributed_process_group:
async with SEMAPHORE:
for x in impl():
yield x
else:
for x in impl():
yield x
async def embeddings( async def embeddings(
self, self,
model: str, model: str,

View file

@ -7,16 +7,17 @@
import os import os
from copy import deepcopy from copy import deepcopy
from functools import partial from functools import partial
from typing import Generator, List, Optional from typing import Any, Generator
from llama_models.llama3.api.chat_format import ChatFormat from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.datatypes import Message, ToolPromptFormat
from llama_models.llama3.api.tokenizer import Tokenizer from llama_models.llama3.api.tokenizer import Tokenizer
from llama_models.sku_list import resolve_model from llama_models.sku_list import resolve_model
from llama_stack.apis.inference import ChatCompletionRequest, CompletionRequest
from .config import MetaReferenceInferenceConfig from .config import MetaReferenceInferenceConfig
from .generation import Llama, model_checkpoint_dir from .generation import Llama, model_checkpoint_dir
from .parallel_utils import InferenceArgs, ModelParallelProcessGroup from .parallel_utils import ModelParallelProcessGroup
class ModelRunner: class ModelRunner:
@ -24,15 +25,13 @@ class ModelRunner:
self.llama = llama self.llama = llama
# the `task` object is the same that is sent to `ModelParallelProcessGroup.run_inference()` # the `task` object is the same that is sent to `ModelParallelProcessGroup.run_inference()`
def __call__(self, task: InferenceArgs): def __call__(self, req: Any):
return self.llama.chat_completion( if isinstance(req, ChatCompletionRequest):
task.messages, return self.llama.chat_completion(req)
task.temperature, elif isinstance(req, CompletionRequest):
task.top_p, return self.llama.completion(req)
task.max_gen_len, else:
task.logprobs, raise ValueError(f"Unexpected task type {type(req)}")
task.tool_prompt_format,
)
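
ModelRunner.__call__ now dispatches on the request class instead of unpacking an InferenceArgs bundle. A minimal sketch of the same dispatch, using dataclass stand-ins so it runs on its own (the real request types live in llama_stack.apis.inference):

from dataclasses import dataclass, field
from typing import Any, List

# Stand-ins for the llama_stack request types (fields are hypothetical here).
@dataclass
class CompletionRequest:
    content: str

@dataclass
class ChatCompletionRequest:
    messages: List[str] = field(default_factory=list)

class EchoLlama:
    """Stand-in for the real Llama generator."""
    def completion(self, req: CompletionRequest) -> str:
        return f"completion<{req.content}>"

    def chat_completion(self, req: ChatCompletionRequest) -> str:
        return f"chat<{len(req.messages)} messages>"

class Runner:
    def __init__(self, llama: EchoLlama) -> None:
        self.llama = llama

    def __call__(self, req: Any) -> str:
        # same dispatch shape as ModelRunner.__call__ above
        if isinstance(req, ChatCompletionRequest):
            return self.llama.chat_completion(req)
        if isinstance(req, CompletionRequest):
            return self.llama.completion(req)
        raise ValueError(f"Unexpected task type {type(req)}")

print(Runner(EchoLlama())(CompletionRequest(content="Roses are red,")))
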
def init_model_cb(config: MetaReferenceInferenceConfig): def init_model_cb(config: MetaReferenceInferenceConfig):
@ -77,23 +76,18 @@ class LlamaModelParallelGenerator:
def __exit__(self, exc_type, exc_value, exc_traceback): def __exit__(self, exc_type, exc_value, exc_traceback):
self.group.stop() self.group.stop()
def chat_completion( def completion(
self, self,
messages: List[Message], request: CompletionRequest,
temperature: float = 0.6,
top_p: float = 0.9,
max_gen_len: Optional[int] = None,
logprobs: bool = False,
tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json,
) -> Generator: ) -> Generator:
req_obj = InferenceArgs( req_obj = deepcopy(request)
messages=deepcopy(messages), gen = self.group.run_inference(req_obj)
temperature=temperature, yield from gen
top_p=top_p,
max_gen_len=max_gen_len, def chat_completion(
logprobs=logprobs or False, self,
tool_prompt_format=tool_prompt_format, request: ChatCompletionRequest,
) ) -> Generator:
req_obj = deepcopy(request)
gen = self.group.run_inference(req_obj) gen = self.group.run_inference(req_obj)
yield from gen yield from gen

View file

@ -4,6 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
# Copyright (c) Meta Platforms, Inc. and affiliates. # Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json import json
import multiprocessing import multiprocessing
import os import os
@ -11,10 +17,9 @@ import tempfile
import time import time
import uuid import uuid
from enum import Enum from enum import Enum
from typing import Callable, Generator, List, Literal, Optional, Union from typing import Callable, Generator, Literal, Optional, Union
import torch import torch
import zmq import zmq
from fairscale.nn.model_parallel.initialize import ( from fairscale.nn.model_parallel.initialize import (
@ -23,25 +28,16 @@ from fairscale.nn.model_parallel.initialize import (
get_model_parallel_src_rank, get_model_parallel_src_rank,
) )
from llama_models.llama3.api.datatypes import Message, ToolPromptFormat
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from torch.distributed.launcher.api import elastic_launch, LaunchConfig from torch.distributed.launcher.api import elastic_launch, LaunchConfig
from typing_extensions import Annotated from typing_extensions import Annotated
from llama_stack.apis.inference import ChatCompletionRequest, CompletionRequest
from .generation import TokenResult from .generation import TokenResult
class InferenceArgs(BaseModel):
messages: List[Message]
temperature: float
top_p: float
max_gen_len: int
logprobs: bool
tool_prompt_format: ToolPromptFormat
class ProcessingMessageName(str, Enum): class ProcessingMessageName(str, Enum):
ready_request = "ready_request" ready_request = "ready_request"
ready_response = "ready_response" ready_response = "ready_response"
@ -80,7 +76,7 @@ class TaskRequest(BaseModel):
type: Literal[ProcessingMessageName.task_request] = ( type: Literal[ProcessingMessageName.task_request] = (
ProcessingMessageName.task_request ProcessingMessageName.task_request
) )
task: InferenceArgs task: Union[CompletionRequest, ChatCompletionRequest]
class TaskResponse(BaseModel): class TaskResponse(BaseModel):
@ -349,11 +345,13 @@ class ModelParallelProcessGroup:
self.process.join() self.process.join()
self.started = False self.started = False
def run_inference(self, inference_args: InferenceArgs) -> Generator: def run_inference(
self, req: Union[CompletionRequest, ChatCompletionRequest]
) -> Generator:
assert not self.running, "inference already running" assert not self.running, "inference already running"
self.running = True self.running = True
self.request_socket.send(encode_msg(TaskRequest(task=inference_args))) self.request_socket.send(encode_msg(TaskRequest(task=req)))
try: try:
while True: while True:
obj_json = self.request_socket.recv() obj_json = self.request_socket.recv()
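
With InferenceArgs gone, the object sent across the process boundary is the request itself, so TaskRequest.task becomes a Union of the two request types. A hedged sketch of the round-trip this relies on, with hypothetical request classes and pydantic v2 (which the project already requires):

from typing import List, Literal, Union
from pydantic import BaseModel

# Hypothetical stand-ins: the real classes come from llama_stack.apis.inference.
class CompletionRequest(BaseModel):
    kind: Literal["completion"] = "completion"
    content: str

class ChatCompletionRequest(BaseModel):
    kind: Literal["chat"] = "chat"
    messages: List[str]

class TaskRequest(BaseModel):
    task: Union[CompletionRequest, ChatCompletionRequest]

wire = TaskRequest(task=CompletionRequest(content="Roses are red,")).model_dump_json()
restored = TaskRequest.model_validate_json(wire)
assert isinstance(restored.task, CompletionRequest)
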

View file

@ -13,9 +13,10 @@ from typing import Optional
import torch import torch
from fairscale.nn.model_parallel.mappings import reduce_from_model_parallel_region from fairscale.nn.model_parallel.mappings import reduce_from_model_parallel_region
from llama_models.datatypes import CheckpointQuantizationFormat from llama_models.datatypes import CheckpointQuantizationFormat
from llama_models.llama3.reference_impl.model import Transformer, TransformerBlock from llama_models.llama3.reference_impl.model import Transformer, TransformerBlock
from llama_models.sku_list import resolve_model
from termcolor import cprint from termcolor import cprint
from torch import Tensor from torch import Tensor
@ -39,6 +40,7 @@ def swiglu_wrapper(
def convert_to_quantized_model( def convert_to_quantized_model(
model: Transformer, model: Transformer,
config: MetaReferenceQuantizedInferenceConfig, config: MetaReferenceQuantizedInferenceConfig,
checkpoint_dir: str,
fp8_activation_scale_ub: Optional[float] = 1200.0, fp8_activation_scale_ub: Optional[float] = 1200.0,
) -> Transformer: ) -> Transformer:
if config.quantization.type == QuantizationType.bf16.value: if config.quantization.type == QuantizationType.bf16.value:
@ -49,12 +51,14 @@ def convert_to_quantized_model(
from .fp8_impls import Fp8ScaledWeights, load_fp8, quantize_fp8 from .fp8_impls import Fp8ScaledWeights, load_fp8, quantize_fp8
checkpoint = config.checkpoint_config.checkpoint llama_model = resolve_model(config.model)
assert llama_model is not None, f"Model {config.model} not found"
# Move weights to GPU with quantization # Move weights to GPU with quantization
if checkpoint.quantization_format == CheckpointQuantizationFormat.fp8_mixed.value: if llama_model.quantization_format == CheckpointQuantizationFormat.fp8_mixed.value:
cprint("Loading fp8 scales...", "yellow") cprint("Loading fp8 scales...", "yellow")
fp8_scales_path = os.path.join( fp8_scales_path = os.path.join(
checkpoint.checkpoint_dir, f"fp8_scales_{get_model_parallel_rank()}.pt" checkpoint_dir, f"fp8_scales_{get_model_parallel_rank()}.pt"
) )
assert os.path.isfile( assert os.path.isfile(
fp8_scales_path fp8_scales_path
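
The quantized loader now takes checkpoint_dir as an explicit argument and resolves the model via resolve_model(config.model) rather than reading a checkpoint config object. A small sketch of the per-rank scales lookup it performs (directory layout assumed from the code above):

import os

def fp8_scales_path(checkpoint_dir: str, model_parallel_rank: int) -> str:
    # one quantization-scales file per model-parallel rank: fp8_scales_0.pt, fp8_scales_1.pt, ...
    path = os.path.join(checkpoint_dir, f"fp8_scales_{model_parallel_rank}.pt")
    if not os.path.isfile(path):
        raise FileNotFoundError(f"missing fp8 scales for rank {model_parallel_rank}: {path}")
    return path
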

View file

@ -170,7 +170,7 @@ class LlamaGuardShield(ShieldBase):
for i in range(1, len(messages)): for i in range(1, len(messages)):
if messages[i].role == messages[i - 1].role: if messages[i].role == messages[i - 1].role:
raise ValueError( raise ValueError(
f"Messages must alternate between user and assistant. Message {i} has the same role as message {i-1}" f"Messages must alternate between user and assistant. Message {i} has the same role as message {i - 1}"
) )
return messages return messages
@ -184,7 +184,7 @@ class LlamaGuardShield(ShieldBase):
# TODO: llama-stack inference protocol has issues with non-streaming inference code # TODO: llama-stack inference protocol has issues with non-streaming inference code
content = "" content = ""
async for chunk in self.inference_api.chat_completion( async for chunk in await self.inference_api.chat_completion(
model=self.model, model=self.model,
messages=[shield_input_message], messages=[shield_input_message],
stream=True, stream=True,
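
The async for ... in await ... shape here recurs throughout this diff: chat_completion is now a coroutine that, for streaming requests, resolves to an async generator, so callers must await the call before iterating. A standalone sketch of why both keywords are needed (all names hypothetical):

import asyncio

async def chat_completion(stream: bool):
    # coroutine that *returns* an async generator when streaming is requested
    async def _gen():
        for piece in ["Hel", "lo"]:
            yield piece
    return _gen()

async def main():
    chunks = [c async for c in await chat_completion(stream=True)]
    print("".join(chunks))  # -> "Hello"

asyncio.run(main())
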

View file

@ -1,3 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any from typing import Any
from .config import VLLMConfig from .config import VLLMConfig

View file

@ -134,7 +134,7 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
if self.engine: if self.engine:
self.engine.shutdown_background_loop() self.engine.shutdown_background_loop()
def completion( async def completion(
self, self,
model: str, model: str,
content: InterleavedTextMedia, content: InterleavedTextMedia,
@ -152,7 +152,7 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
logprobs=logprobs, logprobs=logprobs,
) )
def chat_completion( async def chat_completion(
self, self,
model: str, model: str,
messages: list[Message], messages: list[Message],
@ -189,7 +189,7 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
if stream: if stream:
return self._stream_chat_completion(request, results_generator) return self._stream_chat_completion(request, results_generator)
else: else:
return self._nonstream_chat_completion(request, results_generator) return await self._nonstream_chat_completion(request, results_generator)
async def _nonstream_chat_completion( async def _nonstream_chat_completion(
self, request: ChatCompletionRequest, results_generator: AsyncGenerator self, request: ChatCompletionRequest, results_generator: AsyncGenerator
@ -207,7 +207,7 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
response = OpenAICompatCompletionResponse( response = OpenAICompatCompletionResponse(
choices=[choice], choices=[choice],
) )
return process_chat_completion_response(request, response, self.formatter) return process_chat_completion_response(response, self.formatter)
async def _stream_chat_completion( async def _stream_chat_completion(
self, request: ChatCompletionRequest, results_generator: AsyncGenerator self, request: ChatCompletionRequest, results_generator: AsyncGenerator
@ -229,7 +229,7 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
stream = _generate_and_convert_to_openai_compat() stream = _generate_and_convert_to_openai_compat()
async for chunk in process_chat_completion_stream_response( async for chunk in process_chat_completion_stream_response(
request, stream, self.formatter stream, self.formatter
): ):
yield chunk yield chunk

View file

@ -55,11 +55,20 @@ def available_providers() -> List[ProviderSpec]:
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(
adapter_type="ollama", adapter_type="ollama",
pip_packages=["ollama"], pip_packages=["ollama", "aiohttp"],
config_class="llama_stack.providers.adapters.inference.ollama.OllamaImplConfig", config_class="llama_stack.providers.adapters.inference.ollama.OllamaImplConfig",
module="llama_stack.providers.adapters.inference.ollama", module="llama_stack.providers.adapters.inference.ollama",
), ),
), ),
# remote_provider_spec(
# api=Api.inference,
# adapter=AdapterSpec(
# adapter_type="vllm",
# pip_packages=["openai"],
# module="llama_stack.providers.adapters.inference.vllm",
# config_class="llama_stack.providers.adapters.inference.vllm.VLLMImplConfig",
# ),
# ),
remote_provider_spec( remote_provider_spec(
api=Api.inference, api=Api.inference,
adapter=AdapterSpec( adapter=AdapterSpec(

View file

@ -31,4 +31,4 @@ providers:
persistence_store: persistence_store:
namespace: null namespace: null
type: sqlite type: sqlite
db_path: /Users/ashwin/.llama/runtime/kvstore.db db_path: ~/.llama/runtime/kvstore.db

View file

@ -64,6 +64,24 @@ def search_query_messages():
] ]
@pytest.fixture
def attachment_message():
return [
UserMessage(
content="I am attaching some documentation for Torchtune. Help me answer questions I will ask next.",
),
]
@pytest.fixture
def query_attachment_messages():
return [
UserMessage(
content="What are the top 5 topics that were explained? Only list succinct bullet points."
),
]
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_create_agent_turn(agents_settings, sample_messages): async def test_create_agent_turn(agents_settings, sample_messages):
agents_impl = agents_settings["impl"] agents_impl = agents_settings["impl"]
@ -98,7 +116,7 @@ async def test_create_agent_turn(agents_settings, sample_messages):
) )
turn_response = [ turn_response = [
chunk async for chunk in agents_impl.create_agent_turn(**turn_request) chunk async for chunk in await agents_impl.create_agent_turn(**turn_request)
] ]
assert len(turn_response) > 0 assert len(turn_response) > 0
@ -123,6 +141,89 @@ async def test_create_agent_turn(agents_settings, sample_messages):
assert len(final_event.turn.output_message.content) > 0 assert len(final_event.turn.output_message.content) > 0
@pytest.mark.asyncio
async def test_rag_agent_as_attachments(
agents_settings, attachment_message, query_attachment_messages
):
urls = [
"memory_optimizations.rst",
"chat.rst",
"llama3.rst",
"datasets.rst",
"qat_finetune.rst",
"lora_finetune.rst",
]
attachments = [
Attachment(
content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
mime_type="text/plain",
)
for i, url in enumerate(urls)
]
agents_impl = agents_settings["impl"]
agent_config = AgentConfig(
model=agents_settings["common_params"]["model"],
instructions=agents_settings["common_params"]["instructions"],
enable_session_persistence=True,
sampling_params=SamplingParams(temperature=0.7, top_p=0.95),
input_shields=[],
output_shields=[],
tools=[
MemoryToolDefinition(
memory_bank_configs=[],
query_generator_config={
"type": "default",
"sep": " ",
},
max_tokens_in_context=4096,
max_chunks=10,
),
],
max_infer_iters=5,
)
create_response = await agents_impl.create_agent(agent_config)
agent_id = create_response.agent_id
# Create a session
session_create_response = await agents_impl.create_agent_session(
agent_id, "Test Session"
)
session_id = session_create_response.session_id
# Create and execute a turn
turn_request = dict(
agent_id=agent_id,
session_id=session_id,
messages=attachment_message,
attachments=attachments,
stream=True,
)
turn_response = [
chunk async for chunk in await agents_impl.create_agent_turn(**turn_request)
]
assert len(turn_response) > 0
# Create a second turn querying the agent
turn_request = dict(
agent_id=agent_id,
session_id=session_id,
messages=query_attachment_messages,
stream=True,
)
turn_response = [
chunk async for chunk in await agents_impl.create_agent_turn(**turn_request)
]
assert len(turn_response) > 0
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_create_agent_turn_with_brave_search( async def test_create_agent_turn_with_brave_search(
agents_settings, search_query_messages agents_settings, search_query_messages
@ -169,7 +270,7 @@ async def test_create_agent_turn_with_brave_search(
) )
turn_response = [ turn_response = [
chunk async for chunk in agents_impl.create_agent_turn(**turn_request) chunk async for chunk in await agents_impl.create_agent_turn(**turn_request)
] ]
assert len(turn_response) > 0 assert len(turn_response) > 0

View file

@ -4,6 +4,10 @@ providers:
config: config:
host: localhost host: localhost
port: 11434 port: 11434
- provider_id: meta-reference
provider_type: meta-reference
config:
model: Llama3.2-1B-Instruct
- provider_id: test-tgi - provider_id: test-tgi
provider_type: remote::tgi provider_type: remote::tgi
config: config:

View file

@ -5,6 +5,7 @@
# the root directory of this source tree. # the root directory of this source tree.
import itertools import itertools
import os
import pytest import pytest
import pytest_asyncio import pytest_asyncio
@ -50,14 +51,17 @@ def get_expected_stop_reason(model: str):
return StopReason.end_of_message if "Llama3.1" in model else StopReason.end_of_turn return StopReason.end_of_message if "Llama3.1" in model else StopReason.end_of_turn
if "MODEL_IDS" not in os.environ:
MODEL_IDS = [Llama_8B, Llama_3B]
else:
MODEL_IDS = os.environ["MODEL_IDS"].split(",")
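
Model parametrization for these tests can now be overridden from the environment. A short usage sketch (the model IDs are illustrative):

import os

# e.g. export MODEL_IDS="Llama3.1-8B-Instruct,Llama3.2-3B-Instruct" before running pytest
os.environ["MODEL_IDS"] = "Llama3.1-8B-Instruct,Llama3.2-3B-Instruct"
assert os.environ["MODEL_IDS"].split(",") == [
    "Llama3.1-8B-Instruct",
    "Llama3.2-3B-Instruct",
]
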
# This is going to create multiple Stack impls without tearing down the previous one # This is going to create multiple Stack impls without tearing down the previous one
# Fix that! # Fix that!
@pytest_asyncio.fixture( @pytest_asyncio.fixture(
scope="session", scope="session",
params=[ params=[{"model": m} for m in MODEL_IDS],
{"model": Llama_8B},
{"model": Llama_3B},
],
ids=lambda d: d["model"], ids=lambda d: d["model"],
) )
async def inference_settings(request): async def inference_settings(request):
@ -122,6 +126,48 @@ async def test_model_list(inference_settings):
assert model_def.identifier == params["model"] assert model_def.identifier == params["model"]
@pytest.mark.asyncio
async def test_completion(inference_settings):
inference_impl = inference_settings["impl"]
params = inference_settings["common_params"]
provider = inference_impl.routing_table.get_provider_impl(params["model"])
if provider.__provider_spec__.provider_type not in (
"meta-reference",
"remote::ollama",
):
pytest.skip("Other inference providers don't support completion() yet")
response = await inference_impl.completion(
content="Roses are red,",
stream=False,
model=params["model"],
sampling_params=SamplingParams(
max_tokens=50,
),
)
assert isinstance(response, CompletionResponse)
assert "violets are blue" in response.content
chunks = [
r
async for r in await inference_impl.completion(
content="Roses are red,",
stream=True,
model=params["model"],
sampling_params=SamplingParams(
max_tokens=50,
),
)
]
assert all(isinstance(chunk, CompletionResponseStreamChunk) for chunk in chunks)
assert len(chunks) == 51
last = chunks[-1]
assert last.stop_reason == StopReason.out_of_tokens
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_chat_completion_non_streaming(inference_settings, sample_messages): async def test_chat_completion_non_streaming(inference_settings, sample_messages):
inference_impl = inference_settings["impl"] inference_impl = inference_settings["impl"]
@ -142,7 +188,7 @@ async def test_chat_completion_streaming(inference_settings, sample_messages):
inference_impl = inference_settings["impl"] inference_impl = inference_settings["impl"]
response = [ response = [
r r
async for r in inference_impl.chat_completion( async for r in await inference_impl.chat_completion(
messages=sample_messages, messages=sample_messages,
stream=True, stream=True,
**inference_settings["common_params"], **inference_settings["common_params"],
@ -213,7 +259,7 @@ async def test_chat_completion_with_tool_calling_streaming(
response = [ response = [
r r
async for r in inference_impl.chat_completion( async for r in await inference_impl.chat_completion(
messages=messages, messages=messages,
tools=[sample_tool_definition], tools=[sample_tool_definition],
stream=True, stream=True,

View file

@ -2,8 +2,8 @@ providers:
- provider_id: test-faiss - provider_id: test-faiss
provider_type: meta-reference provider_type: meta-reference
config: {} config: {}
- provider_id: test-chroma - provider_id: test-chromadb
provider_type: remote::chroma provider_type: remote::chromadb
config: config:
host: localhost host: localhost
port: 6001 port: 6001

View file

@ -89,6 +89,30 @@ async def test_banks_list(memory_settings):
assert len(response) == 0 assert len(response) == 0
@pytest.mark.asyncio
async def test_banks_register(memory_settings):
# NOTE: this test assumes you start from a clean state,
# but there is no unregister API yet, so be careful
banks_impl = memory_settings["memory_banks_impl"]
bank = VectorMemoryBankDef(
identifier="test_bank_no_provider",
embedding_model="all-MiniLM-L6-v2",
chunk_size_in_tokens=512,
overlap_size_in_tokens=64,
)
await banks_impl.register_memory_bank(bank)
response = await banks_impl.list_memory_banks()
assert isinstance(response, list)
assert len(response) == 1
# registering the same memory bank with the same id again will fail
await banks_impl.register_memory_bank(bank)
response = await banks_impl.list_memory_banks()
assert isinstance(response, list)
assert len(response) == 1
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_query_documents(memory_settings, sample_documents): async def test_query_documents(memory_settings, sample_documents):
memory_impl = memory_settings["memory_impl"] memory_impl = memory_settings["memory_impl"]

View file

@ -14,7 +14,7 @@ import yaml
from llama_stack.distribution.datatypes import * # noqa: F403 from llama_stack.distribution.datatypes import * # noqa: F403
from llama_stack.distribution.configure import parse_and_maybe_upgrade_config from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
from llama_stack.distribution.request_headers import set_request_provider_data from llama_stack.distribution.request_headers import set_request_provider_data
from llama_stack.distribution.resolver import resolve_impls_with_routing from llama_stack.distribution.resolver import resolve_impls
async def resolve_impls_for_test(api: Api, deps: List[Api] = None): async def resolve_impls_for_test(api: Api, deps: List[Api] = None):
@ -36,7 +36,7 @@ async def resolve_impls_for_test(api: Api, deps: List[Api] = None):
providers=chosen, providers=chosen,
) )
run_config = parse_and_maybe_upgrade_config(run_config) run_config = parse_and_maybe_upgrade_config(run_config)
impls = await resolve_impls_with_routing(run_config) impls = await resolve_impls(run_config)
if "provider_data" in config_dict: if "provider_data" in config_dict:
provider_id = chosen[api.value][0].provider_id provider_id = chosen[api.value][0].provider_id

View file

@ -34,6 +34,8 @@ def get_sampling_options(request: ChatCompletionRequest) -> dict:
if params := request.sampling_params: if params := request.sampling_params:
for attr in {"temperature", "top_p", "top_k", "max_tokens"}: for attr in {"temperature", "top_p", "top_k", "max_tokens"}:
if getattr(params, attr): if getattr(params, attr):
if attr == "max_tokens":
options["num_predict"] = getattr(params, attr)
options[attr] = getattr(params, attr) options[attr] = getattr(params, attr)
if params.repetition_penalty is not None and params.repetition_penalty != 1.0: if params.repetition_penalty is not None and params.repetition_penalty != 1.0:
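
The new branch mirrors max_tokens into num_predict, the name Ollama uses for the generation budget, while keeping the generic key for other providers. A standalone sketch of the resulting mapping (the params object is a stand-in for SamplingParams):

from types import SimpleNamespace

def sampling_options(params) -> dict:
    options = {}
    for attr in ("temperature", "top_p", "top_k", "max_tokens"):
        value = getattr(params, attr, None)
        if value:
            if attr == "max_tokens":
                options["num_predict"] = value  # Ollama's name for the token budget
            options[attr] = value
    return options

print(sampling_options(SimpleNamespace(temperature=0.7, top_p=0.95, top_k=None, max_tokens=50)))
# -> {'temperature': 0.7, 'top_p': 0.95, 'num_predict': 50, 'max_tokens': 50}
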
@ -49,27 +51,35 @@ def text_from_choice(choice) -> str:
return choice.text return choice.text
def get_stop_reason(finish_reason: str) -> StopReason:
if finish_reason in ["stop", "eos"]:
return StopReason.end_of_turn
elif finish_reason == "eom":
return StopReason.end_of_message
elif finish_reason == "length":
return StopReason.out_of_tokens
return StopReason.out_of_tokens
def process_completion_response(
response: OpenAICompatCompletionResponse, formatter: ChatFormat
) -> CompletionResponse:
choice = response.choices[0]
return CompletionResponse(
stop_reason=get_stop_reason(choice.finish_reason),
content=choice.text,
)
def process_chat_completion_response( def process_chat_completion_response(
request: ChatCompletionRequest, response: OpenAICompatCompletionResponse, formatter: ChatFormat
response: OpenAICompatCompletionResponse,
formatter: ChatFormat,
) -> ChatCompletionResponse: ) -> ChatCompletionResponse:
choice = response.choices[0] choice = response.choices[0]
stop_reason = None
if reason := choice.finish_reason:
if reason in ["stop", "eos"]:
stop_reason = StopReason.end_of_turn
elif reason == "eom":
stop_reason = StopReason.end_of_message
elif reason == "length":
stop_reason = StopReason.out_of_tokens
if stop_reason is None:
stop_reason = StopReason.out_of_tokens
completion_message = formatter.decode_assistant_message_from_content( completion_message = formatter.decode_assistant_message_from_content(
text_from_choice(choice), stop_reason text_from_choice(choice), get_stop_reason(choice.finish_reason)
) )
return ChatCompletionResponse( return ChatCompletionResponse(
completion_message=completion_message, completion_message=completion_message,
@ -77,10 +87,45 @@ def process_chat_completion_response(
) )
async def process_completion_stream_response(
stream: AsyncGenerator[OpenAICompatCompletionResponse, None], formatter: ChatFormat
) -> AsyncGenerator:
stop_reason = None
async for chunk in stream:
choice = chunk.choices[0]
finish_reason = choice.finish_reason
if finish_reason:
if finish_reason in ["stop", "eos", "eos_token"]:
stop_reason = StopReason.end_of_turn
elif finish_reason == "length":
stop_reason = StopReason.out_of_tokens
break
text = text_from_choice(choice)
if text == "<|eot_id|>":
stop_reason = StopReason.end_of_turn
text = ""
continue
elif text == "<|eom_id|>":
stop_reason = StopReason.end_of_message
text = ""
continue
yield CompletionResponseStreamChunk(
delta=text,
stop_reason=stop_reason,
)
yield CompletionResponseStreamChunk(
delta="",
stop_reason=stop_reason,
)
async def process_chat_completion_stream_response( async def process_chat_completion_stream_response(
request: ChatCompletionRequest, stream: AsyncGenerator[OpenAICompatCompletionResponse, None], formatter: ChatFormat
stream: AsyncGenerator[OpenAICompatCompletionResponse, None],
formatter: ChatFormat,
) -> AsyncGenerator: ) -> AsyncGenerator:
yield ChatCompletionResponseStreamChunk( yield ChatCompletionResponseStreamChunk(
event=ChatCompletionResponseEvent( event=ChatCompletionResponseEvent(

View file

@ -23,6 +23,13 @@ from llama_models.sku_list import resolve_model
from llama_stack.providers.utils.inference import supported_inference_models from llama_stack.providers.utils.inference import supported_inference_models
def completion_request_to_prompt(
request: CompletionRequest, formatter: ChatFormat
) -> str:
model_input = formatter.encode_content(request.content)
return formatter.tokenizer.decode(model_input.tokens)
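
A hedged usage sketch of the new helper. It assumes a Tokenizer.get_instance() convenience and a CompletionRequest with plain-text content, as used elsewhere in this diff; if get_instance() is not available in your llama_models version, construct the Tokenizer from a tokenizer model path instead:

from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.tokenizer import Tokenizer
from llama_stack.apis.inference import CompletionRequest

formatter = ChatFormat(Tokenizer.get_instance())
request = CompletionRequest(model="Llama3.2-1B", content="Roses are red,")
prompt = completion_request_to_prompt(request, formatter)  # plain string for text-in providers
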
def chat_completion_request_to_prompt( def chat_completion_request_to_prompt(
request: ChatCompletionRequest, formatter: ChatFormat request: ChatCompletionRequest, formatter: ChatFormat
) -> str: ) -> str:

View file

@ -152,7 +152,7 @@ def severity(levelname: str) -> LogSeverity:
elif levelname == "INFO": elif levelname == "INFO":
return LogSeverity.INFO return LogSeverity.INFO
elif levelname == "WARNING": elif levelname == "WARNING":
return LogSeverity.WARNING return LogSeverity.WARN
elif levelname == "ERROR": elif levelname == "ERROR":
return LogSeverity.ERROR return LogSeverity.ERROR
elif levelname == "CRITICAL": elif levelname == "CRITICAL":

View file

@ -2,7 +2,7 @@ blobfile
fire fire
httpx httpx
huggingface-hub huggingface-hub
llama-models>=0.0.41 llama-models>=0.0.43
prompt-toolkit prompt-toolkit
python-dotenv python-dotenv
pydantic>=2 pydantic>=2

Some files were not shown because too many files have changed in this diff.