Add ollama/pull-models.sh

This commit is contained in:
Ashwin Bharambe 2024-11-18 10:57:20 -08:00
parent fa1d29cfdc
commit 1ecaf2cb3c
16 changed files with 305 additions and 289 deletions

View file

@ -1,30 +1,71 @@
services: services:
ollama: ollama:
image: ollama/ollama:latest image: ollama/ollama:latest
network_mode: "host" network_mode: ${NETWORK_MODE:-bridge}
volumes: volumes:
- ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast - ~/.ollama:/root/.ollama
ports: ports:
- "11434:11434" - "11434:11434"
environment:
OLLAMA_DEBUG: 1
command: [] command: []
deploy:
resources:
limits:
memory: 8G # Set maximum memory
reservations:
memory: 8G # Set minimum memory reservation
# healthcheck:
# # ugh, no CURL in ollama image
# test: ["CMD", "curl", "-f", "http://ollama:11434"]
# interval: 10s
# timeout: 5s
# retries: 5
ollama-init:
image: ollama/ollama:latest
depends_on:
- ollama
# condition: service_healthy
network_mode: ${NETWORK_MODE:-bridge}
environment:
- OLLAMA_HOST=ollama
- INFERENCE_MODEL=${INFERENCE_MODEL}
- SAFETY_MODEL=${SAFETY_MODEL:-}
volumes:
- ~/.ollama:/root/.ollama
- ./pull-models.sh:/pull-models.sh
entrypoint: ["/pull-models.sh"]
llamastack: llamastack:
depends_on: depends_on:
- ollama ollama:
image: llamastack/distribution-ollama condition: service_started
network_mode: "host" ollama-init:
condition: service_started
image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama}
network_mode: ${NETWORK_MODE:-bridge}
volumes: volumes:
- ~/.llama:/root/.llama - ~/.llama:/root/.llama
# Link to ollama run.yaml file # Link to ollama run.yaml file
- ./run.yaml:/root/my-run.yaml - ~/local/llama-stack/:/app/llama-stack-source
- ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
ports: ports:
- "5000:5000" - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
# Hack: wait for ollama server to start before starting docker environment:
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml" - INFERENCE_MODEL=${INFERENCE_MODEL}
- SAFETY_MODEL=${SAFETY_MODEL:-}
- OLLAMA_URL=http://ollama:11434
entrypoint: >
python -m llama_stack.distribution.server.server /root/my-run.yaml \
--port ${LLAMA_STACK_PORT:-5001}
deploy: deploy:
restart_policy: restart_policy:
condition: on-failure condition: on-failure
delay: 3s delay: 10s
max_attempts: 5 max_attempts: 3
window: 60s window: 60s
volumes: volumes:
ollama: ollama:
ollama-init:
llamastack:

View file

@ -0,0 +1,18 @@
#!/bin/sh
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Preload every model named in INFERENCE_MODEL and SAFETY_MODEL by invoking
# `ollama run` on each one in turn. The first failure aborts with status 1.
# Both variables are expanded unquoted in the loop on purpose, so an empty or
# unset SAFETY_MODEL simply contributes no iteration.

# With no model named at all the loop below would run zero times and the
# script would still report success; treat that as a configuration error.
if [ -z "${INFERENCE_MODEL}${SAFETY_MODEL}" ]; then
  echo "No models specified: set INFERENCE_MODEL (and optionally SAFETY_MODEL)" >&2
  exit 1
fi

echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..."
for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do
  echo "Preloading $model..."
  if ! ollama run "$model"; then
    # Diagnostics go to stderr so they are not mixed into normal output.
    echo "Failed to pull and run $model" >&2
    exit 1
  fi
done
echo "All models pulled successfully"

View file

@ -1,13 +1,12 @@
version: '2' version: '2'
built_at: 2024-11-17 19:33:00
image_name: ollama image_name: ollama
docker_image: null docker_image: null
conda_env: null conda_env: null
apis: apis:
- memory
- agents - agents
- safety
- inference - inference
- memory
- safety
- telemetry - telemetry
providers: providers:
inference: inference:
@ -46,11 +45,11 @@ metadata_store:
models: models:
- metadata: {} - metadata: {}
model_id: ${env.INFERENCE_MODEL} model_id: ${env.INFERENCE_MODEL}
provider_id: ollama-inference provider_id: ollama
provider_model_id: null provider_model_id: null
- metadata: {} - metadata: {}
model_id: ${env.SAFETY_MODEL} model_id: ${env.SAFETY_MODEL}
provider_id: ollama-safety provider_id: ollama
provider_model_id: null provider_model_id: null
shields: shields:
- params: null - params: null

View file

@ -3,10 +3,10 @@ image_name: ollama
docker_image: null docker_image: null
conda_env: null conda_env: null
apis: apis:
- memory
- agents - agents
- safety
- inference - inference
- memory
- safety
- telemetry - telemetry
providers: providers:
inference: inference:
@ -45,7 +45,7 @@ metadata_store:
models: models:
- metadata: {} - metadata: {}
model_id: ${env.INFERENCE_MODEL} model_id: ${env.INFERENCE_MODEL}
provider_id: ollama-inference provider_id: ollama
provider_model_id: null provider_model_id: null
shields: [] shields: []
memory_banks: [] memory_banks: []

View file

@ -1,13 +1,12 @@
version: '2' version: '2'
built_at: 2024-11-17 19:33:00
image_name: remote-vllm image_name: remote-vllm
docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3 docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
conda_env: null conda_env: null
apis: apis:
- memory
- agents - agents
- safety
- inference - inference
- memory
- safety
- telemetry - telemetry
providers: providers:
inference: inference:

View file

@ -3,10 +3,10 @@ image_name: remote-vllm
docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3 docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
conda_env: null conda_env: null
apis: apis:
- memory
- agents - agents
- safety
- inference - inference
- memory
- safety
- telemetry - telemetry
providers: providers:
inference: inference:

View file

@ -1,13 +1,12 @@
version: '2' version: '2'
built_at: 2024-11-17 19:33:00
image_name: tgi image_name: tgi
docker_image: llamastack/distribution-tgi:test-0.0.52rc3 docker_image: llamastack/distribution-tgi:test-0.0.52rc3
conda_env: null conda_env: null
apis: apis:
- memory
- agents - agents
- safety
- inference - inference
- memory
- safety
- telemetry - telemetry
providers: providers:
inference: inference:
@ -18,7 +17,7 @@ providers:
- provider_id: tgi-safety - provider_id: tgi-safety
provider_type: remote::tgi provider_type: remote::tgi
config: config:
url: ${env.SAFETY_TGI_URL} url: ${env.TGI_SAFETY_URL}
memory: memory:
- provider_id: faiss - provider_id: faiss
provider_type: inline::faiss provider_type: inline::faiss

View file

@ -3,10 +3,10 @@ image_name: tgi
docker_image: llamastack/distribution-tgi:test-0.0.52rc3 docker_image: llamastack/distribution-tgi:test-0.0.52rc3
conda_env: null conda_env: null
apis: apis:
- memory
- agents - agents
- safety
- inference - inference
- memory
- safety
- telemetry - telemetry
providers: providers:
inference: inference:

View file

@ -2,27 +2,16 @@
The `llamastack/distribution-ollama` distribution consists of the following provider configurations. The `llamastack/distribution-ollama` distribution consists of the following provider configurations.
Provider Configuration | API | Provider(s) |
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ |-----|-------------|
┃ API ┃ Provider(s) ┃ | agents | `inline::meta-reference` |
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ | inference | `remote::ollama` |
│ agents │ `inline::meta-reference` | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
│ inference │ `remote::ollama` | safety | `inline::llama-guard` |
│ memory │ `inline::faiss`, `remote::chromadb`, `remote::pgvector` | telemetry | `inline::meta-reference` |
│ safety │ `inline::llama-guard`
│ telemetry │ `inline::meta-reference`
└───────────┴─────────────────────────────────────────────────────────┘
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.### Environment Variables You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.### Models
The following environment variables can be configured:
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
- `INFERENCE_MODEL`: Inference model loaded into the TGI server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `OLLAMA_URL`: URL of the Ollama server (default: `http://host.docker.internal:11434`)
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
### Models
The following models are configured by default: The following models are configured by default:
- `${env.INFERENCE_MODEL}` - `${env.INFERENCE_MODEL}`

View file

@ -2,29 +2,16 @@
The `llamastack/distribution-remote-vllm` distribution consists of the following provider configurations: The `llamastack/distribution-remote-vllm` distribution consists of the following provider configurations:
Provider Configuration | API | Provider(s) |
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ |-----|-------------|
┃ API ┃ Provider(s) ┃ | agents | `inline::meta-reference` |
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ | inference | `remote::vllm` |
│ agents │ `inline::meta-reference` | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
│ inference │ `remote::vllm` | safety | `inline::llama-guard` |
│ memory │ `inline::faiss`, `remote::chromadb`, `remote::pgvector` | telemetry | `inline::meta-reference` |
│ safety │ `inline::llama-guard`
│ telemetry │ `inline::meta-reference`
└───────────┴─────────────────────────────────────────────────────────┘
You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.### Environment Variables You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
The following environment variables can be configured:
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
- `INFERENCE_MODEL`: Inference model loaded into the vLLM server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `VLLM_URL`: URL of the vLLM server with the main inference model (default: `http://host.docker.internal:5100}/v1`)
- `MAX_TOKENS`: Maximum number of tokens for generation (default: `4096`)
- `SAFETY_VLLM_URL`: URL of the vLLM server with the safety model (default: `http://host.docker.internal:5101/v1`)
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
### Models ### Models
The following models are configured by default: The following models are configured by default:

View file

@ -2,128 +2,127 @@
The `llamastack/distribution-tgi` distribution consists of the following provider configurations. The `llamastack/distribution-tgi` distribution consists of the following provider configurations.
Provider Configuration | API | Provider(s) |
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ |-----|-------------|
┃ API ┃ Provider(s) ┃ | agents | `inline::meta-reference` |
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ | inference | `remote::tgi` |
│ agents │ `inline::meta-reference` | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
│ inference │ `remote::tgi` | safety | `inline::llama-guard` |
│ memory │ `inline::faiss`, `remote::chromadb`, `remote::pgvector` | telemetry | `inline::meta-reference` |
│ safety │ `inline::llama-guard`
│ telemetry │ `inline::meta-reference`
└───────────┴─────────────────────────────────────────────────────────┘
You can use this distribution if you have GPUs and want to run an independent TGI server container for running inference.### Environment Variables You can use this distribution if you have GPUs and want to run an independent TGI server container for running inference.
### Environment Variables
The following environment variables can be configured: The following environment variables can be configured:
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`) - `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
- `INFERENCE_MODEL`: Inference model loaded into the TGI server (default: `meta-llama/Llama-3.2-3B-Instruct`) - `INFERENCE_MODEL`: Inference model loaded into the TGI server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `TGI_URL`: URL of the TGI server with the main inference model (default: `http://host.docker.internal:8080}/v1`) - `TGI_URL`: URL of the TGI server with the main inference model (default: `http://127.0.0.1:8080/v1`)
- `SAFETY_TGI_URL`: URL of the TGI server with the safety model (default: `http://host.docker.internal:8081/v1`) - `TGI_SAFETY_URL`: URL of the TGI server with the safety model (default: `http://127.0.0.1:8081/v1`)
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`) - `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
### Models
The following models are configured by default:
- `${env.INFERENCE_MODEL}`
- `${env.SAFETY_MODEL}`
## Using Docker Compose ## Setting up TGI server
You can use `docker compose` to start a TGI container and Llama Stack server container together. Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. Here is a sample script to start a TGI server locally via Docker:
```bash ```bash
$ cd distributions/tgi; docker compose up export TGI_INFERENCE_PORT=8080
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export CUDA_VISIBLE_DEVICES=0
docker run --rm -it \
-v $HOME/.cache/huggingface:/data \
-p $TGI_INFERENCE_PORT:$TGI_INFERENCE_PORT \
--gpus $CUDA_VISIBLE_DEVICES \
ghcr.io/huggingface/text-generation-inference:2.3.1 \
--dtype bfloat16 \
--usage-stats off \
--sharded false \
--cuda-memory-fraction 0.7 \
--model-id $INFERENCE_MODEL \
--port $TGI_INFERENCE_PORT
``` ```
The script will first start up TGI server, then start up Llama Stack distribution server hooking up to the remote TGI provider for inference. You should be able to see the following outputs -- If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
```bash
[text-generation-inference] | 2024-10-15T18:56:33.810397Z INFO text_generation_router::server: router/src/server.rs:1813: Using config Some(Llama)
[text-generation-inference] | 2024-10-15T18:56:33.810448Z WARN text_generation_router::server: router/src/server.rs:1960: Invalid hostname, defaulting to 0.0.0.0
[text-generation-inference] | 2024-10-15T18:56:33.864143Z INFO text_generation_router::server: router/src/server.rs:2353: Connected
INFO: Started server process [1]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://[::]:5001 (Press CTRL+C to quit)
```
To kill the server
```bash
docker compose down
```
### Conda: TGI server + llama stack run
If you wish to separately spin up a TGI server, and connect with Llama Stack, you may use the following commands.
#### Start TGI server locally
- Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint.
```bash ```bash
docker run --rm -it -v $HOME/.cache/huggingface:/data \ export TGI_SAFETY_PORT=8081
-p 5009:5009 --gpus all \ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
ghcr.io/huggingface/text-generation-inference:latest \ export CUDA_VISIBLE_DEVICES=1
--dtype bfloat16 --usage-stats on --sharded false \
--model-id meta-llama/Llama-3.2-3B-Instruct --port 5009 docker run --rm -it \
-v $HOME/.cache/huggingface:/data \
-p $TGI_SAFETY_PORT:$TGI_SAFETY_PORT \
--gpus $CUDA_VISIBLE_DEVICES \
ghcr.io/huggingface/text-generation-inference:2.3.1 \
--dtype bfloat16 \
--usage-stats off \
--sharded false \
--model-id $SAFETY_MODEL \
--port $TGI_SAFETY_PORT
``` ```
#### Start Llama Stack server pointing to TGI server ## Running Llama Stack with TGI as the inference provider
**Via Conda** Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=5001
docker run \
--network host \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-tgi \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
docker run \
--network host \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run-with-safety.yaml:/root/my-run.yaml \
llamastack/distribution-tgi \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env TGI_SAFETY_URL=http://host.docker.internal:$TGI_SAFETY_PORT
```
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
```bash ```bash
llama stack build --template tgi --image-type conda llama stack build --template tgi --image-type conda
# -- start a TGI server endpoint llama stack run ./run.yaml \
llama stack run ./gpu/run.yaml --port 5001 \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT
``` ```
**Via Docker** If you are using Llama Stack Safety / Shield APIs, use:
```bash ```bash
docker run --network host -it -p 5001:5001 \ llama stack run ./run-with-safety.yaml \
-v ./run.yaml:/root/my-run.yaml --gpus=all \ --port 5001 \
llamastack/distribution-tgi \ --env INFERENCE_MODEL=$INFERENCE_MODEL \
--yaml_config /root/my-run.yaml --env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT \
``` --env SAFETY_MODEL=$SAFETY_MODEL \
--env TGI_SAFETY_URL=http://127.0.0.1:$TGI_SAFETY_PORT
We have provided a template `run.yaml` file in the `distributions/tgi` directory. Make sure in your `run.yaml` file, you inference provider is pointing to the correct TGI server endpoint. E.g.
```yaml
inference:
- provider_id: tgi0
provider_type: remote::tgi
config:
url: http://127.0.0.1:5009
```
### (Optional) Update Model Serving Configuration
To serve a new model with `tgi`, change the docker command flag `--model-id <model-to-serve>`.
This can be done by edit the `command` args in `compose.yaml`. E.g. Replace "Llama-3.2-1B-Instruct" with the model you want to serve.
```yaml
command: >
--dtype bfloat16 --usage-stats on --sharded false
--model-id meta-llama/Llama-3.2-1B-Instruct
--port 5009 --cuda-memory-fraction 0.7
```
or by changing the docker run command's `--model-id` flag
```bash
docker run --rm -it -v $HOME/.cache/huggingface:/data \
-p 5009:5009 --gpus all \
ghcr.io/huggingface/text-generation-inference:latest \
--dtype bfloat16 --usage-stats off --sharded false \
--model-id meta-llama/Llama-3.2-3B-Instruct --port 5009
```
In `run.yaml`, make sure you point the correct server endpoint to the TGI server endpoint serving your model.
```yaml
inference:
- provider_id: tgi0
provider_type: remote::tgi
config:
url: http://127.0.0.1:5009
``` ```

View file

@ -21,7 +21,9 @@ def find_template_dirs(templates_dir: Path) -> Iterator[Path]:
if not templates_dir.exists(): if not templates_dir.exists():
raise FileNotFoundError(f"Templates directory not found: {templates_dir}") raise FileNotFoundError(f"Templates directory not found: {templates_dir}")
return (d for d in templates_dir.iterdir() if d.is_dir()) return (
d for d in templates_dir.iterdir() if d.is_dir() and d.name != "__pycache__"
)
def process_template(template_dir: Path, progress) -> None: def process_template(template_dir: Path, progress) -> None:

View file

@ -28,11 +28,11 @@ def get_distribution_template() -> DistributionTemplate:
inference_model = ModelInput( inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}", model_id="${env.INFERENCE_MODEL}",
provider_id="ollama-inference", provider_id="ollama",
) )
safety_model = ModelInput( safety_model = ModelInput(
model_id="${env.SAFETY_MODEL}", model_id="${env.SAFETY_MODEL}",
provider_id="ollama-safety", provider_id="ollama",
) )
return DistributionTemplate( return DistributionTemplate(

View file

@ -4,18 +4,13 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from io import StringIO
from pathlib import Path from pathlib import Path
from typing import Dict, List, Literal, Optional, Set, Tuple from typing import Dict, List, Literal, Optional, Tuple
import jinja2 import jinja2
import yaml import yaml
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from rich.console import Console
from rich.table import Table
from llama_stack.distribution.datatypes import ( from llama_stack.distribution.datatypes import (
Api, Api,
BuildConfig, BuildConfig,
@ -80,12 +75,12 @@ class RunConfigSettings(BaseModel):
] ]
# Get unique set of APIs from providers # Get unique set of APIs from providers
apis: Set[str] = set(providers.keys()) apis = list(sorted(providers.keys()))
return StackRunConfig( return StackRunConfig(
image_name=name, image_name=name,
docker_image=docker_image, docker_image=docker_image,
apis=list(apis), apis=apis,
providers=provider_configs, providers=provider_configs,
metadata_store=SqliteKVStoreConfig.sample_run_config( metadata_store=SqliteKVStoreConfig.sample_run_config(
__distro_dir__=f"distributions/{name}", __distro_dir__=f"distributions/{name}",
@ -111,7 +106,7 @@ class DistributionTemplate(BaseModel):
template_path: Path template_path: Path
# Optional configuration # Optional configuration
docker_compose_env_vars: Optional[Dict[str, Tuple[str, str]]] = None run_config_env_vars: Optional[Dict[str, Tuple[str, str]]] = None
docker_image: Optional[str] = None docker_image: Optional[str] = None
default_models: Optional[List[ModelInput]] = None default_models: Optional[List[ModelInput]] = None
@ -128,20 +123,12 @@ class DistributionTemplate(BaseModel):
) )
def generate_markdown_docs(self) -> str: def generate_markdown_docs(self) -> str:
"""Generate markdown documentation using both Jinja2 templates and rich tables.""" providers_table = "| API | Provider(s) |\n"
# First generate the providers table using rich providers_table += "|-----|-------------|\n"
output = StringIO()
console = Console(file=output, force_terminal=False)
table = Table(title="Provider Configuration", show_header=True)
table.add_column("API", style="bold")
table.add_column("Provider(s)")
for api, providers in sorted(self.providers.items()): for api, providers in sorted(self.providers.items()):
table.add_row(api, ", ".join(f"`{p}`" for p in providers)) providers_str = ", ".join(f"`{p}`" for p in providers)
providers_table += f"| {api} | {providers_str} |\n"
console.print(table)
providers_table = output.getvalue()
template = self.template_path.read_text() template = self.template_path.read_text()
# Render template with rich-generated table # Render template with rich-generated table
@ -152,7 +139,7 @@ class DistributionTemplate(BaseModel):
description=self.description, description=self.description,
providers=self.providers, providers=self.providers,
providers_table=providers_table, providers_table=providers_table,
docker_compose_env_vars=self.docker_compose_env_vars, run_config_env_vars=self.run_config_env_vars,
default_models=self.default_models, default_models=self.default_models,
) )

View file

@ -6,120 +6,116 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following
You can use this distribution if you have GPUs and want to run an independent TGI server container for running inference. You can use this distribution if you have GPUs and want to run an independent TGI server container for running inference.
{%- if docker_compose_env_vars %} {% if run_config_env_vars %}
### Environment Variables ### Environment Variables
The following environment variables can be configured: The following environment variables can be configured:
{% for var, (default_value, description) in docker_compose_env_vars.items() %} {% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) - `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %} {% endfor %}
{% endif %} {% endif %}
{%- if default_models %}
### Models
The following models are configured by default: ## Setting up TGI server
{% for model in default_models %}
- `{{ model.model_id }}`
{% endfor %}
{% endif %}
Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. Here is a sample script to start a TGI server locally via Docker:
## Using Docker Compose
You can use `docker compose` to start a TGI container and Llama Stack server container together.
```bash ```bash
$ cd distributions/{{ name }}; docker compose up export TGI_INFERENCE_PORT=8080
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export CUDA_VISIBLE_DEVICES=0
docker run --rm -it \
-v $HOME/.cache/huggingface:/data \
-p $TGI_INFERENCE_PORT:$TGI_INFERENCE_PORT \
--gpus $CUDA_VISIBLE_DEVICES \
ghcr.io/huggingface/text-generation-inference:2.3.1 \
--dtype bfloat16 \
--usage-stats off \
--sharded false \
--cuda-memory-fraction 0.7 \
--model-id $INFERENCE_MODEL \
--port $TGI_INFERENCE_PORT
``` ```
The script will first start up TGI server, then start up Llama Stack distribution server hooking up to the remote TGI provider for inference. You should be able to see the following outputs -- If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
```bash
[text-generation-inference] | 2024-10-15T18:56:33.810397Z INFO text_generation_router::server: router/src/server.rs:1813: Using config Some(Llama)
[text-generation-inference] | 2024-10-15T18:56:33.810448Z WARN text_generation_router::server: router/src/server.rs:1960: Invalid hostname, defaulting to 0.0.0.0
[text-generation-inference] | 2024-10-15T18:56:33.864143Z INFO text_generation_router::server: router/src/server.rs:2353: Connected
INFO: Started server process [1]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://[::]:5001 (Press CTRL+C to quit)
```
To kill the server
```bash
docker compose down
```
### Conda: TGI server + llama stack run
If you wish to separately spin up a TGI server, and connect with Llama Stack, you may use the following commands.
#### Start TGI server locally
- Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint.
```bash ```bash
docker run --rm -it -v $HOME/.cache/huggingface:/data \ export TGI_SAFETY_PORT=8081
-p 5009:5009 --gpus all \ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
ghcr.io/huggingface/text-generation-inference:latest \ export CUDA_VISIBLE_DEVICES=1
--dtype bfloat16 --usage-stats on --sharded false \
--model-id meta-llama/Llama-3.2-3B-Instruct --port 5009 docker run --rm -it \
-v $HOME/.cache/huggingface:/data \
-p $TGI_SAFETY_PORT:$TGI_SAFETY_PORT \
--gpus $CUDA_VISIBLE_DEVICES \
ghcr.io/huggingface/text-generation-inference:2.3.1 \
--dtype bfloat16 \
--usage-stats off \
--sharded false \
--model-id $SAFETY_MODEL \
--port $TGI_SAFETY_PORT
``` ```
#### Start Llama Stack server pointing to TGI server ## Running Llama Stack with TGI as the inference provider
**Via Conda** Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=5001
docker run \
--network host \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
docker run \
--network host \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run-with-safety.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env TGI_SAFETY_URL=http://host.docker.internal:$TGI_SAFETY_PORT
```
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
```bash ```bash
llama stack build --template {{ name }} --image-type conda llama stack build --template {{ name }} --image-type conda
# -- start a TGI server endpoint llama stack run ./run.yaml \
llama stack run ./gpu/run.yaml --port 5001 \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT
``` ```
**Via Docker** If you are using Llama Stack Safety / Shield APIs, use:
```bash ```bash
docker run --network host -it -p 5001:5001 \ llama stack run ./run-with-safety.yaml \
-v ./run.yaml:/root/my-run.yaml --gpus=all \ --port 5001 \
llamastack/distribution-{{ name }} \ --env INFERENCE_MODEL=$INFERENCE_MODEL \
--yaml_config /root/my-run.yaml --env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT \
``` --env SAFETY_MODEL=$SAFETY_MODEL \
--env TGI_SAFETY_URL=http://127.0.0.1:$TGI_SAFETY_PORT
We have provided a template `run.yaml` file in the `distributions/{{ name }}` directory. Make sure in your `run.yaml` file, you inference provider is pointing to the correct TGI server endpoint. E.g.
```yaml
inference:
- provider_id: tgi0
provider_type: remote::tgi
config:
url: http://127.0.0.1:5009
```
### (Optional) Update Model Serving Configuration
To serve a new model with `tgi`, change the docker command flag `--model-id <model-to-serve>`.
This can be done by edit the `command` args in `compose.yaml`. E.g. Replace "Llama-3.2-1B-Instruct" with the model you want to serve.
```yaml
command: >
--dtype bfloat16 --usage-stats on --sharded false
--model-id meta-llama/Llama-3.2-1B-Instruct
--port 5009 --cuda-memory-fraction 0.7
```
or by changing the docker run command's `--model-id` flag
```bash
docker run --rm -it -v $HOME/.cache/huggingface:/data \
-p 5009:5009 --gpus all \
ghcr.io/huggingface/text-generation-inference:latest \
--dtype bfloat16 --usage-stats off --sharded false \
--model-id meta-llama/Llama-3.2-3B-Instruct --port 5009
```
In `run.yaml`, make sure you point the correct server endpoint to the TGI server endpoint serving your model.
```yaml
inference:
- provider_id: tgi0
provider_type: remote::tgi
config:
url: http://127.0.0.1:5009
``` ```

View file

@ -60,7 +60,7 @@ def get_distribution_template() -> DistributionTemplate:
provider_id="tgi-safety", provider_id="tgi-safety",
provider_type="remote::tgi", provider_type="remote::tgi",
config=TGIImplConfig.sample_run_config( config=TGIImplConfig.sample_run_config(
url="${env.SAFETY_TGI_URL}", url="${env.TGI_SAFETY_URL}",
), ),
), ),
], ],
@ -72,7 +72,7 @@ def get_distribution_template() -> DistributionTemplate:
default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")], default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
), ),
}, },
docker_compose_env_vars={ run_config_env_vars={
"LLAMASTACK_PORT": ( "LLAMASTACK_PORT": (
"5001", "5001",
"Port for the Llama Stack distribution server", "Port for the Llama Stack distribution server",
@ -82,11 +82,11 @@ def get_distribution_template() -> DistributionTemplate:
"Inference model loaded into the TGI server", "Inference model loaded into the TGI server",
), ),
"TGI_URL": ( "TGI_URL": (
"http://host.docker.internal:8080}/v1", "http://127.0.0.1:8080}/v1",
"URL of the TGI server with the main inference model", "URL of the TGI server with the main inference model",
), ),
"SAFETY_TGI_URL": ( "TGI_SAFETY_URL": (
"http://host.docker.internal:8081/v1", "http://127.0.0.1:8081/v1",
"URL of the TGI server with the safety model", "URL of the TGI server with the safety model",
), ),
"SAFETY_MODEL": ( "SAFETY_MODEL": (