Adding docker-compose.yaml, starting to simplify

Ashwin Bharambe 2024-11-16 10:56:38 -08:00
parent e4509cb568
commit f38e76ee98
14 changed files with 516 additions and 386 deletions

View file

@@ -9,25 +9,30 @@
 # Similarly change "host.docker.internal" to "localhost" in the run.yaml file
 #
 services:
-  vllm-0:
+  vllm-inference:
     image: vllm/vllm-openai:latest
     volumes:
       - $HOME/.cache/huggingface:/root/.cache/huggingface
-    # network_mode: "host"
+    network_mode: ${NETWORK_MODE:-bridged}
     ports:
-      - "5100:5100"
+      - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=0
+      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
       - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
     command: >
       --gpu-memory-utilization 0.75
-      --model meta-llama/Llama-3.1-8B-Instruct
+      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
       --enforce-eager
       --max-model-len 8192
       --max-num-seqs 16
-      --port 5100
+      --port ${VLLM_INFERENCE_PORT:-5100}
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
     deploy:
       resources:
         reservations:
@@ -35,25 +40,34 @@ services:
           - driver: nvidia
             capabilities: [gpu]
     runtime: nvidia
-  vllm-1:
+  # A little trick:
+  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
+  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
+  vllm-${VLLM_SAFETY_MODEL:+safety}:
     image: vllm/vllm-openai:latest
     volumes:
       - $HOME/.cache/huggingface:/root/.cache/huggingface
-    # network_mode: "host"
+    network_mode: ${NETWORK_MODE:-bridged}
     ports:
-      - "5101:5101"
+      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=1
+      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
       - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
     command: >
       --gpu-memory-utilization 0.75
-      --model meta-llama/Llama-Guard-3-1B
+      --model ${VLLM_SAFETY_MODEL}
      --enforce-eager
       --max-model-len 8192
       --max-num-seqs 16
-      --port 5101
+      --port ${VLLM_SAFETY_PORT:-5101}
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
     deploy:
       resources:
         reservations:
@@ -63,23 +77,25 @@ services:
     runtime: nvidia
   llamastack:
     depends_on:
-      - vllm-0
-      - vllm-1
+      - vllm-inference:
+          condition: service_healthy
+      - vllm-${VLLM_SAFETY_MODEL:+safety}:
+          condition: service_healthy
     # image: llamastack/distribution-remote-vllm
     image: llamastack/distribution-remote-vllm:test-0.0.52rc3
     volumes:
       - ~/.llama:/root/.llama
-      - ~/local/llama-stack/distributions/remote-vllm/run.yaml:/root/llamastack-run-remote-vllm.yaml
-    # network_mode: "host"
+      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
+    network_mode: ${NETWORK_MODE:-bridged}
     environment:
-      - LLAMA_INFERENCE_VLLM_URL=${LLAMA_INFERENCE_VLLM_URL:-http://host.docker.internal:5100/v1}
-      - LLAMA_INFERENCE_MODEL=${LLAMA_INFERENCE_MODEL:-Llama3.1-8B-Instruct}
+      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
+      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
       - MAX_TOKENS=${MAX_TOKENS:-4096}
       - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
-      - LLAMA_SAFETY_VLLM_URL=${LLAMA_SAFETY_VLLM_URL:-http://host.docker.internal:5101/v1}
-      - LLAMA_SAFETY_MODEL=${LLAMA_SAFETY_MODEL:-Llama-Guard-3-1B}
+      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
     ports:
-      - "5001:5001"
+      - "${LLAMASTACK_PORT:-5001}:${LLAMASTACK_PORT:-5001}"
     # Hack: wait for vLLM server to start before starting docker
     entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
     deploy:
@@ -89,6 +105,6 @@ services:
           max_attempts: 5
           window: 60s
 volumes:
-  vllm-0:
-  vllm-1:
+  vllm-inference:
+  vllm-safety:
   llamastack:
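Taken together, the parameterization above lets the same compose file serve an inference-only stack or an inference-plus-safety stack. A rough usage sketch, assuming Docker Compose v2 is run from the directory containing this file and that `HF_TOKEN` is exported:

```bash
# Inference-only stack: VLLM_SAFETY_MODEL is unset, so the safety service key
# expands to "vllm-" and (per the comment in the file) is ignored by docker compose.
export HF_TOKEN="<your Hugging Face token>"
docker compose up -d

# Inference + safety: setting VLLM_SAFETY_MODEL materializes the vllm-safety service
# and switches the mounted config to run-with-safety.yaml. Model tag is illustrative.
VLLM_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B docker compose up -d
```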

View file

@@ -0,0 +1,68 @@
version: '2'
built_at: '2024-11-11T20:09:45.988375'
image_name: remote-vllm
docker_image: remote-vllm
conda_env: null
apis:
- inference
- memory
- safety
- agents
- telemetry
providers:
inference:
# serves main inference model
- provider_id: vllm-inference
provider_type: remote::vllm
config:
# NOTE: replace with "localhost" if you are running in "host" network mode
url: ${env.VLLM_URL}
max_tokens: ${env.MAX_TOKENS:4096}
api_token: fake
# serves safety llama_guard model
- provider_id: vllm-safety
provider_type: remote::vllm
config:
# NOTE: replace with "localhost" if you are running in "host" network mode
url: ${env.SAFETY_VLLM_URL}
max_tokens: ${env.MAX_TOKENS:4096}
api_token: fake
memory:
- provider_id: faiss-0
provider_type: inline::faiss
config:
kvstore:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
memory:
- provider_id: meta0
provider_type: inline::faiss
config: {}
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db"
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
models:
- model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
- model_id: ${env.SAFETY_MODEL}
provider_id: vllm-safety
shields:
- shield_id: ${env.SAFETY_MODEL}
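The `${env.VAR}` and `${env.VAR:default}` placeholders in this run configuration are resolved from the server's environment, so running it outside of compose looks roughly like the sketch below (URLs, paths, and model names are illustrative):

```bash
# Illustrative only: export the variables referenced above, then point the server
# at the config file. The URLs must match wherever the two vLLM servers actually run.
export VLLM_URL=http://localhost:5100/v1
export SAFETY_VLLM_URL=http://localhost:5101/v1
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

python -m llama_stack.distribution.server.server \
    --yaml_config run-with-safety.yaml \
    --port 5001
```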

View file

@@ -6,39 +6,25 @@ conda_env: null
 apis:
 - inference
 - memory
-- safety
 - agents
 - telemetry
 providers:
   inference:
   # serves main inference model
-  - provider_id: vllm-0
+  - provider_id: vllm-inference
     provider_type: remote::vllm
     config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
-      url: ${env.VLLM_URL:http://host.docker.internal:5100/v1}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
-  # serves safety llama_guard model
-  - provider_id: vllm-1
-    provider_type: remote::vllm
-    config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
-      url: ${env.SAFETY_VLLM_URL:http://host.docker.internal:5101/v1}
+      url: ${env.VLLM_URL}
       max_tokens: ${env.MAX_TOKENS:4096}
       api_token: fake
   memory:
-  - provider_id: faiss-0
+  - provider_id: faiss
     provider_type: inline::faiss
     config:
       kvstore:
         namespace: null
         type: sqlite
         db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config: {}
   memory:
   - provider_id: meta0
     provider_type: inline::faiss
@@ -60,9 +46,5 @@ metadata_store:
   type: sqlite
   db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
 models:
-- model_id: ${env.INFERENCE_MODEL:Llama3.1-8B-Instruct}
-  provider_id: vllm-0
-- model_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
-  provider_id: vllm-1
-shields:
-- shield_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
+- model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm-inference

View file

@@ -4,37 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Optional
-
-from llama_stack.distribution.datatypes import RemoteProviderConfig
-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
-
-DEFAULT_OLLAMA_PORT = 11434
-
-
-class OllamaImplConfig(RemoteProviderConfig):
-    port: int = DEFAULT_OLLAMA_PORT
-
-    @classmethod
-    def sample_docker_compose_config(cls) -> Optional[DockerComposeServiceConfig]:
-        return DockerComposeServiceConfig(
-            image="ollama/ollama:latest",
-            volumes=["$HOME/.ollama:/root/.ollama"],
-            devices=["nvidia.com/gpu=all"],
-            deploy={
-                "resources": {
-                    "reservations": {
-                        "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
-                    }
-                }
-            },
-            runtime="nvidia",
-            ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"],
-        )
-
-
-async def get_adapter_impl(config: RemoteProviderConfig, _deps):
+from .config import OllamaImplConfig
+
+
+async def get_adapter_impl(config: OllamaImplConfig, _deps):
     from .ollama import OllamaInferenceAdapter

     impl = OllamaInferenceAdapter(config.url)

View file

@@ -0,0 +1,65 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List
from llama_stack.distribution.datatypes import RemoteProviderConfig
from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
DEFAULT_OLLAMA_PORT = 11434
class OllamaImplConfig(RemoteProviderConfig):
port: int = DEFAULT_OLLAMA_PORT
@classmethod
def sample_docker_compose_services(cls) -> List[DockerComposeServiceConfig]:
return [
DockerComposeServiceConfig(
service_name="ollama",
image="ollama/ollama:latest",
volumes=["$HOME/.ollama:/root/.ollama"],
devices=["nvidia.com/gpu=all"],
deploy={
"resources": {
"reservations": {
"devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
}
}
},
runtime="nvidia",
ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"],
healthcheck={
"test": ["CMD", "curl", "-f", "http://ollama:11434"],
"interval": "10s",
"timeout": "5s",
"retries": 5,
},
),
DockerComposeServiceConfig(
service_name="ollama-init",
image="ollama/ollama",
depends_on={"ollama": {"condition": "service_healthy"}},
environment={
"OLLAMA_HOST": "ollama",
"OLLAMA_MODELS": "${OLLAMA_MODELS}",
},
volumes=["ollama_data:/root/.ollama"],
entrypoint=(
'sh -c \'max_attempts=30;attempt=0;echo "Waiting for Ollama server...";'
"until curl -s http://ollama:11434 > /dev/null; do"
"attempt=$((attempt + 1));"
"if [ $attempt -ge $max_attempts ]; then"
'echo "Timeout waiting for Ollama server";'
"exit 1;"
"fi;"
'echo "Attempt $attempt: Server not ready yet...";'
"sleep 5;"
"done'"
),
),
]

View file

@@ -0,0 +1,55 @@
services:
${SERVICE_NAME:-ollama}:
image: ollama/ollama:latest
ports:
- ${OLLAMA_PORT:-11434}:${OLLAMA_PORT:-11434}
volumes:
- $HOME/.ollama:/root/.ollama
devices:
- nvidia.com/gpu=all
runtime: nvidia
healthcheck:
test: ["CMD", "curl", "-f", "http://ollama:11434"]
interval: 10s
timeout: 5s
retries: 5
${SERVICE_NAME:-ollama}-init:
image: ollama/ollama
depends_on:
- ${SERVICE_NAME:-ollama}:
condition: service_healthy
environment:
- OLLAMA_HOST=ollama
- OLLAMA_MODELS=${OLLAMA_MODELS}
volumes:
- $HOME/.ollama:/root/.ollama
entrypoint: >
sh -c '
max_attempts=30;
attempt=0;
echo "Waiting for Ollama server...";
until curl -s http://ollama:11434 > /dev/null; do
attempt=$((attempt + 1));
if [ $attempt -ge $max_attempts ]; then
echo "Timeout waiting for Ollama server";
exit 1;
fi;
echo "Attempt $attempt: Server not ready yet...";
sleep 5;
done;
echo "Server ready. Pulling models...";
models="${OLLAMA_MODELS}";
for model in $models; do
echo "Pulling $model...";
if ! ollama pull "$model"; then
echo "Failed to pull $model";
exit 1;
fi;
done;
echo "All models pulled successfully"
'
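The init container above treats `OLLAMA_MODELS` as a space-separated list and pulls each entry once the Ollama server's healthcheck passes. A hypothetical invocation (model tags are illustrative):

```bash
# Sketch: start the Ollama server plus the one-shot init container that pulls models.
# OLLAMA_MODELS is read as a space-separated list by the entrypoint loop above.
export OLLAMA_MODELS="llama3.2:3b-instruct-fp16 llama-guard3:1b"
docker compose up -d

# The init container exits after all models are pulled; follow its progress with:
docker compose logs -f ollama-init
```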

View file

@@ -0,0 +1,35 @@
services:
${SERVICE_NAME:-tgi}:
image: ghcr.io/huggingface/text-generation-inference:2.3.1
network_mode: "host"
volumes:
- $HOME/.cache/huggingface:/data
ports:
- ${TGI_PORT:-8000}:${TGI_PORT:-8000}
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
- HF_HOME=/data
- HF_DATASETS_CACHE=/data
- HF_MODULES_CACHE=/data
- HF_HUB_CACHE=/data
command: >
--dtype bfloat16
--usage-stats off
--sharded false
--model-id ${TGI_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
--port ${TGI_PORT:-8000}
--cuda-memory-fraction ${TGI_CUDA_MEMORY_FRACTION:-0.8}
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
healthcheck:
test: ["CMD", "curl", "-f", "http://${SERVICE_NAME:-tgi}:${TGI_PORT:-8000}/health"]
interval: 5s
timeout: 5s
retries: 30
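A rough way to exercise this TGI service and the same `/health` endpoint its healthcheck polls (values are illustrative; the container runs with host networking, so localhost works):

```bash
# Sketch: launch TGI with an explicit model and port, then probe its health endpoint.
export TGI_MODEL=meta-llama/Llama-3.2-3B-Instruct
export TGI_PORT=8000
docker compose up -d

curl -f "http://localhost:${TGI_PORT}/health"
```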

View file

@@ -9,11 +9,6 @@ from typing import Optional
 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field

-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
-
-DEFAULT_VLLM_PORT = 8000
-

 @json_schema_type
 class VLLMInferenceAdapterConfig(BaseModel):
@@ -33,48 +28,10 @@ class VLLMInferenceAdapterConfig(BaseModel):
     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.VLLM_URL:http://host.docker.internal:5100/v1}",
+        url: str = "${env.VLLM_URL}",
     ):
         return {
             "url": url,
             "max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
             "api_token": "${env.VLLM_API_TOKEN:fake}",
         }
-
-    @classmethod
-    def sample_docker_compose_config(
-        cls,
-        port: int = DEFAULT_VLLM_PORT,
-        cuda_visible_devices: str = "0",
-        model: str = "meta-llama/Llama-3.2-3B-Instruct",
-    ) -> Optional[DockerComposeServiceConfig]:
-        return DockerComposeServiceConfig(
-            image="vllm/vllm-openai:latest",
-            volumes=["$HOME/.cache/huggingface:/root/.cache/huggingface"],
-            devices=["nvidia.com/gpu=all"],
-            deploy={
-                "resources": {
-                    "reservations": {
-                        "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
-                    }
-                }
-            },
-            runtime="nvidia",
-            ports=[f"{port}:{port}"],
-            environment={
-                "CUDA_VISIBLE_DEVICES": cuda_visible_devices,
-                "HUGGING_FACE_HUB_TOKEN": "$HF_TOKEN",
-            },
-            command=(
-                " ".join(
-                    [
-                        "--gpu-memory-utilization 0.75",
-                        f"--model {model}",
-                        "--enforce-eager",
-                        "--max-model-len 8192",
-                        "--max-num-seqs 16",
-                        f"--port {port}",
-                    ]
-                )
-            ),
-        )

View file

@@ -0,0 +1,26 @@
services:
${SERVICE_NAME:-vllm}:
image: vllm/vllm-openai:latest
ports:
- ${VLLM_PORT:-5100}:${VLLM_PORT:-5100}
volumes:
- $HOME/.cache/huggingface:/root/.cache/huggingface
devices:
- nvidia.com/gpu=all
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
environment:
- CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
- HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
command: >
--gpu-memory-utilization 0.75
--model ${VLLM_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
--enforce-eager
--max-model-len 8192
--max-num-seqs 16
--port ${VLLM_PORT:-5100}
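A rough smoke test for this single vLLM service, assuming Docker Compose v2 and a valid `HF_TOKEN`; the final probe relies on vLLM's OpenAI-compatible API (values are illustrative):

```bash
# Sketch: bring up one vLLM container from this file and confirm the endpoint answers.
export HF_TOKEN="<your Hugging Face token>"
export VLLM_MODEL=meta-llama/Llama-3.2-3B-Instruct
export VLLM_PORT=5100
docker compose up -d

# List the served model(s) via the OpenAI-compatible API.
curl "http://localhost:${VLLM_PORT}/v1/models"
```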

View file

@@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@@ -1,29 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List, Optional
from pydantic import BaseModel
class LiteralString(str):
pass # Marker class for strings we want to format with >
class DockerComposeServiceConfig(BaseModel):
"""Configuration for a single service in docker-compose."""
image: str
volumes: Optional[List[str]] = None
network_mode: str = "bridge"
ports: Optional[List[str]] = None
devices: Optional[List[str]] = None
environment: Optional[Dict[str, str]] = None
command: Optional[str] = None
depends_on: Optional[List[str]] = None
deploy: Optional[Dict[str, Any]] = None
runtime: Optional[str] = None
entrypoint: Optional[str] = None

View file

@@ -54,11 +54,11 @@ class SqliteKVStoreConfig(CommonConfig):
     )

     @classmethod
-    def sample_run_config(cls, db_name: str = "kvstore.db"):
+    def sample_run_config(cls, dir: str = "runtime", db_name: str = "kvstore.db"):
         return {
             "type": "sqlite",
             "namespace": None,
-            "db_path": "${env.SQLITE_STORE_DIR:~/.llama/runtime/" + db_name + "}",
+            "db_path": "${env.SQLITE_STORE_DIR:~/.llama/" + f"{dir}/{db_name}" + "}",
         }

View file

@@ -0,0 +1,95 @@
# Remote vLLM Distribution
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
{{ providers_table }}
You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
{%- if docker_compose_env_vars %}
### Environment Variables
The following environment variables can be configured:
{% for var, (default_value, description) in docker_compose_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}
{% if default_models %}
### Models
The following models are configured by default:
{% for model in default_models %}
- `{{ model.model_id }}`
{% endfor %}
{% endif %}
## Using Docker Compose
You can use `docker compose` to start a vLLM container and Llama Stack server container together.
```bash
$ cd distributions/{{ name }}; docker compose up
```
You will see outputs similar to following ---
```
<TO BE FILLED>
```
To kill the server
```bash
docker compose down
```
## Starting vLLM and Llama Stack separately
You can also decide to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.
#### Start vLLM server.
```bash
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--model meta-llama/Llama-3.2-3B-Instruct
```
Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) for more details.
#### Start Llama Stack server pointing to your vLLM server
We have provided a template `run.yaml` file in the `distributions/remote-vllm` directory. Please make sure to modify the `inference.provider_id` to point to your vLLM server endpoint. As an example, if your vLLM server is running on `http://127.0.0.1:8000`, your `run.yaml` file should look like the following:
```yaml
inference:
- provider_id: vllm0
provider_type: remote::vllm
config:
url: http://127.0.0.1:8000
```
**Via Conda**
If you are using Conda, you can build and run the Llama Stack server with the following commands:
```bash
cd distributions/remote-vllm
llama stack build --template remote_vllm --image-type conda
llama stack run run.yaml
```
**Via Docker**
You can use the Llama Stack Docker image to start the server with the following command:
```bash
docker run --network host -it -p 5000:5000 \
-v ~/.llama:/root/.llama \
-v ./gpu/run.yaml:/root/llamastack-run-remote-vllm.yaml \
--gpus=all \
llamastack/distribution-remote-vllm \
--yaml_config /root/llamastack-run-remote-vllm.yaml
```

View file

@@ -9,7 +9,7 @@ from datetime import datetime
 from io import StringIO
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Dict, List, Optional, Set, Tuple

 import jinja2
 import yaml
@@ -22,7 +22,6 @@ from llama_stack.distribution.datatypes import (
     Api,
     BuildConfig,
     DistributionSpec,
-    KVStoreConfig,
     ModelInput,
     Provider,
     ShieldInput,
@@ -33,53 +32,26 @@ from llama_stack.distribution.utils.dynamic import instantiate_class_type
 from llama_stack.providers.remote.inference.vllm.config import (
     VLLMInferenceAdapterConfig,
 )
-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
+from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig


-class DistributionTemplate(BaseModel):
-    """
-    Represents a Llama Stack distribution instance that can generate configuration
-    and documentation files.
-    """
-
-    name: str
-    description: str
-
-    providers: Dict[str, List[str]]
-    run_config_overrides: Dict[str, List[Provider]] = Field(default_factory=dict)
-    compose_config_overrides: Dict[str, Dict[str, DockerComposeServiceConfig]] = Field(
-        default_factory=dict
-    )
-
+class RunConfigSettings(BaseModel):
+    provider_overrides: Dict[str, List[Provider]] = Field(default_factory=dict)
     default_models: List[ModelInput]
     default_shields: Optional[List[ShieldInput]] = None

-    # Optional configuration
-    metadata_store: Optional[KVStoreConfig] = None
-    docker_compose_env_vars: Optional[Dict[str, Tuple[str, str]]] = None
-    docker_image: Optional[str] = None
-
-    @property
-    def distribution_spec(self) -> DistributionSpec:
-        return DistributionSpec(
-            description=self.description,
-            docker_image=self.docker_image,
-            providers=self.providers,
-        )
-
-    def build_config(self) -> BuildConfig:
-        return BuildConfig(
-            name=self.name,
-            distribution_spec=self.distribution_spec,
-            image_type="conda",  # default to conda, can be overridden
-        )
-
-    def run_config(self) -> StackRunConfig:
+    def run_config(
+        self,
+        name: str,
+        providers: Dict[str, List[str]],
+        docker_image: Optional[str] = None,
+    ) -> StackRunConfig:
         provider_registry = get_provider_registry()
         provider_configs = {}
-        for api_str, provider_types in self.providers.items():
-            if providers := self.run_config_overrides.get(api_str):
-                provider_configs[api_str] = providers
+        for api_str, provider_types in providers.items():
+            if api_providers := self.provider_overrides.get(api_str):
+                provider_configs[api_str] = api_providers
                 continue

             provider_type = provider_types[0]
@@ -111,83 +83,53 @@ class DistributionTemplate(BaseModel):
         ]

         # Get unique set of APIs from providers
-        apis: Set[str] = set(self.providers.keys())
+        apis: Set[str] = set(providers.keys())

         return StackRunConfig(
-            image_name=self.name,
-            docker_image=self.docker_image,
+            image_name=name,
+            docker_image=docker_image,
             built_at=datetime.now(),
             apis=list(apis),
             providers=provider_configs,
-            metadata_store=self.metadata_store,
+            metadata_store=SqliteKVStoreConfig.sample_run_config(
+                dir=f"distributions/{name}",
+                db_name="registry.db",
+            ),
             models=self.default_models,
             shields=self.default_shields or [],
         )

-    def docker_compose_config(self) -> Dict[str, Any]:
-        services = {}
-        provider_registry = get_provider_registry()
-
-        # Add provider services based on their sample_compose_config
-        for api_str, api_providers in self.providers.items():
-            if overrides := self.compose_config_overrides.get(api_str):
-                services |= overrides
-                continue
-
-            # only look at the first provider to get the compose config for now
-            # we may want to use `docker compose profiles` in the future
-            provider_type = api_providers[0]
-            provider_id = provider_type.split("::")[-1]
-
-            api = Api(api_str)
-            if provider_type not in provider_registry[api]:
-                raise ValueError(
-                    f"Unknown provider type: {provider_type} for API: {api_str}"
-                )
-
-            config_class = provider_registry[api][provider_type].config_class
-            assert (
-                config_class is not None
-            ), f"No config class for provider type: {provider_type} for API: {api_str}"
-
-            config_class = instantiate_class_type(config_class)
-            if not hasattr(config_class, "sample_docker_compose_config"):
-                continue
-
-            compose_config = config_class.sample_docker_compose_config()
-            services[provider_id] = compose_config
-
-        port = "${LLAMASTACK_PORT:-5001}"
-
-        # Add main llamastack service
-        llamastack_config = DockerComposeServiceConfig(
-            image=f"llamastack/distribution-{self.name}:latest",
-            depends_on=list(services.keys()),
-            volumes=[
-                "~/.llama:/root/.llama",
-                f"~/local/llama-stack/distributions/{self.name}/run.yaml:/root/llamastack-run-{self.name}.yaml",
-            ],
-            ports=[f"{port}:{port}"],
-            environment={
-                k: v[0] for k, v in (self.docker_compose_env_vars or {}).items()
-            },
-            entrypoint=(
-                f'bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-{self.name}.yaml --port {port}"'
-            ),
-            deploy={
-                "restart_policy": {
-                    "condition": "on-failure",
-                    "delay": "3s",
-                    "max_attempts": 5,
-                    "window": "60s",
-                }
-            },
-        )
-        services["llamastack"] = llamastack_config
-
-        return {
-            "services": {k: v.model_dump() for k, v in services.items()},
-            "volumes": {service_name: None for service_name in services.keys()},
-        }
+
+class DistributionTemplate(BaseModel):
+    """
+    Represents a Llama Stack distribution instance that can generate configuration
+    and documentation files.
+    """
+
+    name: str
+    description: str
+
+    providers: Dict[str, List[str]]
+    run_configs: Dict[str, RunConfigSettings]
+    template_path: Path
+
+    # Optional configuration
+    docker_compose_env_vars: Optional[Dict[str, Tuple[str, str]]] = None
+    docker_image: Optional[str] = None
+
+    default_models: Optional[List[ModelInput]] = None
+
+    def build_config(self) -> BuildConfig:
+        return BuildConfig(
+            name=self.name,
+            distribution_spec=DistributionSpec(
+                description=self.description,
+                docker_image=self.docker_image,
+                providers=self.providers,
+            ),
+            image_type="conda",  # default to conda, can be overridden
+        )

     def generate_markdown_docs(self) -> str:
         """Generate markdown documentation using both Jinja2 templates and rich tables."""
         # First generate the providers table using rich
@@ -204,53 +146,7 @@ class DistributionTemplate(BaseModel):
         console.print(table)
         providers_table = output.getvalue()

-        # Main documentation template
-        template = """# {{ name }} Distribution
-
-{{ description }}
-
-## Provider Configuration
-
-The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
-
-{{ providers_table }}
-
-{%- if env_vars %}
-## Environment Variables
-
-The following environment variables can be configured:
-
-{% for var, (value, description) in docker_compose_env_vars.items() %}
-- `{{ var }}`: {{ description }}
-{% endfor %}
-{%- endif %}
-
-## Example Usage
-
-### Using Docker Compose
-
-```bash
-$ cd distributions/{{ name }}
-$ docker compose up
-```
-
-## Models
-
-The following models are configured by default:
-{% for model in default_models %}
-- `{{ model.model_id }}`
-{% endfor %}
-
-{%- if default_shields %}
-## Safety Shields
-
-The following safety shields are configured:
-{% for shield in default_shields %}
-- `{{ shield.shield_id }}`
-{%- endfor %}
-{%- endif %}
-"""
+        template = self.template_path.read_text()

         # Render template with rich-generated table
         env = jinja2.Environment(trim_blocks=True, lstrip_blocks=True)
         template = env.from_string(template)
@@ -261,7 +157,6 @@ The following safety shields are configured:
             providers_table=providers_table,
             docker_compose_env_vars=self.docker_compose_env_vars,
             default_models=self.default_models,
-            default_shields=self.default_shields,
         )

     def save_distribution(self, output_dir: Path) -> None:
@@ -271,19 +166,14 @@ The following safety shields are configured:
         with open(output_dir / "build.yaml", "w") as f:
             yaml.safe_dump(build_config.model_dump(), f, sort_keys=False)

-        run_config = self.run_config()
-        serialized = run_config.model_dump()
-        with open(output_dir / "run.yaml", "w") as f:
-            yaml.safe_dump(serialized, f, sort_keys=False)
-
-        # serialized_str = yaml.dump(serialized, sort_keys=False)
-        # env_vars = set()
-        # for match in re.finditer(r"\${env\.([A-Za-z0-9_-]+)}", serialized_str):
-        #     env_vars.add(match.group(1))
-
-        docker_compose = self.docker_compose_config()
-        with open(output_dir / "compose.yaml", "w") as f:
-            yaml.safe_dump(docker_compose, f, sort_keys=False, default_flow_style=False)
+        for yaml_pth, settings in self.run_configs.items():
+            print(f"Generating {yaml_pth}")
+            print(f"Providers: {self.providers}")
+            run_config = settings.run_config(
+                self.name, self.providers, self.docker_image
+            )
+            with open(output_dir / yaml_pth, "w") as f:
+                yaml.safe_dump(run_config.model_dump(), f, sort_keys=False)

         docs = self.generate_markdown_docs()
         with open(output_dir / f"{self.name}.md", "w") as f:
@@ -291,87 +181,89 @@
     @classmethod
     def vllm_distribution(cls) -> "DistributionTemplate":
+        providers = {
+            "inference": ["remote::vllm"],
+            "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
+            "safety": ["inline::llama-guard"],
+            "agents": ["inline::meta-reference"],
+            "telemetry": ["inline::meta-reference"],
+        }
+
+        inference_provider = Provider(
+            provider_id="vllm-inference",
+            provider_type="remote::vllm",
+            config=VLLMInferenceAdapterConfig.sample_run_config(
+                url="${env.VLLM_URL}",
+            ),
+        )
+
+        inference_model = ModelInput(
+            model_id="${env.INFERENCE_MODEL}",
+            provider_id="vllm-inference",
+        )
+        safety_model = ModelInput(
+            model_id="${env.SAFETY_MODEL}",
+            provider_id="vllm-safety",
+        )
+
         return cls(
             name="remote-vllm",
             description="Use (an external) vLLM server for running LLM inference",
-            providers={
-                "inference": ["remote::vllm"],
-                "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
-                "safety": ["inline::llama-guard"],
-                "agents": ["inline::meta-reference"],
-                "telemetry": ["inline::meta-reference"],
-            },
-            run_config_overrides={
-                "inference": [
-                    Provider(
-                        provider_id="vllm-0",
-                        provider_type="remote::vllm",
-                        config=VLLMInferenceAdapterConfig.sample_run_config(
-                            url="${env.VLLM_URL:http://host.docker.internal:5100/v1}",
-                        ),
-                    ),
-                    Provider(
-                        provider_id="vllm-1",
-                        provider_type="remote::vllm",
-                        config=VLLMInferenceAdapterConfig.sample_run_config(
-                            url="${env.SAFETY_VLLM_URL:http://host.docker.internal:5101/v1}",
-                        ),
-                    ),
-                ]
-            },
-            compose_config_overrides={
-                "inference": {
-                    "vllm-0": VLLMInferenceAdapterConfig.sample_docker_compose_config(
-                        port=5100,
-                        cuda_visible_devices="0",
-                        model="${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
-                    ),
-                    "vllm-1": VLLMInferenceAdapterConfig.sample_docker_compose_config(
-                        port=5100,
-                        cuda_visible_devices="1",
-                        model="${env.SAFETY_MODEL:Llama-Guard-3-1B}",
-                    ),
-                }
-            },
-            default_models=[
-                ModelInput(
-                    model_id="${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
-                    provider_id="vllm-0",
-                ),
-                ModelInput(
-                    model_id="${env.SAFETY_MODEL:Llama-Guard-3-1B}",
-                    provider_id="vllm-1",
-                ),
-            ],
-            default_shields=[
-                ShieldInput(shield_id="${env.SAFETY_MODEL:Llama-Guard-3-1B}")
-            ],
+            template_path=Path(__file__).parent / "remote-vllm" / "doc_template.md",
+            providers=providers,
+            default_models=[inference_model, safety_model],
+            run_configs={
+                "run.yaml": RunConfigSettings(
+                    provider_overrides={
+                        "inference": [inference_provider],
+                    },
+                    default_models=[inference_model],
+                ),
+                "safety-run.yaml": RunConfigSettings(
+                    provider_overrides={
+                        "inference": [
+                            inference_provider,
+                            Provider(
+                                provider_id="vllm-safety",
+                                provider_type="remote::vllm",
+                                config=VLLMInferenceAdapterConfig.sample_run_config(
+                                    url="${env.SAFETY_VLLM_URL}",
+                                ),
+                            ),
+                        ],
+                    },
+                    default_models=[
+                        inference_model,
+                        safety_model,
+                    ],
+                    default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
+                ),
+            },
             docker_compose_env_vars={
-                # these defaults are for the Docker Compose configuration
-                "VLLM_URL": (
-                    "http://host.docker.internal:${VLLM_PORT:-5100}/v1",
-                    "URL of the vLLM server with the main inference model",
-                ),
-                "SAFETY_VLLM_URL": (
-                    "http://host.docker.internal:${SAFETY_VLLM_PORT:-5101}/v1",
-                    "URL of the vLLM server with the safety model",
-                ),
-                "MAX_TOKENS": (
-                    "${MAX_TOKENS:-4096}",
-                    "Maximum number of tokens for generation",
+                "LLAMASTACK_PORT": (
+                    "5001",
+                    "Port for the Llama Stack distribution server",
                 ),
                 "INFERENCE_MODEL": (
-                    "${INFERENCE_MODEL:-Llama3.2-3B-Instruct}",
-                    "Name of the inference model to use",
+                    "meta-llama/Llama-3.2-3B-Instruct",
+                    "Inference model loaded into the vLLM server",
+                ),
+                "VLLM_URL": (
+                    "http://host.docker.internal:5100}/v1",
+                    "URL of the vLLM server with the main inference model",
+                ),
+                "MAX_TOKENS": (
+                    "4096",
+                    "Maximum number of tokens for generation",
+                ),
+                "SAFETY_VLLM_URL": (
+                    "http://host.docker.internal:5101/v1",
+                    "URL of the vLLM server with the safety model",
                 ),
                 "SAFETY_MODEL": (
-                    "${SAFETY_MODEL:-Llama-Guard-3-1B}",
+                    "meta-llama/Llama-Guard-3-1B",
                     "Name of the safety (Llama-Guard) model to use",
                 ),
-                "LLAMASTACK_PORT": (
-                    "${LLAMASTACK_PORT:-5001}",
-                    "Port for the Llama Stack distribution server",
-                ),
             },
         )
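Putting the template together with the README added earlier in this commit, a plausible way to exercise the two generated run configurations is sketched below; the environment variable values are illustrative and must match the `${env.*}` placeholders wired into `run.yaml` and `safety-run.yaml`:

```bash
# Build the distribution environment (command taken from the README above).
llama stack build --template remote_vllm --image-type conda

# Inference only.
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
VLLM_URL=http://localhost:5100/v1 \
llama stack run run.yaml

# Inference plus the Llama Guard safety shield.
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
VLLM_URL=http://localhost:5100/v1 \
SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
SAFETY_VLLM_URL=http://localhost:5101/v1 \
llama stack run safety-run.yaml
```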