From f38e76ee98cfce227fd91c8a98adcca4a619a98c Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Sat, 16 Nov 2024 10:56:38 -0800 Subject: [PATCH] Adding docker-compose.yaml, starting to simplify --- distributions/remote-vllm/compose.yaml | 64 ++-- .../remote-vllm/run-with-safety.yaml | 68 ++++ distributions/remote-vllm/run.yaml | 28 +- .../remote/inference/ollama/__init__.py | 31 +- .../remote/inference/ollama/config.py | 65 ++++ .../inference/ollama/docker_compose.yaml | 55 +++ .../remote/inference/tgi/docker_compose.yaml | 35 ++ .../providers/remote/inference/vllm/config.py | 45 +-- .../remote/inference/vllm/docker_compose.yaml | 26 ++ .../providers/utils/docker/__init__.py | 5 - .../providers/utils/docker/service_config.py | 29 -- llama_stack/providers/utils/kvstore/config.py | 4 +- .../templates/remote-vllm/doc_template.md | 95 +++++ llama_stack/templates/template.py | 352 ++++++------------ 14 files changed, 516 insertions(+), 386 deletions(-) create mode 100644 distributions/remote-vllm/run-with-safety.yaml create mode 100644 llama_stack/providers/remote/inference/ollama/config.py create mode 100644 llama_stack/providers/remote/inference/ollama/docker_compose.yaml create mode 100644 llama_stack/providers/remote/inference/tgi/docker_compose.yaml create mode 100644 llama_stack/providers/remote/inference/vllm/docker_compose.yaml delete mode 100644 llama_stack/providers/utils/docker/__init__.py delete mode 100644 llama_stack/providers/utils/docker/service_config.py create mode 100644 llama_stack/templates/remote-vllm/doc_template.md diff --git a/distributions/remote-vllm/compose.yaml b/distributions/remote-vllm/compose.yaml index 90d58a2af..a370df619 100644 --- a/distributions/remote-vllm/compose.yaml +++ b/distributions/remote-vllm/compose.yaml @@ -9,25 +9,30 @@ # Similarly change "host.docker.internal" to "localhost" in the run.yaml file # services: - vllm-0: + vllm-inference: image: vllm/vllm-openai:latest volumes: - $HOME/.cache/huggingface:/root/.cache/huggingface - # network_mode: "host" + network_mode: ${NETWORK_MODE:-bridged} ports: - - "5100:5100" + - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}" devices: - nvidia.com/gpu=all environment: - - CUDA_VISIBLE_DEVICES=0 + - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0} - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN command: > --gpu-memory-utilization 0.75 - --model meta-llama/Llama-3.1-8B-Instruct + --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct} --enforce-eager --max-model-len 8192 --max-num-seqs 16 - --port 5100 + --port ${VLLM_INFERENCE_PORT:-5100} + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"] + interval: 30s + timeout: 10s + retries: 5 deploy: resources: reservations: @@ -35,25 +40,34 @@ services: - driver: nvidia capabilities: [gpu] runtime: nvidia - vllm-1: + + # A little trick: + # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model + # otherwise, the entry will end in a hyphen which gets ignored by docker compose + vllm-${VLLM_SAFETY_MODEL:+safety}: image: vllm/vllm-openai:latest volumes: - $HOME/.cache/huggingface:/root/.cache/huggingface - # network_mode: "host" + network_mode: ${NETWORK_MODE:-bridged} ports: - - "5101:5101" + - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}" devices: - nvidia.com/gpu=all environment: - - CUDA_VISIBLE_DEVICES=1 + - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1} - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN command: > --gpu-memory-utilization 0.75 - --model meta-llama/Llama-Guard-3-1B + --model 
${VLLM_SAFETY_MODEL} --enforce-eager --max-model-len 8192 --max-num-seqs 16 - --port 5101 + --port ${VLLM_SAFETY_PORT:-5101} + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"] + interval: 30s + timeout: 10s + retries: 5 deploy: resources: reservations: @@ -63,23 +77,25 @@ services: runtime: nvidia llamastack: depends_on: - - vllm-0 - - vllm-1 - # image: llamastack/distribution-remote-vllm + - vllm-inference: + condition: service_healthy + - vllm-${VLLM_SAFETY_MODEL:+safety}: + condition: service_healthy + # image: llamastack/distribution-remote-vllm image: llamastack/distribution-remote-vllm:test-0.0.52rc3 volumes: - ~/.llama:/root/.llama - - ~/local/llama-stack/distributions/remote-vllm/run.yaml:/root/llamastack-run-remote-vllm.yaml - # network_mode: "host" + - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml + network_mode: ${NETWORK_MODE:-bridged} environment: - - LLAMA_INFERENCE_VLLM_URL=${LLAMA_INFERENCE_VLLM_URL:-http://host.docker.internal:5100/v1} - - LLAMA_INFERENCE_MODEL=${LLAMA_INFERENCE_MODEL:-Llama3.1-8B-Instruct} + - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1 + - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1 + - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct} - MAX_TOKENS=${MAX_TOKENS:-4096} - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm} - - LLAMA_SAFETY_VLLM_URL=${LLAMA_SAFETY_VLLM_URL:-http://host.docker.internal:5101/v1} - - LLAMA_SAFETY_MODEL=${LLAMA_SAFETY_MODEL:-Llama-Guard-3-1B} + - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B} ports: - - "5001:5001" + - "${LLAMASTACK_PORT:-5001}:${LLAMASTACK_PORT:-5001}" # Hack: wait for vLLM server to start before starting docker entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001" deploy: @@ -89,6 +105,6 @@ services: max_attempts: 5 window: 60s volumes: - vllm-0: - vllm-1: + vllm-inference: + vllm-safety: llamastack: diff --git a/distributions/remote-vllm/run-with-safety.yaml b/distributions/remote-vllm/run-with-safety.yaml new file mode 100644 index 000000000..35f87c0bb --- /dev/null +++ b/distributions/remote-vllm/run-with-safety.yaml @@ -0,0 +1,68 @@ +version: '2' +built_at: '2024-11-11T20:09:45.988375' +image_name: remote-vllm +docker_image: remote-vllm +conda_env: null +apis: +- inference +- memory +- safety +- agents +- telemetry +providers: + inference: + # serves main inference model + - provider_id: vllm-inference + provider_type: remote::vllm + config: + # NOTE: replace with "localhost" if you are running in "host" network mode + url: ${env.VLLM_URL} + max_tokens: ${env.MAX_TOKENS:4096} + api_token: fake + # serves safety llama_guard model + - provider_id: vllm-safety + provider_type: remote::vllm + config: + # NOTE: replace with "localhost" if you are running in "host" network mode + url: ${env.SAFETY_VLLM_URL} + max_tokens: ${env.MAX_TOKENS:4096} + api_token: fake + memory: + - provider_id: faiss-0 + provider_type: inline::faiss + config: + kvstore: + namespace: null + type: sqlite + db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db" + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + memory: + - provider_id: meta0 + provider_type: inline::faiss + config: {} + agents: + - provider_id: meta0 + provider_type: inline::meta-reference + config: + persistence_store: + namespace: 
null + type: sqlite + db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db" + telemetry: + - provider_id: meta0 + provider_type: inline::meta-reference + config: {} +metadata_store: + namespace: null + type: sqlite + db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db" +models: + - model_id: ${env.INFERENCE_MODEL} + provider_id: vllm-inference + - model_id: ${env.SAFETY_MODEL} + provider_id: vllm-safety +shields: + - shield_id: ${env.SAFETY_MODEL} diff --git a/distributions/remote-vllm/run.yaml b/distributions/remote-vllm/run.yaml index e6be2bd06..847dc1dd1 100644 --- a/distributions/remote-vllm/run.yaml +++ b/distributions/remote-vllm/run.yaml @@ -6,39 +6,25 @@ conda_env: null apis: - inference - memory -- safety - agents - telemetry providers: inference: # serves main inference model - - provider_id: vllm-0 + - provider_id: vllm-inference provider_type: remote::vllm config: - # NOTE: replace with "localhost" if you are running in "host" network mode - url: ${env.VLLM_URL:http://host.docker.internal:5100/v1} - max_tokens: ${env.MAX_TOKENS:4096} - api_token: fake - # serves safety llama_guard model - - provider_id: vllm-1 - provider_type: remote::vllm - config: - # NOTE: replace with "localhost" if you are running in "host" network mode - url: ${env.SAFETY_VLLM_URL:http://host.docker.internal:5101/v1} + url: ${env.VLLM_URL} max_tokens: ${env.MAX_TOKENS:4096} api_token: fake memory: - - provider_id: faiss-0 + - provider_id: faiss provider_type: inline::faiss config: kvstore: namespace: null type: sqlite db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db" - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: {} memory: - provider_id: meta0 provider_type: inline::faiss @@ -60,9 +46,5 @@ metadata_store: type: sqlite db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db" models: - - model_id: ${env.INFERENCE_MODEL:Llama3.1-8B-Instruct} - provider_id: vllm-0 - - model_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B} - provider_id: vllm-1 -shields: - - shield_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B} + - model_id: ${env.INFERENCE_MODEL} + provider_id: vllm-inference diff --git a/llama_stack/providers/remote/inference/ollama/__init__.py b/llama_stack/providers/remote/inference/ollama/__init__.py index adc4845d1..073c31cde 100644 --- a/llama_stack/providers/remote/inference/ollama/__init__.py +++ b/llama_stack/providers/remote/inference/ollama/__init__.py @@ -4,37 +4,10 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Optional - -from llama_stack.distribution.datatypes import RemoteProviderConfig -from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig +from .config import OllamaImplConfig -DEFAULT_OLLAMA_PORT = 11434 - - -class OllamaImplConfig(RemoteProviderConfig): - port: int = DEFAULT_OLLAMA_PORT - - @classmethod - def sample_docker_compose_config(cls) -> Optional[DockerComposeServiceConfig]: - return DockerComposeServiceConfig( - image="ollama/ollama:latest", - volumes=["$HOME/.ollama:/root/.ollama"], - devices=["nvidia.com/gpu=all"], - deploy={ - "resources": { - "reservations": { - "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}] - } - } - }, - runtime="nvidia", - ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"], - ) - - -async def get_adapter_impl(config: RemoteProviderConfig, _deps): +async def get_adapter_impl(config: OllamaImplConfig, _deps): from .ollama import OllamaInferenceAdapter impl = OllamaInferenceAdapter(config.url) diff --git a/llama_stack/providers/remote/inference/ollama/config.py b/llama_stack/providers/remote/inference/ollama/config.py new file mode 100644 index 000000000..5bf8b98e7 --- /dev/null +++ b/llama_stack/providers/remote/inference/ollama/config.py @@ -0,0 +1,65 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import List + +from llama_stack.distribution.datatypes import RemoteProviderConfig +from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig + + +DEFAULT_OLLAMA_PORT = 11434 + + +class OllamaImplConfig(RemoteProviderConfig): + port: int = DEFAULT_OLLAMA_PORT + + @classmethod + def sample_docker_compose_services(cls) -> List[DockerComposeServiceConfig]: + return [ + DockerComposeServiceConfig( + service_name="ollama", + image="ollama/ollama:latest", + volumes=["$HOME/.ollama:/root/.ollama"], + devices=["nvidia.com/gpu=all"], + deploy={ + "resources": { + "reservations": { + "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}] + } + } + }, + runtime="nvidia", + ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"], + healthcheck={ + "test": ["CMD", "curl", "-f", "http://ollama:11434"], + "interval": "10s", + "timeout": "5s", + "retries": 5, + }, + ), + DockerComposeServiceConfig( + service_name="ollama-init", + image="ollama/ollama", + depends_on={"ollama": {"condition": "service_healthy"}}, + environment={ + "OLLAMA_HOST": "ollama", + "OLLAMA_MODELS": "${OLLAMA_MODELS}", + }, + volumes=["ollama_data:/root/.ollama"], + entrypoint=( + 'sh -c \'max_attempts=30;attempt=0;echo "Waiting for Ollama server...";' + "until curl -s http://ollama:11434 > /dev/null; do" + "attempt=$((attempt + 1));" + "if [ $attempt -ge $max_attempts ]; then" + 'echo "Timeout waiting for Ollama server";' + "exit 1;" + "fi;" + 'echo "Attempt $attempt: Server not ready yet...";' + "sleep 5;" + "done'" + ), + ), + ] diff --git a/llama_stack/providers/remote/inference/ollama/docker_compose.yaml b/llama_stack/providers/remote/inference/ollama/docker_compose.yaml new file mode 100644 index 000000000..9bb7a143b --- /dev/null +++ b/llama_stack/providers/remote/inference/ollama/docker_compose.yaml @@ -0,0 +1,55 @@ +services: + ${SERVICE_NAME:-ollama}: + image: ollama/ollama:latest + ports: + - ${OLLAMA_PORT:-11434}:${OLLAMA_PORT:-11434} + volumes: + - $HOME/.ollama:/root/.ollama + devices: + - nvidia.com/gpu=all + runtime: 
nvidia + healthcheck: + test: ["CMD", "curl", "-f", "http://ollama:11434"] + interval: 10s + timeout: 5s + retries: 5 + + ${SERVICE_NAME:-ollama}-init: + image: ollama/ollama + depends_on: + - ${SERVICE_NAME:-ollama}: + condition: service_healthy + environment: + - OLLAMA_HOST=ollama + - OLLAMA_MODELS=${OLLAMA_MODELS} + volumes: + - $HOME/.ollama:/root/.ollama + entrypoint: > + sh -c ' + max_attempts=30; + attempt=0; + + echo "Waiting for Ollama server..."; + until curl -s http://ollama:11434 > /dev/null; do + attempt=$((attempt + 1)); + if [ $attempt -ge $max_attempts ]; then + echo "Timeout waiting for Ollama server"; + exit 1; + fi; + echo "Attempt $attempt: Server not ready yet..."; + sleep 5; + done; + + echo "Server ready. Pulling models..."; + + models="${OLLAMA_MODELS}"; + for model in $models; do + echo "Pulling $model..."; + if ! ollama pull "$model"; then + echo "Failed to pull $model"; + exit 1; + fi; + done; + + echo "All models pulled successfully" + ' diff --git a/llama_stack/providers/remote/inference/tgi/docker_compose.yaml b/llama_stack/providers/remote/inference/tgi/docker_compose.yaml new file mode 100644 index 000000000..06638c28c --- /dev/null +++ b/llama_stack/providers/remote/inference/tgi/docker_compose.yaml @@ -0,0 +1,35 @@ +services: + ${SERVICE_NAME:-tgi}: + image: ghcr.io/huggingface/text-generation-inference:2.3.1 + network_mode: "host" + volumes: + - $HOME/.cache/huggingface:/data + ports: + - ${TGI_PORT:-8000}:${TGI_PORT:-8000} + devices: + - nvidia.com/gpu=all + environment: + - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0} + - HF_HOME=/data + - HF_DATASETS_CACHE=/data + - HF_MODULES_CACHE=/data + - HF_HUB_CACHE=/data + command: > + --dtype bfloat16 + --usage-stats off + --sharded false + --model-id ${TGI_MODEL:-meta-llama/Llama-3.2-3B-Instruct} + --port ${TGI_PORT:-8000} + --cuda-memory-fraction ${TGI_CUDA_MEMORY_FRACTION:-0.8} + deploy: + resources: + reservations: + devices: + - driver: nvidia + capabilities: [gpu] + runtime: nvidia + healthcheck: + test: ["CMD", "curl", "-f", "http://${SERVICE_NAME:-tgi}:${TGI_PORT:-8000}/health"] + interval: 5s + timeout: 5s + retries: 30 diff --git a/llama_stack/providers/remote/inference/vllm/config.py b/llama_stack/providers/remote/inference/vllm/config.py index 6a3419cd2..e1d932c87 100644 --- a/llama_stack/providers/remote/inference/vllm/config.py +++ b/llama_stack/providers/remote/inference/vllm/config.py @@ -9,11 +9,6 @@ from typing import Optional from llama_models.schema_utils import json_schema_type from pydantic import BaseModel, Field -from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig - - -DEFAULT_VLLM_PORT = 8000 - @json_schema_type class VLLMInferenceAdapterConfig(BaseModel): @@ -33,48 +28,10 @@ class VLLMInferenceAdapterConfig(BaseModel): @classmethod def sample_run_config( cls, - url: str = "${env.VLLM_URL:http://host.docker.internal:5100/v1}", + url: str = "${env.VLLM_URL}", ): return { "url": url, "max_tokens": "${env.VLLM_MAX_TOKENS:4096}", "api_token": "${env.VLLM_API_TOKEN:fake}", } - - @classmethod - def sample_docker_compose_config( - cls, - port: int = DEFAULT_VLLM_PORT, - cuda_visible_devices: str = "0", - model: str = "meta-llama/Llama-3.2-3B-Instruct", - ) -> Optional[DockerComposeServiceConfig]: - return DockerComposeServiceConfig( - image="vllm/vllm-openai:latest", - volumes=["$HOME/.cache/huggingface:/root/.cache/huggingface"], - devices=["nvidia.com/gpu=all"], - deploy={ - "resources": { - "reservations": { - "devices": [{"driver": "nvidia", 
"capabilities": ["gpu"]}] - } - } - }, - runtime="nvidia", - ports=[f"{port}:{port}"], - environment={ - "CUDA_VISIBLE_DEVICES": cuda_visible_devices, - "HUGGING_FACE_HUB_TOKEN": "$HF_TOKEN", - }, - command=( - " ".join( - [ - "--gpu-memory-utilization 0.75", - f"--model {model}", - "--enforce-eager", - "--max-model-len 8192", - "--max-num-seqs 16", - f"--port {port}", - ] - ) - ), - ) diff --git a/llama_stack/providers/remote/inference/vllm/docker_compose.yaml b/llama_stack/providers/remote/inference/vllm/docker_compose.yaml new file mode 100644 index 000000000..227842272 --- /dev/null +++ b/llama_stack/providers/remote/inference/vllm/docker_compose.yaml @@ -0,0 +1,26 @@ +services: + ${SERVICE_NAME:-vllm}: + image: vllm/vllm-openai:latest + ports: + - ${VLLM_PORT:-5100}:${VLLM_PORT:-5100} + volumes: + - $HOME/.cache/huggingface:/root/.cache/huggingface + devices: + - nvidia.com/gpu=all + deploy: + resources: + reservations: + devices: + - driver: nvidia + capabilities: [gpu] + runtime: nvidia + environment: + - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0} + - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN} + command: > + --gpu-memory-utilization 0.75 + --model ${VLLM_MODEL:-meta-llama/Llama-3.2-3B-Instruct} + --enforce-eager + --max-model-len 8192 + --max-num-seqs 16 + --port ${VLLM_PORT:-5100} diff --git a/llama_stack/providers/utils/docker/__init__.py b/llama_stack/providers/utils/docker/__init__.py deleted file mode 100644 index 756f351d8..000000000 --- a/llama_stack/providers/utils/docker/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. diff --git a/llama_stack/providers/utils/docker/service_config.py b/llama_stack/providers/utils/docker/service_config.py deleted file mode 100644 index b1f88eb5f..000000000 --- a/llama_stack/providers/utils/docker/service_config.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from typing import Any, Dict, List, Optional - -from pydantic import BaseModel - - -class LiteralString(str): - pass # Marker class for strings we want to format with > - - -class DockerComposeServiceConfig(BaseModel): - """Configuration for a single service in docker-compose.""" - - image: str - volumes: Optional[List[str]] = None - network_mode: str = "bridge" - ports: Optional[List[str]] = None - devices: Optional[List[str]] = None - environment: Optional[Dict[str, str]] = None - command: Optional[str] = None - depends_on: Optional[List[str]] = None - deploy: Optional[Dict[str, Any]] = None - runtime: Optional[str] = None - entrypoint: Optional[str] = None diff --git a/llama_stack/providers/utils/kvstore/config.py b/llama_stack/providers/utils/kvstore/config.py index de9f6d79b..63602ff7c 100644 --- a/llama_stack/providers/utils/kvstore/config.py +++ b/llama_stack/providers/utils/kvstore/config.py @@ -54,11 +54,11 @@ class SqliteKVStoreConfig(CommonConfig): ) @classmethod - def sample_run_config(cls, db_name: str = "kvstore.db"): + def sample_run_config(cls, dir: str = "runtime", db_name: str = "kvstore.db"): return { "type": "sqlite", "namespace": None, - "db_path": "${env.SQLITE_STORE_DIR:~/.llama/runtime/" + db_name + "}", + "db_path": "${env.SQLITE_STORE_DIR:~/.llama/" + f"{dir}/{db_name}" + "}", } diff --git a/llama_stack/templates/remote-vllm/doc_template.md b/llama_stack/templates/remote-vllm/doc_template.md new file mode 100644 index 000000000..b124ba5ea --- /dev/null +++ b/llama_stack/templates/remote-vllm/doc_template.md @@ -0,0 +1,95 @@ +# Remote vLLM Distribution + +The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations: + +{{ providers_table }} + +You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference. + +{%- if docker_compose_env_vars %} +### Environment Variables + +The following environment variables can be configured: + +{% for var, (default_value, description) in docker_compose_env_vars.items() %} +- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) +{% endfor %} +{% endif %} + +{% if default_models %} +### Models + +The following models are configured by default: +{% for model in default_models %} +- `{{ model.model_id }}` +{% endfor %} +{% endif %} + +## Using Docker Compose + +You can use `docker compose` to start a vLLM container and Llama Stack server container together. +```bash +$ cd distributions/{{ name }}; docker compose up +``` + +You will see outputs similar to following --- +``` + +``` + +To kill the server +```bash +docker compose down +``` + +## Starting vLLM and Llama Stack separately + +You can also decide to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack. + +#### Start vLLM server. + +```bash +docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 8000:8000 \ + --ipc=host \ + vllm/vllm-openai:latest \ + --model meta-llama/Llama-3.2-3B-Instruct +``` + +Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) for more details. + + +#### Start Llama Stack server pointing to your vLLM server + + +We have provided a template `run.yaml` file in the `distributions/remote-vllm` directory. Please make sure to modify the `inference.provider_id` to point to your vLLM server endpoint. 
As an example, if your vLLM server is running on `http://127.0.0.1:8000`, your `run.yaml` file should look like the following: +```yaml +inference: + - provider_id: vllm0 + provider_type: remote::vllm + config: + url: http://127.0.0.1:8000 +``` + +**Via Conda** + +If you are using Conda, you can build and run the Llama Stack server with the following commands: +```bash +cd distributions/remote-vllm +llama stack build --template remote_vllm --image-type conda +llama stack run run.yaml +``` + +**Via Docker** + +You can use the Llama Stack Docker image to start the server with the following command: +```bash +docker run --network host -it -p 5000:5000 \ + -v ~/.llama:/root/.llama \ + -v ./gpu/run.yaml:/root/llamastack-run-remote-vllm.yaml \ + --gpus=all \ + llamastack/distribution-remote-vllm \ + --yaml_config /root/llamastack-run-remote-vllm.yaml +``` diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py index c8ca05c6b..77f538175 100644 --- a/llama_stack/templates/template.py +++ b/llama_stack/templates/template.py @@ -9,7 +9,7 @@ from datetime import datetime from io import StringIO from pathlib import Path -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import Dict, List, Optional, Set, Tuple import jinja2 import yaml @@ -22,7 +22,6 @@ from llama_stack.distribution.datatypes import ( Api, BuildConfig, DistributionSpec, - KVStoreConfig, ModelInput, Provider, ShieldInput, @@ -33,53 +32,26 @@ from llama_stack.distribution.utils.dynamic import instantiate_class_type from llama_stack.providers.remote.inference.vllm.config import ( VLLMInferenceAdapterConfig, ) -from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig +from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig -class DistributionTemplate(BaseModel): - """ - Represents a Llama Stack distribution instance that can generate configuration - and documentation files. 
- """ - - name: str - description: str - providers: Dict[str, List[str]] - run_config_overrides: Dict[str, List[Provider]] = Field(default_factory=dict) - compose_config_overrides: Dict[str, Dict[str, DockerComposeServiceConfig]] = Field( - default_factory=dict - ) - +class RunConfigSettings(BaseModel): + provider_overrides: Dict[str, List[Provider]] = Field(default_factory=dict) default_models: List[ModelInput] default_shields: Optional[List[ShieldInput]] = None - # Optional configuration - metadata_store: Optional[KVStoreConfig] = None - docker_compose_env_vars: Optional[Dict[str, Tuple[str, str]]] = None - docker_image: Optional[str] = None - - @property - def distribution_spec(self) -> DistributionSpec: - return DistributionSpec( - description=self.description, - docker_image=self.docker_image, - providers=self.providers, - ) - - def build_config(self) -> BuildConfig: - return BuildConfig( - name=self.name, - distribution_spec=self.distribution_spec, - image_type="conda", # default to conda, can be overridden - ) - - def run_config(self) -> StackRunConfig: + def run_config( + self, + name: str, + providers: Dict[str, List[str]], + docker_image: Optional[str] = None, + ) -> StackRunConfig: provider_registry = get_provider_registry() provider_configs = {} - for api_str, provider_types in self.providers.items(): - if providers := self.run_config_overrides.get(api_str): - provider_configs[api_str] = providers + for api_str, provider_types in providers.items(): + if api_providers := self.provider_overrides.get(api_str): + provider_configs[api_str] = api_providers continue provider_type = provider_types[0] @@ -111,83 +83,53 @@ class DistributionTemplate(BaseModel): ] # Get unique set of APIs from providers - apis: Set[str] = set(self.providers.keys()) + apis: Set[str] = set(providers.keys()) return StackRunConfig( - image_name=self.name, - docker_image=self.docker_image, + image_name=name, + docker_image=docker_image, built_at=datetime.now(), apis=list(apis), providers=provider_configs, - metadata_store=self.metadata_store, + metadata_store=SqliteKVStoreConfig.sample_run_config( + dir=f"distributions/{name}", + db_name="registry.db", + ), models=self.default_models, shields=self.default_shields or [], ) - def docker_compose_config(self) -> Dict[str, Any]: - services = {} - provider_registry = get_provider_registry() - # Add provider services based on their sample_compose_config - for api_str, api_providers in self.providers.items(): - if overrides := self.compose_config_overrides.get(api_str): - services |= overrides - continue +class DistributionTemplate(BaseModel): + """ + Represents a Llama Stack distribution instance that can generate configuration + and documentation files. 
+ """ - # only look at the first provider to get the compose config for now - # we may want to use `docker compose profiles` in the future - provider_type = api_providers[0] - provider_id = provider_type.split("::")[-1] - api = Api(api_str) - if provider_type not in provider_registry[api]: - raise ValueError( - f"Unknown provider type: {provider_type} for API: {api_str}" - ) + name: str + description: str - config_class = provider_registry[api][provider_type].config_class - assert ( - config_class is not None - ), f"No config class for provider type: {provider_type} for API: {api_str}" + providers: Dict[str, List[str]] + run_configs: Dict[str, RunConfigSettings] + template_path: Path - config_class = instantiate_class_type(config_class) - if not hasattr(config_class, "sample_docker_compose_config"): - continue + # Optional configuration + docker_compose_env_vars: Optional[Dict[str, Tuple[str, str]]] = None + docker_image: Optional[str] = None - compose_config = config_class.sample_docker_compose_config() - services[provider_id] = compose_config + default_models: Optional[List[ModelInput]] = None - port = "${LLAMASTACK_PORT:-5001}" - # Add main llamastack service - llamastack_config = DockerComposeServiceConfig( - image=f"llamastack/distribution-{self.name}:latest", - depends_on=list(services.keys()), - volumes=[ - "~/.llama:/root/.llama", - f"~/local/llama-stack/distributions/{self.name}/run.yaml:/root/llamastack-run-{self.name}.yaml", - ], - ports=[f"{port}:{port}"], - environment={ - k: v[0] for k, v in (self.docker_compose_env_vars or {}).items() - }, - entrypoint=( - f'bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-{self.name}.yaml --port {port}"' + def build_config(self) -> BuildConfig: + return BuildConfig( + name=self.name, + distribution_spec=DistributionSpec( + description=self.description, + docker_image=self.docker_image, + providers=self.providers, ), - deploy={ - "restart_policy": { - "condition": "on-failure", - "delay": "3s", - "max_attempts": 5, - "window": "60s", - } - }, + image_type="conda", # default to conda, can be overridden ) - services["llamastack"] = llamastack_config - return { - "services": {k: v.model_dump() for k, v in services.items()}, - "volumes": {service_name: None for service_name in services.keys()}, - } - def generate_markdown_docs(self) -> str: """Generate markdown documentation using both Jinja2 templates and rich tables.""" # First generate the providers table using rich @@ -204,53 +146,7 @@ class DistributionTemplate(BaseModel): console.print(table) providers_table = output.getvalue() - # Main documentation template - template = """# {{ name }} Distribution - -{{ description }} - -## Provider Configuration - -The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations: - -{{ providers_table }} - -{%- if env_vars %} -## Environment Variables - -The following environment variables can be configured: - -{% for var, (value, description) in docker_compose_env_vars.items() %} -- `{{ var }}`: {{ description }} -{% endfor %} -{%- endif %} - -## Example Usage - -### Using Docker Compose - -```bash -$ cd distributions/{{ name }} -$ docker compose up -``` - -## Models - -The following models are configured by default: -{% for model in default_models %} -- `{{ model.model_id }}` -{% endfor %} - -{%- if default_shields %} - -## Safety Shields - -The following safety shields are configured: -{% for shield in default_shields %} -- `{{ shield.shield_id }}` -{%- 
endfor %} -{%- endif %} -""" + template = self.template_path.read_text() # Render template with rich-generated table env = jinja2.Environment(trim_blocks=True, lstrip_blocks=True) template = env.from_string(template) @@ -261,7 +157,6 @@ The following safety shields are configured: providers_table=providers_table, docker_compose_env_vars=self.docker_compose_env_vars, default_models=self.default_models, - default_shields=self.default_shields, ) def save_distribution(self, output_dir: Path) -> None: @@ -271,19 +166,14 @@ The following safety shields are configured: with open(output_dir / "build.yaml", "w") as f: yaml.safe_dump(build_config.model_dump(), f, sort_keys=False) - run_config = self.run_config() - serialized = run_config.model_dump() - with open(output_dir / "run.yaml", "w") as f: - yaml.safe_dump(serialized, f, sort_keys=False) - - # serialized_str = yaml.dump(serialized, sort_keys=False) - # env_vars = set() - # for match in re.finditer(r"\${env\.([A-Za-z0-9_-]+)}", serialized_str): - # env_vars.add(match.group(1)) - - docker_compose = self.docker_compose_config() - with open(output_dir / "compose.yaml", "w") as f: - yaml.safe_dump(docker_compose, f, sort_keys=False, default_flow_style=False) + for yaml_pth, settings in self.run_configs.items(): + print(f"Generating {yaml_pth}") + print(f"Providers: {self.providers}") + run_config = settings.run_config( + self.name, self.providers, self.docker_image + ) + with open(output_dir / yaml_pth, "w") as f: + yaml.safe_dump(run_config.model_dump(), f, sort_keys=False) docs = self.generate_markdown_docs() with open(output_dir / f"{self.name}.md", "w") as f: @@ -291,87 +181,89 @@ The following safety shields are configured: @classmethod def vllm_distribution(cls) -> "DistributionTemplate": + providers = { + "inference": ["remote::vllm"], + "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + } + + inference_provider = Provider( + provider_id="vllm-inference", + provider_type="remote::vllm", + config=VLLMInferenceAdapterConfig.sample_run_config( + url="${env.VLLM_URL}", + ), + ) + + inference_model = ModelInput( + model_id="${env.INFERENCE_MODEL}", + provider_id="vllm-inference", + ) + safety_model = ModelInput( + model_id="${env.SAFETY_MODEL}", + provider_id="vllm-safety", + ) + return cls( name="remote-vllm", description="Use (an external) vLLM server for running LLM inference", - providers={ - "inference": ["remote::vllm"], - "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - }, - run_config_overrides={ - "inference": [ - Provider( - provider_id="vllm-0", - provider_type="remote::vllm", - config=VLLMInferenceAdapterConfig.sample_run_config( - url="${env.VLLM_URL:http://host.docker.internal:5100/v1}", - ), - ), - Provider( - provider_id="vllm-1", - provider_type="remote::vllm", - config=VLLMInferenceAdapterConfig.sample_run_config( - url="${env.SAFETY_VLLM_URL:http://host.docker.internal:5101/v1}", - ), - ), - ] - }, - compose_config_overrides={ - "inference": { - "vllm-0": VLLMInferenceAdapterConfig.sample_docker_compose_config( - port=5100, - cuda_visible_devices="0", - model="${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}", - ), - "vllm-1": VLLMInferenceAdapterConfig.sample_docker_compose_config( - port=5100, - cuda_visible_devices="1", - 
model="${env.SAFETY_MODEL:Llama-Guard-3-1B}", - ), - } - }, - default_models=[ - ModelInput( - model_id="${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}", - provider_id="vllm-0", + template_path=Path(__file__).parent / "remote-vllm" / "doc_template.md", + providers=providers, + default_models=[inference_model, safety_model], + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider], + }, + default_models=[inference_model], ), - ModelInput( - model_id="${env.SAFETY_MODEL:Llama-Guard-3-1B}", - provider_id="vllm-1", + "safety-run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [ + inference_provider, + Provider( + provider_id="vllm-safety", + provider_type="remote::vllm", + config=VLLMInferenceAdapterConfig.sample_run_config( + url="${env.SAFETY_VLLM_URL}", + ), + ), + ], + }, + default_models=[ + inference_model, + safety_model, + ], + default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")], ), - ], - default_shields=[ - ShieldInput(shield_id="${env.SAFETY_MODEL:Llama-Guard-3-1B}") - ], + }, docker_compose_env_vars={ - # these defaults are for the Docker Compose configuration - "VLLM_URL": ( - "http://host.docker.internal:${VLLM_PORT:-5100}/v1", - "URL of the vLLM server with the main inference model", - ), - "SAFETY_VLLM_URL": ( - "http://host.docker.internal:${SAFETY_VLLM_PORT:-5101}/v1", - "URL of the vLLM server with the safety model", - ), - "MAX_TOKENS": ( - "${MAX_TOKENS:-4096}", - "Maximum number of tokens for generation", + "LLAMASTACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", ), "INFERENCE_MODEL": ( - "${INFERENCE_MODEL:-Llama3.2-3B-Instruct}", - "Name of the inference model to use", + "meta-llama/Llama-3.2-3B-Instruct", + "Inference model loaded into the vLLM server", + ), + "VLLM_URL": ( + "http://host.docker.internal:5100}/v1", + "URL of the vLLM server with the main inference model", + ), + "MAX_TOKENS": ( + "4096", + "Maximum number of tokens for generation", + ), + "SAFETY_VLLM_URL": ( + "http://host.docker.internal:5101/v1", + "URL of the vLLM server with the safety model", ), "SAFETY_MODEL": ( - "${SAFETY_MODEL:-Llama-Guard-3-1B}", + "meta-llama/Llama-Guard-3-1B", "Name of the safety (Llama-Guard) model to use", ), - "LLAMASTACK_PORT": ( - "${LLAMASTACK_PORT:-5001}", - "Port for the Llama Stack distribution server", - ), }, )