Adding docker-compose.yaml, starting to simplify

Ashwin Bharambe 2024-11-16 10:56:38 -08:00
parent e4509cb568
commit f38e76ee98
14 changed files with 516 additions and 386 deletions

View file

@@ -9,25 +9,30 @@
 # Similarly change "host.docker.internal" to "localhost" in the run.yaml file
 #
 services:
-  vllm-0:
+  vllm-inference:
     image: vllm/vllm-openai:latest
     volumes:
       - $HOME/.cache/huggingface:/root/.cache/huggingface
-    # network_mode: "host"
+    network_mode: ${NETWORK_MODE:-bridged}
     ports:
-      - "5100:5100"
+      - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=0
+      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
       - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
     command: >
       --gpu-memory-utilization 0.75
-      --model meta-llama/Llama-3.1-8B-Instruct
+      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
       --enforce-eager
       --max-model-len 8192
       --max-num-seqs 16
-      --port 5100
+      --port ${VLLM_INFERENCE_PORT:-5100}
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
     deploy:
       resources:
         reservations:
@@ -35,25 +40,34 @@ services:
           - driver: nvidia
             capabilities: [gpu]
     runtime: nvidia
-  vllm-1:
+  # A little trick:
+  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
+  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
+  vllm-${VLLM_SAFETY_MODEL:+safety}:
     image: vllm/vllm-openai:latest
     volumes:
       - $HOME/.cache/huggingface:/root/.cache/huggingface
-    # network_mode: "host"
+    network_mode: ${NETWORK_MODE:-bridged}
     ports:
-      - "5101:5101"
+      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=1
+      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
       - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
     command: >
       --gpu-memory-utilization 0.75
-      --model meta-llama/Llama-Guard-3-1B
+      --model ${VLLM_SAFETY_MODEL}
      --enforce-eager
       --max-model-len 8192
       --max-num-seqs 16
-      --port 5101
+      --port ${VLLM_SAFETY_PORT:-5101}
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
     deploy:
       resources:
         reservations:
@@ -63,23 +77,25 @@ services:
     runtime: nvidia
   llamastack:
     depends_on:
-      - vllm-0
-      - vllm-1
+      - vllm-inference:
+          condition: service_healthy
+      - vllm-${VLLM_SAFETY_MODEL:+safety}:
+          condition: service_healthy
     # image: llamastack/distribution-remote-vllm
     image: llamastack/distribution-remote-vllm:test-0.0.52rc3
     volumes:
       - ~/.llama:/root/.llama
-      - ~/local/llama-stack/distributions/remote-vllm/run.yaml:/root/llamastack-run-remote-vllm.yaml
-    # network_mode: "host"
+      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
+    network_mode: ${NETWORK_MODE:-bridged}
     environment:
-      - LLAMA_INFERENCE_VLLM_URL=${LLAMA_INFERENCE_VLLM_URL:-http://host.docker.internal:5100/v1}
-      - LLAMA_INFERENCE_MODEL=${LLAMA_INFERENCE_MODEL:-Llama3.1-8B-Instruct}
+      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
+      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
       - MAX_TOKENS=${MAX_TOKENS:-4096}
       - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
-      - LLAMA_SAFETY_VLLM_URL=${LLAMA_SAFETY_VLLM_URL:-http://host.docker.internal:5101/v1}
-      - LLAMA_SAFETY_MODEL=${LLAMA_SAFETY_MODEL:-Llama-Guard-3-1B}
+      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
     ports:
-      - "5001:5001"
+      - "${LLAMASTACK_PORT:-5001}:${LLAMASTACK_PORT:-5001}"
     # Hack: wait for vLLM server to start before starting docker
     entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
     deploy:
@@ -89,6 +105,6 @@ services:
           max_attempts: 5
           window: 60s
 volumes:
-  vllm-0:
-  vllm-1:
+  vllm-inference:
+  vllm-safety:
   llamastack:
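Taken together, the parameterization above lets the same compose file serve an inference-only stack or an inference-plus-safety stack. A rough usage sketch, assuming Docker Compose v2 is run from the directory containing this file and that `HF_TOKEN` is exported:

```bash
# Inference-only stack: VLLM_SAFETY_MODEL is unset, so the safety service key
# expands to "vllm-" and (per the comment in the file) is ignored by docker compose.
export HF_TOKEN="<your Hugging Face token>"
docker compose up -d

# Inference + safety: setting VLLM_SAFETY_MODEL materializes the vllm-safety service
# and switches the mounted config to run-with-safety.yaml. Model tag is illustrative.
VLLM_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B docker compose up -d
```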

View file

@@ -0,0 +1,68 @@
version: '2'
built_at: '2024-11-11T20:09:45.988375'
image_name: remote-vllm
docker_image: remote-vllm
conda_env: null
apis:
- inference
- memory
- safety
- agents
- telemetry
providers:
inference:
# serves main inference model
- provider_id: vllm-inference
provider_type: remote::vllm
config:
# NOTE: replace with "localhost" if you are running in "host" network mode
url: ${env.VLLM_URL}
max_tokens: ${env.MAX_TOKENS:4096}
api_token: fake
# serves safety llama_guard model
- provider_id: vllm-safety
provider_type: remote::vllm
config:
# NOTE: replace with "localhost" if you are running in "host" network mode
url: ${env.SAFETY_VLLM_URL}
max_tokens: ${env.MAX_TOKENS:4096}
api_token: fake
memory:
- provider_id: faiss-0
provider_type: inline::faiss
config:
kvstore:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
memory:
- provider_id: meta0
provider_type: inline::faiss
config: {}
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db"
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
models:
- model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
- model_id: ${env.SAFETY_MODEL}
provider_id: vllm-safety
shields:
- shield_id: ${env.SAFETY_MODEL}
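The `${env.VAR}` and `${env.VAR:default}` placeholders in this run configuration are resolved from the server's environment, so running it outside of compose looks roughly like the sketch below (URLs, paths, and model names are illustrative):

```bash
# Illustrative only: export the variables referenced above, then point the server
# at the config file. The URLs must match wherever the two vLLM servers actually run.
export VLLM_URL=http://localhost:5100/v1
export SAFETY_VLLM_URL=http://localhost:5101/v1
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

python -m llama_stack.distribution.server.server \
    --yaml_config run-with-safety.yaml \
    --port 5001
```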

View file

@@ -6,39 +6,25 @@ conda_env: null
 apis:
 - inference
 - memory
-- safety
 - agents
 - telemetry
 providers:
   inference:
   # serves main inference model
-  - provider_id: vllm-0
+  - provider_id: vllm-inference
     provider_type: remote::vllm
     config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
-      url: ${env.VLLM_URL:http://host.docker.internal:5100/v1}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
-  # serves safety llama_guard model
-  - provider_id: vllm-1
-    provider_type: remote::vllm
-    config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
-      url: ${env.SAFETY_VLLM_URL:http://host.docker.internal:5101/v1}
+      url: ${env.VLLM_URL}
       max_tokens: ${env.MAX_TOKENS:4096}
       api_token: fake
   memory:
-  - provider_id: faiss-0
+  - provider_id: faiss
     provider_type: inline::faiss
     config:
       kvstore:
         namespace: null
         type: sqlite
         db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config: {}
   memory:
   - provider_id: meta0
     provider_type: inline::faiss
@@ -60,9 +46,5 @@ metadata_store:
   type: sqlite
   db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
 models:
-- model_id: ${env.INFERENCE_MODEL:Llama3.1-8B-Instruct}
-  provider_id: vllm-0
-- model_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
-  provider_id: vllm-1
-shields:
-- shield_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
+- model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm-inference

View file

@@ -4,37 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Optional
-
-from llama_stack.distribution.datatypes import RemoteProviderConfig
-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
-
-DEFAULT_OLLAMA_PORT = 11434
-
-
-class OllamaImplConfig(RemoteProviderConfig):
-    port: int = DEFAULT_OLLAMA_PORT
-
-    @classmethod
-    def sample_docker_compose_config(cls) -> Optional[DockerComposeServiceConfig]:
-        return DockerComposeServiceConfig(
-            image="ollama/ollama:latest",
-            volumes=["$HOME/.ollama:/root/.ollama"],
-            devices=["nvidia.com/gpu=all"],
-            deploy={
-                "resources": {
-                    "reservations": {
-                        "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
-                    }
-                }
-            },
-            runtime="nvidia",
-            ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"],
-        )
-
-
-async def get_adapter_impl(config: RemoteProviderConfig, _deps):
+from .config import OllamaImplConfig
+
+
+async def get_adapter_impl(config: OllamaImplConfig, _deps):
     from .ollama import OllamaInferenceAdapter

     impl = OllamaInferenceAdapter(config.url)

View file

@@ -0,0 +1,65 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List
from llama_stack.distribution.datatypes import RemoteProviderConfig
from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
DEFAULT_OLLAMA_PORT = 11434
class OllamaImplConfig(RemoteProviderConfig):
port: int = DEFAULT_OLLAMA_PORT
@classmethod
def sample_docker_compose_services(cls) -> List[DockerComposeServiceConfig]:
return [
DockerComposeServiceConfig(
service_name="ollama",
image="ollama/ollama:latest",
volumes=["$HOME/.ollama:/root/.ollama"],
devices=["nvidia.com/gpu=all"],
deploy={
"resources": {
"reservations": {
"devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
}
}
},
runtime="nvidia",
ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"],
healthcheck={
"test": ["CMD", "curl", "-f", "http://ollama:11434"],
"interval": "10s",
"timeout": "5s",
"retries": 5,
},
),
DockerComposeServiceConfig(
service_name="ollama-init",
image="ollama/ollama",
depends_on={"ollama": {"condition": "service_healthy"}},
environment={
"OLLAMA_HOST": "ollama",
"OLLAMA_MODELS": "${OLLAMA_MODELS}",
},
volumes=["ollama_data:/root/.ollama"],
entrypoint=(
'sh -c \'max_attempts=30;attempt=0;echo "Waiting for Ollama server...";'
"until curl -s http://ollama:11434 > /dev/null; do"
"attempt=$((attempt + 1));"
"if [ $attempt -ge $max_attempts ]; then"
'echo "Timeout waiting for Ollama server";'
"exit 1;"
"fi;"
'echo "Attempt $attempt: Server not ready yet...";'
"sleep 5;"
"done'"
),
),
]

View file

@@ -0,0 +1,55 @@
services:
${SERVICE_NAME:-ollama}:
image: ollama/ollama:latest
ports:
- ${OLLAMA_PORT:-11434}:${OLLAMA_PORT:-11434}
volumes:
- $HOME/.ollama:/root/.ollama
devices:
- nvidia.com/gpu=all
runtime: nvidia
healthcheck:
test: ["CMD", "curl", "-f", "http://ollama:11434"]
interval: 10s
timeout: 5s
retries: 5
${SERVICE_NAME:-ollama}-init:
image: ollama/ollama
depends_on:
- ${SERVICE_NAME:-ollama}:
condition: service_healthy
environment:
- OLLAMA_HOST=ollama
- OLLAMA_MODELS=${OLLAMA_MODELS}
volumes:
- $HOME/.ollama:/root/.ollama
entrypoint: >
sh -c '
max_attempts=30;
attempt=0;
echo "Waiting for Ollama server...";
until curl -s http://ollama:11434 > /dev/null; do
attempt=$((attempt + 1));
if [ $attempt -ge $max_attempts ]; then
echo "Timeout waiting for Ollama server";
exit 1;
fi;
echo "Attempt $attempt: Server not ready yet...";
sleep 5;
done;
echo "Server ready. Pulling models...";
models="${OLLAMA_MODELS}";
for model in $models; do
echo "Pulling $model...";
if ! ollama pull "$model"; then
echo "Failed to pull $model";
exit 1;
fi;
done;
echo "All models pulled successfully"
'
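The init container above treats `OLLAMA_MODELS` as a space-separated list and pulls each entry once the Ollama server's healthcheck passes. A hypothetical invocation (model tags are illustrative):

```bash
# Sketch: start the Ollama server plus the one-shot init container that pulls models.
# OLLAMA_MODELS is read as a space-separated list by the entrypoint loop above.
export OLLAMA_MODELS="llama3.2:3b-instruct-fp16 llama-guard3:1b"
docker compose up -d

# The init container exits after all models are pulled; follow its progress with:
docker compose logs -f ollama-init
```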

View file

@@ -0,0 +1,35 @@
services:
${SERVICE_NAME:-tgi}:
image: ghcr.io/huggingface/text-generation-inference:2.3.1
network_mode: "host"
volumes:
- $HOME/.cache/huggingface:/data
ports:
- ${TGI_PORT:-8000}:${TGI_PORT:-8000}
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
- HF_HOME=/data
- HF_DATASETS_CACHE=/data
- HF_MODULES_CACHE=/data
- HF_HUB_CACHE=/data
command: >
--dtype bfloat16
--usage-stats off
--sharded false
--model-id ${TGI_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
--port ${TGI_PORT:-8000}
--cuda-memory-fraction ${TGI_CUDA_MEMORY_FRACTION:-0.8}
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
healthcheck:
test: ["CMD", "curl", "-f", "http://${SERVICE_NAME:-tgi}:${TGI_PORT:-8000}/health"]
interval: 5s
timeout: 5s
retries: 30
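A rough way to exercise this TGI service and the same `/health` endpoint its healthcheck polls (values are illustrative; the container runs with host networking, so localhost works):

```bash
# Sketch: launch TGI with an explicit model and port, then probe its health endpoint.
export TGI_MODEL=meta-llama/Llama-3.2-3B-Instruct
export TGI_PORT=8000
docker compose up -d

curl -f "http://localhost:${TGI_PORT}/health"
```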

View file

@@ -9,11 +9,6 @@ from typing import Optional
 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field

-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
-
-DEFAULT_VLLM_PORT = 8000
-

 @json_schema_type
 class VLLMInferenceAdapterConfig(BaseModel):
@@ -33,48 +28,10 @@ class VLLMInferenceAdapterConfig(BaseModel):
     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.VLLM_URL:http://host.docker.internal:5100/v1}",
+        url: str = "${env.VLLM_URL}",
     ):
         return {
             "url": url,
             "max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
             "api_token": "${env.VLLM_API_TOKEN:fake}",
         }
-
-    @classmethod
-    def sample_docker_compose_config(
-        cls,
-        port: int = DEFAULT_VLLM_PORT,
-        cuda_visible_devices: str = "0",
-        model: str = "meta-llama/Llama-3.2-3B-Instruct",
-    ) -> Optional[DockerComposeServiceConfig]:
-        return DockerComposeServiceConfig(
-            image="vllm/vllm-openai:latest",
-            volumes=["$HOME/.cache/huggingface:/root/.cache/huggingface"],
-            devices=["nvidia.com/gpu=all"],
-            deploy={
-                "resources": {
-                    "reservations": {
-                        "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
-                    }
-                }
-            },
-            runtime="nvidia",
-            ports=[f"{port}:{port}"],
-            environment={
-                "CUDA_VISIBLE_DEVICES": cuda_visible_devices,
-                "HUGGING_FACE_HUB_TOKEN": "$HF_TOKEN",
-            },
-            command=(
-                " ".join(
-                    [
-                        "--gpu-memory-utilization 0.75",
-                        f"--model {model}",
-                        "--enforce-eager",
-                        "--max-model-len 8192",
-                        "--max-num-seqs 16",
-                        f"--port {port}",
-                    ]
-                )
-            ),
-        )

View file

@@ -0,0 +1,26 @@
services:
${SERVICE_NAME:-vllm}:
image: vllm/vllm-openai:latest
ports:
- ${VLLM_PORT:-5100}:${VLLM_PORT:-5100}
volumes:
- $HOME/.cache/huggingface:/root/.cache/huggingface
devices:
- nvidia.com/gpu=all
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
environment:
- CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
- HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
command: >
--gpu-memory-utilization 0.75
--model ${VLLM_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
--enforce-eager
--max-model-len 8192
--max-num-seqs 16
--port ${VLLM_PORT:-5100}
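A rough smoke test for this single vLLM service, assuming Docker Compose v2 and a valid `HF_TOKEN`; the final probe relies on vLLM's OpenAI-compatible API (values are illustrative):

```bash
# Sketch: bring up one vLLM container from this file and confirm the endpoint answers.
export HF_TOKEN="<your Hugging Face token>"
export VLLM_MODEL=meta-llama/Llama-3.2-3B-Instruct
export VLLM_PORT=5100
docker compose up -d

# List the served model(s) via the OpenAI-compatible API.
curl "http://localhost:${VLLM_PORT}/v1/models"
```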

View file

@@ -1,5 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

View file

@@ -1,29 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Any, Dict, List, Optional
from pydantic import BaseModel
class LiteralString(str):
pass # Marker class for strings we want to format with >
class DockerComposeServiceConfig(BaseModel):
"""Configuration for a single service in docker-compose."""
image: str
volumes: Optional[List[str]] = None
network_mode: str = "bridge"
ports: Optional[List[str]] = None
devices: Optional[List[str]] = None
environment: Optional[Dict[str, str]] = None
command: Optional[str] = None
depends_on: Optional[List[str]] = None
deploy: Optional[Dict[str, Any]] = None
runtime: Optional[str] = None
entrypoint: Optional[str] = None

View file

@@ -54,11 +54,11 @@ class SqliteKVStoreConfig(CommonConfig):
     )

     @classmethod
-    def sample_run_config(cls, db_name: str = "kvstore.db"):
+    def sample_run_config(cls, dir: str = "runtime", db_name: str = "kvstore.db"):
         return {
             "type": "sqlite",
             "namespace": None,
-            "db_path": "${env.SQLITE_STORE_DIR:~/.llama/runtime/" + db_name + "}",
+            "db_path": "${env.SQLITE_STORE_DIR:~/.llama/" + f"{dir}/{db_name}" + "}",
         }

View file

@@ -0,0 +1,95 @@
# Remote vLLM Distribution
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
{{ providers_table }}
You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
{%- if docker_compose_env_vars %}
### Environment Variables
The following environment variables can be configured:
{% for var, (default_value, description) in docker_compose_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}
{% if default_models %}
### Models
The following models are configured by default:
{% for model in default_models %}
- `{{ model.model_id }}`
{% endfor %}
{% endif %}
## Using Docker Compose
You can use `docker compose` to start a vLLM container and Llama Stack server container together.
```bash
$ cd distributions/{{ name }}; docker compose up
```
You will see outputs similar to following ---
```
<TO BE FILLED>
```
To kill the server
```bash
docker compose down
```
## Starting vLLM and Llama Stack separately
You can also decide to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.
#### Start vLLM server.
```bash
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--model meta-llama/Llama-3.2-3B-Instruct
```
Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) for more details.
#### Start Llama Stack server pointing to your vLLM server
We have provided a template `run.yaml` file in the `distributions/remote-vllm` directory. Please make sure to modify the `inference.provider_id` to point to your vLLM server endpoint. As an example, if your vLLM server is running on `http://127.0.0.1:8000`, your `run.yaml` file should look like the following:
```yaml
inference:
- provider_id: vllm0
provider_type: remote::vllm
config:
url: http://127.0.0.1:8000
```
**Via Conda**
If you are using Conda, you can build and run the Llama Stack server with the following commands:
```bash
cd distributions/remote-vllm
llama stack build --template remote_vllm --image-type conda
llama stack run run.yaml
```
**Via Docker**
You can use the Llama Stack Docker image to start the server with the following command:
```bash
docker run --network host -it -p 5000:5000 \
-v ~/.llama:/root/.llama \
-v ./gpu/run.yaml:/root/llamastack-run-remote-vllm.yaml \
--gpus=all \
llamastack/distribution-remote-vllm \
--yaml_config /root/llamastack-run-remote-vllm.yaml
```

View file

@@ -9,7 +9,7 @@ from datetime import datetime
 from io import StringIO
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Dict, List, Optional, Set, Tuple

 import jinja2
 import yaml
@@ -22,7 +22,6 @@ from llama_stack.distribution.datatypes import (
     Api,
     BuildConfig,
     DistributionSpec,
-    KVStoreConfig,
     ModelInput,
     Provider,
     ShieldInput,
@@ -33,53 +32,26 @@ from llama_stack.distribution.utils.dynamic import instantiate_class_type
 from llama_stack.providers.remote.inference.vllm.config import (
     VLLMInferenceAdapterConfig,
 )
-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
+from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig


-class DistributionTemplate(BaseModel):
-    """
-    Represents a Llama Stack distribution instance that can generate configuration
-    and documentation files.
-    """
-
-    name: str
-    description: str
-
-    providers: Dict[str, List[str]]
-    run_config_overrides: Dict[str, List[Provider]] = Field(default_factory=dict)
-    compose_config_overrides: Dict[str, Dict[str, DockerComposeServiceConfig]] = Field(
-        default_factory=dict
-    )
-
+class RunConfigSettings(BaseModel):
+    provider_overrides: Dict[str, List[Provider]] = Field(default_factory=dict)
     default_models: List[ModelInput]
     default_shields: Optional[List[ShieldInput]] = None

-    # Optional configuration
-    metadata_store: Optional[KVStoreConfig] = None
-    docker_compose_env_vars: Optional[Dict[str, Tuple[str, str]]] = None
-    docker_image: Optional[str] = None
-
-    @property
-    def distribution_spec(self) -> DistributionSpec:
-        return DistributionSpec(
-            description=self.description,
-            docker_image=self.docker_image,
-            providers=self.providers,
-        )
-
-    def build_config(self) -> BuildConfig:
-        return BuildConfig(
-            name=self.name,
-            distribution_spec=self.distribution_spec,
-            image_type="conda",  # default to conda, can be overridden
-        )
-
-    def run_config(self) -> StackRunConfig:
+    def run_config(
+        self,
+        name: str,
+        providers: Dict[str, List[str]],
+        docker_image: Optional[str] = None,
+    ) -> StackRunConfig:
         provider_registry = get_provider_registry()
         provider_configs = {}
-        for api_str, provider_types in self.providers.items():
-            if providers := self.run_config_overrides.get(api_str):
-                provider_configs[api_str] = providers
+        for api_str, provider_types in providers.items():
+            if api_providers := self.provider_overrides.get(api_str):
+                provider_configs[api_str] = api_providers
                 continue

             provider_type = provider_types[0]
@@ -111,83 +83,53 @@ class DistributionTemplate(BaseModel):
         ]

         # Get unique set of APIs from providers
-        apis: Set[str] = set(self.providers.keys())
+        apis: Set[str] = set(providers.keys())

         return StackRunConfig(
-            image_name=self.name,
-            docker_image=self.docker_image,
+            image_name=name,
+            docker_image=docker_image,
             built_at=datetime.now(),
             apis=list(apis),
             providers=provider_configs,
-            metadata_store=self.metadata_store,
+            metadata_store=SqliteKVStoreConfig.sample_run_config(
+                dir=f"distributions/{name}",
+                db_name="registry.db",
+            ),
             models=self.default_models,
             shields=self.default_shields or [],
         )

-    def docker_compose_config(self) -> Dict[str, Any]:
-        services = {}
-        provider_registry = get_provider_registry()
-
-        # Add provider services based on their sample_compose_config
-        for api_str, api_providers in self.providers.items():
-            if overrides := self.compose_config_overrides.get(api_str):
-                services |= overrides
-                continue
-
-            # only look at the first provider to get the compose config for now
-            # we may want to use `docker compose profiles` in the future
-            provider_type = api_providers[0]
-            provider_id = provider_type.split("::")[-1]
-
-            api = Api(api_str)
-            if provider_type not in provider_registry[api]:
-                raise ValueError(
-                    f"Unknown provider type: {provider_type} for API: {api_str}"
-                )
-
-            config_class = provider_registry[api][provider_type].config_class
-            assert (
-                config_class is not None
-            ), f"No config class for provider type: {provider_type} for API: {api_str}"
-
-            config_class = instantiate_class_type(config_class)
-            if not hasattr(config_class, "sample_docker_compose_config"):
-                continue
-
-            compose_config = config_class.sample_docker_compose_config()
-            services[provider_id] = compose_config
-
-        port = "${LLAMASTACK_PORT:-5001}"
-
-        # Add main llamastack service
-        llamastack_config = DockerComposeServiceConfig(
-            image=f"llamastack/distribution-{self.name}:latest",
-            depends_on=list(services.keys()),
-            volumes=[
-                "~/.llama:/root/.llama",
-                f"~/local/llama-stack/distributions/{self.name}/run.yaml:/root/llamastack-run-{self.name}.yaml",
-            ],
-            ports=[f"{port}:{port}"],
-            environment={
-                k: v[0] for k, v in (self.docker_compose_env_vars or {}).items()
-            },
-            entrypoint=(
-                f'bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-{self.name}.yaml --port {port}"'
-            ),
-            deploy={
-                "restart_policy": {
-                    "condition": "on-failure",
-                    "delay": "3s",
-                    "max_attempts": 5,
-                    "window": "60s",
-                }
-            },
-        )
-        services["llamastack"] = llamastack_config
-
-        return {
-            "services": {k: v.model_dump() for k, v in services.items()},
-            "volumes": {service_name: None for service_name in services.keys()},
-        }
+
+class DistributionTemplate(BaseModel):
+    """
+    Represents a Llama Stack distribution instance that can generate configuration
+    and documentation files.
+    """
+
+    name: str
+    description: str
+
+    providers: Dict[str, List[str]]
+    run_configs: Dict[str, RunConfigSettings]
+    template_path: Path
+
+    # Optional configuration
+    docker_compose_env_vars: Optional[Dict[str, Tuple[str, str]]] = None
+    docker_image: Optional[str] = None
+
+    default_models: Optional[List[ModelInput]] = None
+
+    def build_config(self) -> BuildConfig:
+        return BuildConfig(
+            name=self.name,
+            distribution_spec=DistributionSpec(
+                description=self.description,
+                docker_image=self.docker_image,
+                providers=self.providers,
+            ),
+            image_type="conda",  # default to conda, can be overridden
+        )

     def generate_markdown_docs(self) -> str:
         """Generate markdown documentation using both Jinja2 templates and rich tables."""
         # First generate the providers table using rich
@@ -204,53 +146,7 @@ class DistributionTemplate(BaseModel):
         console.print(table)
         providers_table = output.getvalue()

-        # Main documentation template
-        template = """# {{ name }} Distribution
-
-{{ description }}
-
-## Provider Configuration
-
-The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
-
-{{ providers_table }}
-
-{%- if env_vars %}
-## Environment Variables
-
-The following environment variables can be configured:
-
-{% for var, (value, description) in docker_compose_env_vars.items() %}
-- `{{ var }}`: {{ description }}
-{% endfor %}
-{%- endif %}
-
-## Example Usage
-
-### Using Docker Compose
-
-```bash
-$ cd distributions/{{ name }}
-$ docker compose up
-```
-
-## Models
-
-The following models are configured by default:
-{% for model in default_models %}
-- `{{ model.model_id }}`
-{% endfor %}
-
-{%- if default_shields %}
-## Safety Shields
-
-The following safety shields are configured:
-{% for shield in default_shields %}
-- `{{ shield.shield_id }}`
-{%- endfor %}
-{%- endif %}
-"""
+        template = self.template_path.read_text()

         # Render template with rich-generated table
         env = jinja2.Environment(trim_blocks=True, lstrip_blocks=True)
         template = env.from_string(template)
@@ -261,7 +157,6 @@ The following safety shields are configured:
             providers_table=providers_table,
             docker_compose_env_vars=self.docker_compose_env_vars,
             default_models=self.default_models,
-            default_shields=self.default_shields,
         )

     def save_distribution(self, output_dir: Path) -> None:
@@ -271,19 +166,14 @@ The following safety shields are configured:
         with open(output_dir / "build.yaml", "w") as f:
             yaml.safe_dump(build_config.model_dump(), f, sort_keys=False)

-        run_config = self.run_config()
-        serialized = run_config.model_dump()
-        with open(output_dir / "run.yaml", "w") as f:
-            yaml.safe_dump(serialized, f, sort_keys=False)
-
-        # serialized_str = yaml.dump(serialized, sort_keys=False)
-        # env_vars = set()
-        # for match in re.finditer(r"\${env\.([A-Za-z0-9_-]+)}", serialized_str):
-        #     env_vars.add(match.group(1))
-
-        docker_compose = self.docker_compose_config()
-        with open(output_dir / "compose.yaml", "w") as f:
-            yaml.safe_dump(docker_compose, f, sort_keys=False, default_flow_style=False)
+        for yaml_pth, settings in self.run_configs.items():
+            print(f"Generating {yaml_pth}")
+            print(f"Providers: {self.providers}")
+            run_config = settings.run_config(
+                self.name, self.providers, self.docker_image
+            )
+            with open(output_dir / yaml_pth, "w") as f:
+                yaml.safe_dump(run_config.model_dump(), f, sort_keys=False)

         docs = self.generate_markdown_docs()
         with open(output_dir / f"{self.name}.md", "w") as f:
@@ -291,87 +181,89 @@
     @classmethod
     def vllm_distribution(cls) -> "DistributionTemplate":
+        providers = {
+            "inference": ["remote::vllm"],
+            "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
+            "safety": ["inline::llama-guard"],
+            "agents": ["inline::meta-reference"],
+            "telemetry": ["inline::meta-reference"],
+        }
+
+        inference_provider = Provider(
+            provider_id="vllm-inference",
+            provider_type="remote::vllm",
+            config=VLLMInferenceAdapterConfig.sample_run_config(
+                url="${env.VLLM_URL}",
+            ),
+        )
+
+        inference_model = ModelInput(
+            model_id="${env.INFERENCE_MODEL}",
+            provider_id="vllm-inference",
+        )
+        safety_model = ModelInput(
+            model_id="${env.SAFETY_MODEL}",
+            provider_id="vllm-safety",
+        )
+
         return cls(
             name="remote-vllm",
             description="Use (an external) vLLM server for running LLM inference",
-            providers={
-                "inference": ["remote::vllm"],
-                "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
-                "safety": ["inline::llama-guard"],
-                "agents": ["inline::meta-reference"],
-                "telemetry": ["inline::meta-reference"],
-            },
-            run_config_overrides={
-                "inference": [
-                    Provider(
-                        provider_id="vllm-0",
-                        provider_type="remote::vllm",
-                        config=VLLMInferenceAdapterConfig.sample_run_config(
-                            url="${env.VLLM_URL:http://host.docker.internal:5100/v1}",
-                        ),
-                    ),
-                    Provider(
-                        provider_id="vllm-1",
-                        provider_type="remote::vllm",
-                        config=VLLMInferenceAdapterConfig.sample_run_config(
-                            url="${env.SAFETY_VLLM_URL:http://host.docker.internal:5101/v1}",
-                        ),
-                    ),
-                ]
-            },
-            compose_config_overrides={
-                "inference": {
-                    "vllm-0": VLLMInferenceAdapterConfig.sample_docker_compose_config(
-                        port=5100,
-                        cuda_visible_devices="0",
-                        model="${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
-                    ),
-                    "vllm-1": VLLMInferenceAdapterConfig.sample_docker_compose_config(
-                        port=5100,
-                        cuda_visible_devices="1",
-                        model="${env.SAFETY_MODEL:Llama-Guard-3-1B}",
-                    ),
-                }
-            },
-            default_models=[
-                ModelInput(
-                    model_id="${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
-                    provider_id="vllm-0",
-                ),
-                ModelInput(
-                    model_id="${env.SAFETY_MODEL:Llama-Guard-3-1B}",
-                    provider_id="vllm-1",
-                ),
-            ],
-            default_shields=[
-                ShieldInput(shield_id="${env.SAFETY_MODEL:Llama-Guard-3-1B}")
-            ],
+            template_path=Path(__file__).parent / "remote-vllm" / "doc_template.md",
+            providers=providers,
+            default_models=[inference_model, safety_model],
+            run_configs={
+                "run.yaml": RunConfigSettings(
+                    provider_overrides={
+                        "inference": [inference_provider],
+                    },
+                    default_models=[inference_model],
+                ),
+                "safety-run.yaml": RunConfigSettings(
+                    provider_overrides={
+                        "inference": [
+                            inference_provider,
+                            Provider(
+                                provider_id="vllm-safety",
+                                provider_type="remote::vllm",
+                                config=VLLMInferenceAdapterConfig.sample_run_config(
+                                    url="${env.SAFETY_VLLM_URL}",
+                                ),
+                            ),
+                        ],
+                    },
+                    default_models=[
+                        inference_model,
+                        safety_model,
+                    ],
+                    default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
+                ),
+            },
             docker_compose_env_vars={
-                # these defaults are for the Docker Compose configuration
-                "VLLM_URL": (
-                    "http://host.docker.internal:${VLLM_PORT:-5100}/v1",
-                    "URL of the vLLM server with the main inference model",
-                ),
-                "SAFETY_VLLM_URL": (
-                    "http://host.docker.internal:${SAFETY_VLLM_PORT:-5101}/v1",
-                    "URL of the vLLM server with the safety model",
-                ),
-                "MAX_TOKENS": (
-                    "${MAX_TOKENS:-4096}",
-                    "Maximum number of tokens for generation",
+                "LLAMASTACK_PORT": (
+                    "5001",
+                    "Port for the Llama Stack distribution server",
                 ),
                 "INFERENCE_MODEL": (
-                    "${INFERENCE_MODEL:-Llama3.2-3B-Instruct}",
-                    "Name of the inference model to use",
+                    "meta-llama/Llama-3.2-3B-Instruct",
+                    "Inference model loaded into the vLLM server",
+                ),
+                "VLLM_URL": (
+                    "http://host.docker.internal:5100}/v1",
+                    "URL of the vLLM server with the main inference model",
+                ),
+                "MAX_TOKENS": (
+                    "4096",
+                    "Maximum number of tokens for generation",
+                ),
+                "SAFETY_VLLM_URL": (
+                    "http://host.docker.internal:5101/v1",
+                    "URL of the vLLM server with the safety model",
                 ),
                 "SAFETY_MODEL": (
-                    "${SAFETY_MODEL:-Llama-Guard-3-1B}",
+                    "meta-llama/Llama-Guard-3-1B",
                     "Name of the safety (Llama-Guard) model to use",
                 ),
-                "LLAMASTACK_PORT": (
-                    "${LLAMASTACK_PORT:-5001}",
-                    "Port for the Llama Stack distribution server",
-                ),
             },
         )
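Putting the template together with the README added earlier in this commit, a plausible way to exercise the two generated run configurations is sketched below; the environment variable values are illustrative and must match the `${env.*}` placeholders wired into `run.yaml` and `safety-run.yaml`:

```bash
# Build the distribution environment (command taken from the README above).
llama stack build --template remote_vllm --image-type conda

# Inference only.
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
VLLM_URL=http://localhost:5100/v1 \
llama stack run run.yaml

# Inference plus the Llama Guard safety shield.
INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
VLLM_URL=http://localhost:5100/v1 \
SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
SAFETY_VLLM_URL=http://localhost:5101/v1 \
llama stack run safety-run.yaml
```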