Adding docker-compose.yaml, starting to simplify

2025-12-16 14:32:38 +00:00 · 2024-11-16 10:56:38 -08:00 · 2024-11-16 10:56:38 -08:00 · f38e76ee98
commit f38e76ee98
parent e4509cb568
14 changed files with 516 additions and 386 deletions
--- a/distributions/remote-vllm/compose.yaml
+++ b/distributions/remote-vllm/compose.yaml
@ -9,25 +9,30 @@
 # Similarly change "host.docker.internal" to "localhost" in the run.yaml file
 #
 services:
-  vllm-0:
+  vllm-inference:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
-    # network_mode: "host"
+    # Docker's default network driver is named "bridge" ("bridged" is not a valid mode).
+    network_mode: ${NETWORK_MODE:-bridge}
    ports:
-       - "5100:5100"
+       - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
    devices:
      - nvidia.com/gpu=all
    environment:
-      - CUDA_VISIBLE_DEVICES=0
+      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
-      --model meta-llama/Llama-3.1-8B-Instruct
+      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
-      --port 5100
+      --port ${VLLM_INFERENCE_PORT:-5100}
+    # vLLM's OpenAI-compatible server serves its liveness probe at /health, outside /v1.
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
    deploy:
      resources:
        reservations:
@ -35,25 +40,34 @@ services:
          - driver: nvidia
            capabilities: [gpu]
    runtime: nvidia
-  vllm-1:
+
+  # A little trick:
+  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
+  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
+  # NOTE(review): Compose documents variable interpolation for values, not keys;
+  # confirm this key really expands, or make the service optional via `profiles`.
+  vllm-${VLLM_SAFETY_MODEL:+safety}:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
-    # network_mode: "host"
+    # Docker's default network driver is named "bridge" ("bridged" is not a valid mode).
+    network_mode: ${NETWORK_MODE:-bridge}
    ports:
-      - "5101:5101"
+      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
    devices:
      - nvidia.com/gpu=all
    environment:
-      - CUDA_VISIBLE_DEVICES=1
+      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
-      --model meta-llama/Llama-Guard-3-1B
+      --model ${VLLM_SAFETY_MODEL}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
-      --port 5101
+      --port ${VLLM_SAFETY_PORT:-5101}
+    # vLLM's OpenAI-compatible server serves its liveness probe at /health, outside /v1.
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
    deploy:
      resources:
        reservations:
@ -63,23 +77,25 @@ services:
    runtime: nvidia
  llamastack:
    depends_on:
-    - vllm-0
-    - vllm-1
-      # image: llamastack/distribution-remote-vllm
+      # Long-form depends_on (with `condition`) is a mapping keyed by service name,
+      # not a list of single-key mappings.
+      vllm-inference:
+        condition: service_healthy
+      vllm-${VLLM_SAFETY_MODEL:+safety}:
+        condition: service_healthy
+    # image: llamastack/distribution-remote-vllm
    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
    volumes:
      - ~/.llama:/root/.llama
-      - ~/local/llama-stack/distributions/remote-vllm/run.yaml:/root/llamastack-run-remote-vllm.yaml
-    # network_mode: "host"
+      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
+    # Docker's default network driver is named "bridge" ("bridged" is not a valid mode).
+    network_mode: ${NETWORK_MODE:-bridge}
    environment:
-      - LLAMA_INFERENCE_VLLM_URL=${LLAMA_INFERENCE_VLLM_URL:-http://host.docker.internal:5100/v1}
-      - LLAMA_INFERENCE_MODEL=${LLAMA_INFERENCE_MODEL:-Llama3.1-8B-Instruct}
+      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
+      # Name must match the ${env.SAFETY_VLLM_URL} reference in run-with-safety.yaml.
+      - SAFETY_VLLM_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      - MAX_TOKENS=${MAX_TOKENS:-4096}
      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
-      - LLAMA_SAFETY_VLLM_URL=${LLAMA_SAFETY_VLLM_URL:-http://host.docker.internal:5101/v1}
-      - LLAMA_SAFETY_MODEL=${LLAMA_SAFETY_MODEL:-Llama-Guard-3-1B}
+      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
    ports:
-      - "5001:5001"
+      - "${LLAMASTACK_PORT:-5001}:${LLAMASTACK_PORT:-5001}"
    # Hack: wait for vLLM server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
+    # The server must listen on the same port that is published above.
+    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port ${LLAMASTACK_PORT:-5001}"
    deploy:
@ -89,6 +105,6 @@ services:
        max_attempts: 5
        window: 60s
 volumes:
-  vllm-0:
-  vllm-1:
+  vllm-inference:
+  vllm-safety:
  llamastack:
--- a/distributions/remote-vllm/run-with-safety.yaml
+++ b/distributions/remote-vllm/run-with-safety.yaml
@ -0,0 +1,68 @@
+version: '2'
+built_at: '2024-11-11T20:09:45.988375'
+image_name: remote-vllm
+docker_image: remote-vllm
+conda_env: null
+apis:
+- inference
+- memory
+- safety
+- agents
+- telemetry
+providers:
+  inference:
+  # serves main inference model
+  - provider_id: vllm-inference
+    provider_type: remote::vllm
+    config:
+      # NOTE: replace with "localhost" if you are running in "host" network mode
+      url: ${env.VLLM_URL}
+      max_tokens: ${env.MAX_TOKENS:4096}
+      api_token: fake
+  # serves safety llama_guard model
+  - provider_id: vllm-safety
+    provider_type: remote::vllm
+    config:
+      # NOTE: replace with "localhost" if you are running in "host" network mode
+      url: ${env.SAFETY_VLLM_URL}
+      max_tokens: ${env.MAX_TOKENS:4096}
+      api_token: fake
+  # `memory` is declared exactly once: a repeated mapping key is invalid YAML and most
+  # loaders silently keep only the last occurrence, dropping the kvstore settings below.
+  memory:
+  - provider_id: faiss-0
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        namespace: null
+        type: sqlite
+        db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config: {}
+  agents:
+  - provider_id: meta0
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        namespace: null
+        type: sqlite
+        db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db"
+  telemetry:
+  - provider_id: meta0
+    provider_type: inline::meta-reference
+    config: {}
+metadata_store:
+  namespace: null
+  type: sqlite
+  db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
+models:
+  - model_id: ${env.INFERENCE_MODEL}
+    provider_id: vllm-inference
+  - model_id: ${env.SAFETY_MODEL}
+    provider_id: vllm-safety
+shields:
+  - shield_id: ${env.SAFETY_MODEL}
--- a/distributions/remote-vllm/run.yaml
+++ b/distributions/remote-vllm/run.yaml
@ -6,39 +6,25 @@ conda_env: null
 apis:
 - inference
 - memory
- safety
 - agents
 - telemetry
 providers:
  inference:
  # serves main inference model
-  - provider_id: vllm-0
+  - provider_id: vllm-inference
    provider_type: remote::vllm
    config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
-      url: ${env.VLLM_URL:http://host.docker.internal:5100/v1}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
-  # serves safety llama_guard model
-  - provider_id: vllm-1
-    provider_type: remote::vllm
-    config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
-      url: ${env.SAFETY_VLLM_URL:http://host.docker.internal:5101/v1}
+      url: ${env.VLLM_URL}
      max_tokens: ${env.MAX_TOKENS:4096}
      api_token: fake
  memory:
-  - provider_id: faiss-0
+  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        namespace: null
        type: sqlite
        db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config: {}
  memory:
  - provider_id: meta0
    provider_type: inline::faiss
@ -60,9 +46,5 @@ metadata_store:
  type: sqlite
  db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
 models:
-  - model_id: ${env.INFERENCE_MODEL:Llama3.1-8B-Instruct}
-    provider_id: vllm-0
-  - model_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
-    provider_id: vllm-1
-shields:
-  - shield_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
+  - model_id: ${env.INFERENCE_MODEL}
+    provider_id: vllm-inference
--- a/llama_stack/providers/remote/inference/ollama/init.py
+++ b/llama_stack/providers/remote/inference/ollama/init.py
@ -4,37 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Optional
-
-from llama_stack.distribution.datatypes import RemoteProviderConfig
-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
+from .config import OllamaImplConfig


-DEFAULT_OLLAMA_PORT = 11434
-
-
-class OllamaImplConfig(RemoteProviderConfig):
-    port: int = DEFAULT_OLLAMA_PORT
-
-    @classmethod
-    def sample_docker_compose_config(cls) -> Optional[DockerComposeServiceConfig]:
-        return DockerComposeServiceConfig(
-            image="ollama/ollama:latest",
-            volumes=["$HOME/.ollama:/root/.ollama"],
-            devices=["nvidia.com/gpu=all"],
-            deploy={
-                "resources": {
-                    "reservations": {
-                        "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
-                    }
-                }
-            },
-            runtime="nvidia",
-            ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"],
-        )
-
-
-async def get_adapter_impl(config: RemoteProviderConfig, _deps):
+async def get_adapter_impl(config: OllamaImplConfig, _deps):
    from .ollama import OllamaInferenceAdapter

    impl = OllamaInferenceAdapter(config.url)
--- a/llama_stack/providers/remote/inference/ollama/config.py
+++ b/llama_stack/providers/remote/inference/ollama/config.py
@ -0,0 +1,65 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import List
+
+from llama_stack.distribution.datatypes import RemoteProviderConfig
+from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
+
+
+DEFAULT_OLLAMA_PORT = 11434
+
+
+class OllamaImplConfig(RemoteProviderConfig):
+    port: int = DEFAULT_OLLAMA_PORT
+
+    @classmethod
+    def sample_docker_compose_services(cls) -> List[DockerComposeServiceConfig]:
+        return [
+            DockerComposeServiceConfig(
+                service_name="ollama",
+                image="ollama/ollama:latest",
+                volumes=["$HOME/.ollama:/root/.ollama"],
+                devices=["nvidia.com/gpu=all"],
+                deploy={
+                    "resources": {
+                        "reservations": {
+                            "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
+                        }
+                    }
+                },
+                runtime="nvidia",
+                ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"],
+                healthcheck={
+                    "test": ["CMD", "curl", "-f", "http://ollama:11434"],
+                    "interval": "10s",
+                    "timeout": "5s",
+                    "retries": 5,
+                },
+            ),
+            DockerComposeServiceConfig(
+                service_name="ollama-init",
+                image="ollama/ollama",
+                depends_on={"ollama": {"condition": "service_healthy"}},
+                environment={
+                    "OLLAMA_HOST": "ollama",
+                    "OLLAMA_MODELS": "${OLLAMA_MODELS}",
+                },
+                volumes=["ollama_data:/root/.ollama"],
+                entrypoint=(
+                    'sh -c \'max_attempts=30;attempt=0;echo "Waiting for Ollama server...";'
+                    "until curl -s http://ollama:11434 > /dev/null; do"
+                    "attempt=$((attempt + 1));"
+                    "if [ $attempt -ge $max_attempts ]; then"
+                    'echo "Timeout waiting for Ollama server";'
+                    "exit 1;"
+                    "fi;"
+                    'echo "Attempt $attempt: Server not ready yet...";'
+                    "sleep 5;"
+                    "done'"
+                ),
+            ),
+        ]
--- a/llama_stack/providers/remote/inference/ollama/docker_compose.yaml
+++ b/llama_stack/providers/remote/inference/ollama/docker_compose.yaml
@ -0,0 +1,55 @@
+services:
+  ${SERVICE_NAME:-ollama}:
+    image: ollama/ollama:latest
+    ports:
+      - "${OLLAMA_PORT:-11434}:${OLLAMA_PORT:-11434}"
+    volumes:
+      - $HOME/.ollama:/root/.ollama
+    devices:
+      - nvidia.com/gpu=all
+    runtime: nvidia
+    healthcheck:
+      # The probe runs inside this container, so localhost works whatever SERVICE_NAME is.
+      test: ["CMD", "curl", "-f", "http://localhost:11434"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  ${SERVICE_NAME:-ollama}-init:
+    image: ollama/ollama
+    # Long-form depends_on (with `condition`) is a mapping keyed by service name.
+    depends_on:
+      ${SERVICE_NAME:-ollama}:
+        condition: service_healthy
+    environment:
+      - OLLAMA_HOST=${SERVICE_NAME:-ollama}
+      - OLLAMA_MODELS=${OLLAMA_MODELS}
+    volumes:
+      - $HOME/.ollama:/root/.ollama
+    # Shell variables are written with `$$` so Compose passes a literal `$` to sh
+    # instead of interpolating them; `${...}` forms are still filled in by Compose.
+    entrypoint: >
+      sh -c '
+        max_attempts=30;
+        attempt=0;
+
+        echo "Waiting for Ollama server...";
+        until curl -s http://${SERVICE_NAME:-ollama}:11434 > /dev/null; do
+          attempt=$$((attempt + 1));
+          if [ $$attempt -ge $$max_attempts ]; then
+            echo "Timeout waiting for Ollama server";
+            exit 1;
+          fi;
+          echo "Attempt $$attempt: Server not ready yet...";
+          sleep 5;
+        done;
+
+        echo "Server ready. Pulling models...";
+
+        models="${OLLAMA_MODELS}";
+        for model in $$models; do
+          echo "Pulling $$model...";
+          if ! ollama pull "$$model"; then
+            echo "Failed to pull $$model";
+            exit 1;
+          fi;
+        done;
+
+        echo "All models pulled successfully"
+      '
--- a/llama_stack/providers/remote/inference/tgi/docker_compose.yaml
+++ b/llama_stack/providers/remote/inference/tgi/docker_compose.yaml
@ -0,0 +1,35 @@
+services:
+  ${SERVICE_NAME:-tgi}:
+    image: ghcr.io/huggingface/text-generation-inference:2.3.1
+    # NOTE(review): with host networking the `ports` mapping below has no effect (and
+    # some Compose versions reject the combination) -- confirm which one is intended.
+    network_mode: "host"
+    volumes:
+      - $HOME/.cache/huggingface:/data
+    ports:
+      - "${TGI_PORT:-8000}:${TGI_PORT:-8000}"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
+      - HF_HOME=/data
+      - HF_DATASETS_CACHE=/data
+      - HF_MODULES_CACHE=/data
+      - HF_HUB_CACHE=/data
+    command: >
+      --dtype bfloat16
+      --usage-stats off
+      --sharded false
+      --model-id ${TGI_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      --port ${TGI_PORT:-8000}
+      --cuda-memory-fraction ${TGI_CUDA_MEMORY_FRACTION:-0.8}
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            capabilities: [gpu]
+    runtime: nvidia
+    healthcheck:
+      # The probe runs inside the container; under host networking the Compose service
+      # name is not resolvable, so the check targets localhost.
+      test: ["CMD", "curl", "-f", "http://localhost:${TGI_PORT:-8000}/health"]
+      interval: 5s
+      timeout: 5s
+      retries: 30
--- a/llama_stack/providers/remote/inference/vllm/config.py
+++ b/llama_stack/providers/remote/inference/vllm/config.py
@ -9,11 +9,6 @@ from typing import Optional
 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field

-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
-
-
-DEFAULT_VLLM_PORT = 8000
-

@json_schema_type
 class VLLMInferenceAdapterConfig(BaseModel):
@ -33,48 +28,10 @@ class VLLMInferenceAdapterConfig(BaseModel):
    @classmethod
    def sample_run_config(
        cls,
-        url: str = "${env.VLLM_URL:http://host.docker.internal:5100/v1}",
+        url: str = "${env.VLLM_URL}",
    ):
        return {
            "url": url,
            "max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
            "api_token": "${env.VLLM_API_TOKEN:fake}",
        }
-
-    @classmethod
-    def sample_docker_compose_config(
-        cls,
-        port: int = DEFAULT_VLLM_PORT,
-        cuda_visible_devices: str = "0",
-        model: str = "meta-llama/Llama-3.2-3B-Instruct",
-    ) -> Optional[DockerComposeServiceConfig]:
-        return DockerComposeServiceConfig(
-            image="vllm/vllm-openai:latest",
-            volumes=["$HOME/.cache/huggingface:/root/.cache/huggingface"],
-            devices=["nvidia.com/gpu=all"],
-            deploy={
-                "resources": {
-                    "reservations": {
-                        "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
-                    }
-                }
-            },
-            runtime="nvidia",
-            ports=[f"{port}:{port}"],
-            environment={
-                "CUDA_VISIBLE_DEVICES": cuda_visible_devices,
-                "HUGGING_FACE_HUB_TOKEN": "$HF_TOKEN",
-            },
-            command=(
-                " ".join(
-                    [
-                        "--gpu-memory-utilization 0.75",
-                        f"--model {model}",
-                        "--enforce-eager",
-                        "--max-model-len 8192",
-                        "--max-num-seqs 16",
-                        f"--port {port}",
-                    ]
-                )
-            ),
-        )
--- a/llama_stack/providers/remote/inference/vllm/docker_compose.yaml
+++ b/llama_stack/providers/remote/inference/vllm/docker_compose.yaml
@ -0,0 +1,26 @@
+services:
+  ${SERVICE_NAME:-vllm}:
+    image: vllm/vllm-openai:latest
+    ports:
+      - "${VLLM_PORT:-5100}:${VLLM_PORT:-5100}"
+    volumes:
+      - $HOME/.cache/huggingface:/root/.cache/huggingface
+    devices:
+      - nvidia.com/gpu=all
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: [gpu]
+    runtime: nvidia
+    environment:
+      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
+      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
+    command: >
+      --gpu-memory-utilization 0.75
+      --model ${VLLM_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      --enforce-eager
+      --max-model-len 8192
+      --max-num-seqs 16
+      --port ${VLLM_PORT:-5100}
+    # Health probe so dependents can wait on `condition: service_healthy`, matching
+    # the ollama and tgi service templates; vLLM serves it at /health.
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_PORT:-5100}/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
--- a/llama_stack/providers/utils/docker/init.py
+++ b/llama_stack/providers/utils/docker/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/llama_stack/providers/utils/docker/service_config.py
+++ b/llama_stack/providers/utils/docker/service_config.py
@ -1,29 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Dict, List, Optional
-
-from pydantic import BaseModel
-
-
-class LiteralString(str):
-    pass  # Marker class for strings we want to format with >
-
-
-class DockerComposeServiceConfig(BaseModel):
-    """Configuration for a single service in docker-compose."""
-
-    image: str
-    volumes: Optional[List[str]] = None
-    network_mode: str = "bridge"
-    ports: Optional[List[str]] = None
-    devices: Optional[List[str]] = None
-    environment: Optional[Dict[str, str]] = None
-    command: Optional[str] = None
-    depends_on: Optional[List[str]] = None
-    deploy: Optional[Dict[str, Any]] = None
-    runtime: Optional[str] = None
-    entrypoint: Optional[str] = None
--- a/llama_stack/providers/utils/kvstore/config.py
+++ b/llama_stack/providers/utils/kvstore/config.py
@ -54,11 +54,11 @@ class SqliteKVStoreConfig(CommonConfig):
    )

    @classmethod
-    def sample_run_config(cls, db_name: str = "kvstore.db"):
+    def sample_run_config(cls, dir: str = "runtime", db_name: str = "kvstore.db"):
        return {
            "type": "sqlite",
            "namespace": None,
-            "db_path": "${env.SQLITE_STORE_DIR:~/.llama/runtime/" + db_name + "}",
+            "db_path": "${env.SQLITE_STORE_DIR:~/.llama/" + f"{dir}/{db_name}" + "}",
        }


--- a/llama_stack/templates/remote-vllm/doc_template.md
+++ b/llama_stack/templates/remote-vllm/doc_template.md
@ -0,0 +1,95 @@
+# Remote vLLM Distribution
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
+
+{{ providers_table }}
+
+You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
+
+{%- if docker_compose_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in docker_compose_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+{% if default_models %}
+### Models
+
+The following models are configured by default:
+{% for model in default_models %}
+- `{{ model.model_id }}`
+{% endfor %}
+{% endif %}
+
+## Using Docker Compose
+
+You can use `docker compose` to start a vLLM container and Llama Stack server container together.
+```bash
+$ cd distributions/{{ name }}; docker compose up
+```
+
+You will see output similar to the following:
+```
+<TO BE FILLED>
+```
+
+To kill the server
+```bash
+docker compose down
+```
+
+## Starting vLLM and Llama Stack separately
+
+You can also decide to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.
+
+#### Start vLLM server.
+
+```bash
+docker run --runtime nvidia --gpus all \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
+    -p 8000:8000 \
+    --ipc=host \
+    vllm/vllm-openai:latest \
+    --model meta-llama/Llama-3.2-3B-Instruct
+```
+
+Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) for more details.
+
+
+#### Start Llama Stack server pointing to your vLLM server
+
+
+We have provided a template `run.yaml` file in the `distributions/remote-vllm` directory. Please make sure to modify the inference provider's `config.url` to point to your vLLM server endpoint. As an example, if your vLLM server is running on `http://127.0.0.1:8000`, your `run.yaml` file should look like the following:
+```yaml
+inference:
+  - provider_id: vllm0
+    provider_type: remote::vllm
+    config:
+      url: http://127.0.0.1:8000
+```
+
+**Via Conda**
+
+If you are using Conda, you can build and run the Llama Stack server with the following commands:
+```bash
+cd distributions/remote-vllm
+llama stack build --template remote-vllm --image-type conda
+llama stack run run.yaml
+```
+
+**Via Docker**
+
+You can use the Llama Stack Docker image to start the server with the following command:
+```bash
+docker run --network host -it -p 5000:5000 \
+  -v ~/.llama:/root/.llama \
+  -v ./run.yaml:/root/llamastack-run-remote-vllm.yaml \
+  --gpus=all \
+  llamastack/distribution-remote-vllm \
+  --yaml_config /root/llamastack-run-remote-vllm.yaml
+```
--- a/llama_stack/templates/template.py
+++ b/llama_stack/templates/template.py
@ -9,7 +9,7 @@ from datetime import datetime
 from io import StringIO

 from pathlib import Path
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Dict, List, Optional, Set, Tuple

 import jinja2
 import yaml
@ -22,7 +22,6 @@ from llama_stack.distribution.datatypes import (
    Api,
    BuildConfig,
    DistributionSpec,
-    KVStoreConfig,
    ModelInput,
    Provider,
    ShieldInput,
@ -33,53 +32,26 @@ from llama_stack.distribution.utils.dynamic import instantiate_class_type
 from llama_stack.providers.remote.inference.vllm.config import (
    VLLMInferenceAdapterConfig,
 )
-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
+from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig


-class DistributionTemplate(BaseModel):
-    """
-    Represents a Llama Stack distribution instance that can generate configuration
-    and documentation files.
-    """
-
-    name: str
-    description: str
-    providers: Dict[str, List[str]]
-    run_config_overrides: Dict[str, List[Provider]] = Field(default_factory=dict)
-    compose_config_overrides: Dict[str, Dict[str, DockerComposeServiceConfig]] = Field(
-        default_factory=dict
-    )
-
+class RunConfigSettings(BaseModel):
+    provider_overrides: Dict[str, List[Provider]] = Field(default_factory=dict)
    default_models: List[ModelInput]
    default_shields: Optional[List[ShieldInput]] = None

-    # Optional configuration
-    metadata_store: Optional[KVStoreConfig] = None
-    docker_compose_env_vars: Optional[Dict[str, Tuple[str, str]]] = None
-    docker_image: Optional[str] = None
-
-    @property
-    def distribution_spec(self) -> DistributionSpec:
-        return DistributionSpec(
-            description=self.description,
-            docker_image=self.docker_image,
-            providers=self.providers,
-        )
-
-    def build_config(self) -> BuildConfig:
-        return BuildConfig(
-            name=self.name,
-            distribution_spec=self.distribution_spec,
-            image_type="conda",  # default to conda, can be overridden
-        )
-
-    def run_config(self) -> StackRunConfig:
+    def run_config(
+        self,
+        name: str,
+        providers: Dict[str, List[str]],
+        docker_image: Optional[str] = None,
+    ) -> StackRunConfig:
        provider_registry = get_provider_registry()

        provider_configs = {}
-        for api_str, provider_types in self.providers.items():
-            if providers := self.run_config_overrides.get(api_str):
-                provider_configs[api_str] = providers
+        for api_str, provider_types in providers.items():
+            if api_providers := self.provider_overrides.get(api_str):
+                provider_configs[api_str] = api_providers
                continue

            provider_type = provider_types[0]
@ -111,83 +83,53 @@ class DistributionTemplate(BaseModel):
            ]

        # Get unique set of APIs from providers
-        apis: Set[str] = set(self.providers.keys())
+        apis: Set[str] = set(providers.keys())

        return StackRunConfig(
-            image_name=self.name,
-            docker_image=self.docker_image,
+            image_name=name,
+            docker_image=docker_image,
            built_at=datetime.now(),
            apis=list(apis),
            providers=provider_configs,
-            metadata_store=self.metadata_store,
+            metadata_store=SqliteKVStoreConfig.sample_run_config(
+                dir=f"distributions/{name}",
+                db_name="registry.db",
+            ),
            models=self.default_models,
            shields=self.default_shields or [],
        )

-    def docker_compose_config(self) -> Dict[str, Any]:
-        services = {}
-        provider_registry = get_provider_registry()

-        # Add provider services based on their sample_compose_config
-        for api_str, api_providers in self.providers.items():
-            if overrides := self.compose_config_overrides.get(api_str):
-                services |= overrides
-                continue
+class DistributionTemplate(BaseModel):
+    """
+    Represents a Llama Stack distribution instance that can generate configuration
+    and documentation files.
+    """

-            # only look at the first provider to get the compose config for now
-            # we may want to use `docker compose profiles` in the future
-            provider_type = api_providers[0]
-            provider_id = provider_type.split("::")[-1]
-            api = Api(api_str)
-            if provider_type not in provider_registry[api]:
-                raise ValueError(
-                    f"Unknown provider type: {provider_type} for API: {api_str}"
-                )
+    name: str
+    description: str

-            config_class = provider_registry[api][provider_type].config_class
-            assert (
-                config_class is not None
-            ), f"No config class for provider type: {provider_type} for API: {api_str}"
+    providers: Dict[str, List[str]]
+    run_configs: Dict[str, RunConfigSettings]
+    template_path: Path

-            config_class = instantiate_class_type(config_class)
-            if not hasattr(config_class, "sample_docker_compose_config"):
-                continue
+    # Optional configuration
+    docker_compose_env_vars: Optional[Dict[str, Tuple[str, str]]] = None
+    docker_image: Optional[str] = None

-            compose_config = config_class.sample_docker_compose_config()
-            services[provider_id] = compose_config
+    default_models: Optional[List[ModelInput]] = None

-        port = "${LLAMASTACK_PORT:-5001}"
-        # Add main llamastack service
-        llamastack_config = DockerComposeServiceConfig(
-            image=f"llamastack/distribution-{self.name}:latest",
-            depends_on=list(services.keys()),
-            volumes=[
-                "~/.llama:/root/.llama",
-                f"~/local/llama-stack/distributions/{self.name}/run.yaml:/root/llamastack-run-{self.name}.yaml",
-            ],
-            ports=[f"{port}:{port}"],
-            environment={
-                k: v[0] for k, v in (self.docker_compose_env_vars or {}).items()
-            },
-            entrypoint=(
-                f'bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-{self.name}.yaml --port {port}"'
+    def build_config(self) -> BuildConfig:
+        return BuildConfig(
+            name=self.name,
+            distribution_spec=DistributionSpec(
+                description=self.description,
+                docker_image=self.docker_image,
+                providers=self.providers,
            ),
-            deploy={
-                "restart_policy": {
-                    "condition": "on-failure",
-                    "delay": "3s",
-                    "max_attempts": 5,
-                    "window": "60s",
-                }
-            },
+            image_type="conda",  # default to conda, can be overridden
        )

-        services["llamastack"] = llamastack_config
-        return {
-            "services": {k: v.model_dump() for k, v in services.items()},
-            "volumes": {service_name: None for service_name in services.keys()},
-        }
-
    def generate_markdown_docs(self) -> str:
        """Generate markdown documentation using both Jinja2 templates and rich tables."""
        # First generate the providers table using rich
@ -204,53 +146,7 @@ class DistributionTemplate(BaseModel):
        console.print(table)
        providers_table = output.getvalue()

-        # Main documentation template
-        template = """# {{ name }} Distribution
-
-{{ description }}
-
-## Provider Configuration
-
-The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
-
-{{ providers_table }}
-
-{%- if env_vars %}
-## Environment Variables
-
-The following environment variables can be configured:
-
-{% for var, (value, description) in docker_compose_env_vars.items() %}
- `{{ var }}`: {{ description }}
-{% endfor %}
-{%- endif %}
-
-## Example Usage
-
-### Using Docker Compose
-
-```bash
-$ cd distributions/{{ name }}
-$ docker compose up
-```
-
-## Models
-
-The following models are configured by default:
-{% for model in default_models %}
- `{{ model.model_id }}`
-{% endfor %}
-
-{%- if default_shields %}
-
-## Safety Shields
-
-The following safety shields are configured:
-{% for shield in default_shields %}
- `{{ shield.shield_id }}`
-{%- endfor %}
-{%- endif %}
-"""
+        template = self.template_path.read_text()
        # Render template with rich-generated table
        env = jinja2.Environment(trim_blocks=True, lstrip_blocks=True)
        template = env.from_string(template)
@ -261,7 +157,6 @@ The following safety shields are configured:
            providers_table=providers_table,
            docker_compose_env_vars=self.docker_compose_env_vars,
            default_models=self.default_models,
-            default_shields=self.default_shields,
        )

    def save_distribution(self, output_dir: Path) -> None:
@ -271,19 +166,14 @@ The following safety shields are configured:
        with open(output_dir / "build.yaml", "w") as f:
            yaml.safe_dump(build_config.model_dump(), f, sort_keys=False)

-        run_config = self.run_config()
-        serialized = run_config.model_dump()
-        with open(output_dir / "run.yaml", "w") as f:
-            yaml.safe_dump(serialized, f, sort_keys=False)
-
-        # serialized_str = yaml.dump(serialized, sort_keys=False)
-        # env_vars = set()
-        # for match in re.finditer(r"\${env\.([A-Za-z0-9_-]+)}", serialized_str):
-        #     env_vars.add(match.group(1))
-
-        docker_compose = self.docker_compose_config()
-        with open(output_dir / "compose.yaml", "w") as f:
-            yaml.safe_dump(docker_compose, f, sort_keys=False, default_flow_style=False)
+        for yaml_pth, settings in self.run_configs.items():
+            print(f"Generating {yaml_pth}")
+            print(f"Providers: {self.providers}")
+            run_config = settings.run_config(
+                self.name, self.providers, self.docker_image
+            )
+            with open(output_dir / yaml_pth, "w") as f:
+                yaml.safe_dump(run_config.model_dump(), f, sort_keys=False)

        docs = self.generate_markdown_docs()
        with open(output_dir / f"{self.name}.md", "w") as f:
@ -291,87 +181,89 @@ The following safety shields are configured:

    @classmethod
    def vllm_distribution(cls) -> "DistributionTemplate":
+        providers = {
+            "inference": ["remote::vllm"],
+            "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
+            "safety": ["inline::llama-guard"],
+            "agents": ["inline::meta-reference"],
+            "telemetry": ["inline::meta-reference"],
+        }
+
+        inference_provider = Provider(
+            provider_id="vllm-inference",
+            provider_type="remote::vllm",
+            config=VLLMInferenceAdapterConfig.sample_run_config(
+                url="${env.VLLM_URL}",
+            ),
+        )
+
+        inference_model = ModelInput(
+            model_id="${env.INFERENCE_MODEL}",
+            provider_id="vllm-inference",
+        )
+        safety_model = ModelInput(
+            model_id="${env.SAFETY_MODEL}",
+            provider_id="vllm-safety",
+        )
+
        return cls(
            name="remote-vllm",
            description="Use (an external) vLLM server for running LLM inference",
-            providers={
-                "inference": ["remote::vllm"],
-                "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
-                "safety": ["inline::llama-guard"],
-                "agents": ["inline::meta-reference"],
-                "telemetry": ["inline::meta-reference"],
-            },
-            run_config_overrides={
-                "inference": [
-                    Provider(
-                        provider_id="vllm-0",
-                        provider_type="remote::vllm",
-                        config=VLLMInferenceAdapterConfig.sample_run_config(
-                            url="${env.VLLM_URL:http://host.docker.internal:5100/v1}",
-                        ),
-                    ),
-                    Provider(
-                        provider_id="vllm-1",
-                        provider_type="remote::vllm",
-                        config=VLLMInferenceAdapterConfig.sample_run_config(
-                            url="${env.SAFETY_VLLM_URL:http://host.docker.internal:5101/v1}",
-                        ),
-                    ),
-                ]
-            },
-            compose_config_overrides={
-                "inference": {
-                    "vllm-0": VLLMInferenceAdapterConfig.sample_docker_compose_config(
-                        port=5100,
-                        cuda_visible_devices="0",
-                        model="${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
-                    ),
-                    "vllm-1": VLLMInferenceAdapterConfig.sample_docker_compose_config(
-                        port=5100,
-                        cuda_visible_devices="1",
-                        model="${env.SAFETY_MODEL:Llama-Guard-3-1B}",
-                    ),
-                }
-            },
-            default_models=[
-                ModelInput(
-                    model_id="${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
-                    provider_id="vllm-0",
+            template_path=Path(__file__).parent / "remote-vllm" / "doc_template.md",
+            providers=providers,
+            default_models=[inference_model, safety_model],
+            run_configs={
+                "run.yaml": RunConfigSettings(
+                    provider_overrides={
+                        "inference": [inference_provider],
+                    },
+                    default_models=[inference_model],
                ),
-                ModelInput(
-                    model_id="${env.SAFETY_MODEL:Llama-Guard-3-1B}",
-                    provider_id="vllm-1",
+                "safety-run.yaml": RunConfigSettings(
+                    provider_overrides={
+                        "inference": [
+                            inference_provider,
+                            Provider(
+                                provider_id="vllm-safety",
+                                provider_type="remote::vllm",
+                                config=VLLMInferenceAdapterConfig.sample_run_config(
+                                    url="${env.SAFETY_VLLM_URL}",
+                                ),
+                            ),
+                        ],
+                    },
+                    default_models=[
+                        inference_model,
+                        safety_model,
+                    ],
+                    default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
                ),
-            ],
-            default_shields=[
-                ShieldInput(shield_id="${env.SAFETY_MODEL:Llama-Guard-3-1B}")
-            ],
+            },
            docker_compose_env_vars={
-                # these defaults are for the Docker Compose configuration
-                "VLLM_URL": (
-                    "http://host.docker.internal:${VLLM_PORT:-5100}/v1",
-                    "URL of the vLLM server with the main inference model",
-                ),
-                "SAFETY_VLLM_URL": (
-                    "http://host.docker.internal:${SAFETY_VLLM_PORT:-5101}/v1",
-                    "URL of the vLLM server with the safety model",
-                ),
-                "MAX_TOKENS": (
-                    "${MAX_TOKENS:-4096}",
-                    "Maximum number of tokens for generation",
+                "LLAMASTACK_PORT": (
+                    "5001",
+                    "Port for the Llama Stack distribution server",
                ),
                "INFERENCE_MODEL": (
-                    "${INFERENCE_MODEL:-Llama3.2-3B-Instruct}",
-                    "Name of the inference model to use",
+                    "meta-llama/Llama-3.2-3B-Instruct",
+                    "Inference model loaded into the vLLM server",
+                ),
+                "VLLM_URL": (
+                    "http://host.docker.internal:5100/v1",
+                    "URL of the vLLM server with the main inference model",
+                ),
+                "MAX_TOKENS": (
+                    "4096",
+                    "Maximum number of tokens for generation",
+                ),
+                "SAFETY_VLLM_URL": (
+                    "http://host.docker.internal:5101/v1",
+                    "URL of the vLLM server with the safety model",
                ),
                "SAFETY_MODEL": (
-                    "${SAFETY_MODEL:-Llama-Guard-3-1B}",
+                    "meta-llama/Llama-Guard-3-1B",
                    "Name of the safety (Llama-Guard) model to use",
                ),
-                "LLAMASTACK_PORT": (
-                    "${LLAMASTACK_PORT:-5001}",
-                    "Port for the Llama Stack distribution server",
-                ),
            },
        )