Adding docker-compose.yaml, starting to simplify

Ashwin Bharambe 2024-11-16 10:56:38 -08:00
parent e4509cb568
commit f38e76ee98
14 changed files with 516 additions and 386 deletions

@@ -9,11 +9,6 @@ from typing import Optional
 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
-
-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
-
-DEFAULT_VLLM_PORT = 8000
-
 
 
 @json_schema_type
 class VLLMInferenceAdapterConfig(BaseModel):
@@ -33,48 +28,10 @@ class VLLMInferenceAdapterConfig(BaseModel):
     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.VLLM_URL:http://host.docker.internal:5100/v1}",
+        url: str = "${env.VLLM_URL}",
     ):
         return {
             "url": url,
             "max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
             "api_token": "${env.VLLM_API_TOKEN:fake}",
         }
-
-    @classmethod
-    def sample_docker_compose_config(
-        cls,
-        port: int = DEFAULT_VLLM_PORT,
-        cuda_visible_devices: str = "0",
-        model: str = "meta-llama/Llama-3.2-3B-Instruct",
-    ) -> Optional[DockerComposeServiceConfig]:
-        return DockerComposeServiceConfig(
-            image="vllm/vllm-openai:latest",
-            volumes=["$HOME/.cache/huggingface:/root/.cache/huggingface"],
-            devices=["nvidia.com/gpu=all"],
-            deploy={
-                "resources": {
-                    "reservations": {
-                        "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
-                    }
-                }
-            },
-            runtime="nvidia",
-            ports=[f"{port}:{port}"],
-            environment={
-                "CUDA_VISIBLE_DEVICES": cuda_visible_devices,
-                "HUGGING_FACE_HUB_TOKEN": "$HF_TOKEN",
-            },
-            command=(
-                " ".join(
-                    [
-                        "--gpu-memory-utilization 0.75",
-                        f"--model {model}",
-                        "--enforce-eager",
-                        "--max-model-len 8192",
-                        "--max-num-seqs 16",
-                        f"--port {port}",
-                    ]
-                )
-            ),
-        )
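
The "${env.VAR:default}" values in sample_run_config are environment placeholders: the variable is read from the environment, with the text after the colon as a fallback, so the new bare "${env.VLLM_URL}" default means VLLM_URL must now be set explicitly. A minimal sketch of how such placeholders could be resolved (illustrative only; resolve_env_placeholders is a hypothetical helper, not llama-stack's actual resolver):

import os
import re

# Illustrative sketch only (hypothetical helper, not the actual llama-stack
# resolver): "${env.VAR:default}" reads VAR from the environment and falls
# back to the text after the colon; "${env.VAR}" has no fallback and must
# be set.
_PLACEHOLDER = re.compile(r"\$\{env\.(\w+)(?::([^}]*))?\}")


def resolve_env_placeholders(value: str) -> str:
    def substitute(match: re.Match) -> str:
        name, default = match.group(1), match.group(2)
        resolved = os.environ.get(name, default)
        if resolved is None:
            raise ValueError(f"{name} is not set and has no default")
        return resolved

    return _PLACEHOLDER.sub(substitute, value)


# resolve_env_placeholders("${env.VLLM_MAX_TOKENS:4096}") returns "4096"
# when VLLM_MAX_TOKENS is unset; resolve_env_placeholders("${env.VLLM_URL}")
# raises unless VLLM_URL is set.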

docker-compose.yaml (new file)

@@ -0,0 +1,26 @@
+services:
+  ${SERVICE_NAME:-vllm}:
+    image: vllm/vllm-openai:latest
+    ports:
+      - ${VLLM_PORT:-5100}:${VLLM_PORT:-5100}
+    volumes:
+      - $HOME/.cache/huggingface:/root/.cache/huggingface
+    devices:
+      - nvidia.com/gpu=all
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: [gpu]
+    runtime: nvidia
+    environment:
+      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
+      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
+    command: >
+      --gpu-memory-utilization 0.75
+      --model ${VLLM_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      --enforce-eager
+      --max-model-len 8192
+      --max-num-seqs 16
+      --port ${VLLM_PORT:-5100}
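
Every knob in the compose file uses shell-style ${VAR:-default} substitution, so the service name, port, model, and GPU selection can all be overridden from the launching environment. A minimal launch sketch under those assumptions (the file above saved as docker-compose.yaml in the working directory, Docker Compose V2 CLI available):

import os
import subprocess

# Minimal sketch: start the vLLM service defined in docker-compose.yaml,
# overriding the compose-file defaults via the environment.
env = {
    **os.environ,
    "VLLM_PORT": "5100",  # compose default; change if the port is taken
    "VLLM_MODEL": "meta-llama/Llama-3.2-3B-Instruct",
    "HF_TOKEN": os.environ["HF_TOKEN"],  # fails fast if unset; forwarded as HUGGING_FACE_HUB_TOKEN
}
subprocess.run(["docker", "compose", "up", "-d"], env=env, check=True)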