more progress on auto-generation

2025-12-17 17:29:50 +00:00 · 2024-11-15 09:35:38 -08:00 · 2024-11-15 09:35:38 -08:00 · e4509cb568
commit e4509cb568
parent cfa913fdd5
10 changed files with 309 additions and 73 deletions
--- a/llama_stack/providers/remote/inference/vllm/config.py
+++ b/llama_stack/providers/remote/inference/vllm/config.py
@ -9,6 +9,11 @@ from typing import Optional
 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field

+from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
+
+
+# Default value of the `port` argument of sample_docker_compose_config (port mapping and --port flag).
+DEFAULT_VLLM_PORT = 8000
+

@json_schema_type
 class VLLMInferenceAdapterConfig(BaseModel):
@ -26,10 +31,50 @@ class VLLMInferenceAdapterConfig(BaseModel):
    )

    @classmethod
-    def sample_dict(cls):
-        # TODO: we may need two modes, one for conda and one for docker
+    def sample_run_config(
+        cls,
+        url: str = "${env.VLLM_URL:http://host.docker.internal:5100/v1}",
+    ):
+        """Return a sample run-config dict for this adapter.
+
+        Args:
+            url: Value stored unchanged under the ``"url"`` key.
+
+        Returns:
+            A dict with ``"url"``, ``"max_tokens"`` and ``"api_token"`` keys.
+            The last two are fixed ``${env.NAME:default}``-style strings.
+
+        NOTE(review): this method does not expand the ``${env...}``
+        placeholders itself; presumably whatever loads the run config
+        does -- confirm against the loader.
+        """
        return {
-            "url": "${env.VLLM_URL:http://host.docker.internal:5100/v1}",
+            "url": url,
            "max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
            "api_token": "${env.VLLM_API_TOKEN:fake}",
        }
+
+    @classmethod
+    def sample_docker_compose_config(
+        cls,
+        port: int = DEFAULT_VLLM_PORT,
+        cuda_visible_devices: str = "0",
+        model: str = "meta-llama/Llama-3.2-3B-Instruct",
+    ) -> Optional[DockerComposeServiceConfig]:
+        """Build a sample docker-compose service definition for a vLLM server.
+
+        Args:
+            port: Port published as ``port:port`` and passed to ``--port``.
+            cuda_visible_devices: Value placed in the service's
+                ``CUDA_VISIBLE_DEVICES`` environment entry.
+            model: Model identifier passed to ``--model``.
+
+        Returns:
+            A ``DockerComposeServiceConfig`` for the
+            ``vllm/vllm-openai:latest`` image. This implementation always
+            returns a config and never ``None``, despite the ``Optional``
+            return annotation.
+        """
+        return DockerComposeServiceConfig(
+            image="vllm/vllm-openai:latest",
+            # NOTE(review): presumably shares the host Hugging Face cache with
+            # the container so model downloads are reused -- confirm.
+            volumes=["$HOME/.cache/huggingface:/root/.cache/huggingface"],
+            devices=["nvidia.com/gpu=all"],
+            deploy={
+                "resources": {
+                    "reservations": {
+                        "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
+                    }
+                }
+            },
+            runtime="nvidia",
+            # The same port number is used on both sides of the mapping and in
+            # the --port flag below.
+            ports=[f"{port}:{port}"],
+            environment={
+                "CUDA_VISIBLE_DEVICES": cuda_visible_devices,
+                # NOTE(review): "$HF_TOKEN" is a literal string here; presumably
+                # substituted from the host environment by docker compose -- confirm.
+                "HUGGING_FACE_HUB_TOKEN": "$HF_TOKEN",
+            },
+            # The server flags are joined into one space-separated command string.
+            command=(
+                " ".join(
+                    [
+                        "--gpu-memory-utilization 0.75",
+                        f"--model {model}",
+                        "--enforce-eager",
+                        "--max-model-len 8192",
+                        "--max-num-seqs 16",
+                        f"--port {port}",
+                    ]
+                )
+            ),
+        )