Adding docker-compose.yaml, starting to simplify

2025-12-17 09:22:36 +00:00 · 2024-11-16 10:56:38 -08:00 · 2024-11-16 10:56:38 -08:00 · f38e76ee98
commit f38e76ee98
parent e4509cb568
14 changed files with 516 additions and 386 deletions
--- a/llama_stack/providers/remote/inference/ollama/init.py
+++ b/llama_stack/providers/remote/inference/ollama/init.py
@ -4,37 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Optional
-
-from llama_stack.distribution.datatypes import RemoteProviderConfig
-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
+from .config import OllamaImplConfig


-DEFAULT_OLLAMA_PORT = 11434
-
-
-class OllamaImplConfig(RemoteProviderConfig):
-    port: int = DEFAULT_OLLAMA_PORT
-
-    @classmethod
-    def sample_docker_compose_config(cls) -> Optional[DockerComposeServiceConfig]:
-        return DockerComposeServiceConfig(
-            image="ollama/ollama:latest",
-            volumes=["$HOME/.ollama:/root/.ollama"],
-            devices=["nvidia.com/gpu=all"],
-            deploy={
-                "resources": {
-                    "reservations": {
-                        "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
-                    }
-                }
-            },
-            runtime="nvidia",
-            ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"],
-        )
-
-
-async def get_adapter_impl(config: RemoteProviderConfig, _deps):
+async def get_adapter_impl(config: OllamaImplConfig, _deps):
    from .ollama import OllamaInferenceAdapter

    impl = OllamaInferenceAdapter(config.url)
--- a/llama_stack/providers/remote/inference/ollama/config.py
+++ b/llama_stack/providers/remote/inference/ollama/config.py
@ -0,0 +1,65 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import List
+
+from llama_stack.distribution.datatypes import RemoteProviderConfig
+from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
+
+
+DEFAULT_OLLAMA_PORT = 11434
+
+
+class OllamaImplConfig(RemoteProviderConfig):
+    port: int = DEFAULT_OLLAMA_PORT
+
+    @classmethod
+    def sample_docker_compose_services(cls) -> List[DockerComposeServiceConfig]:
+        return [
+            DockerComposeServiceConfig(
+                service_name="ollama",
+                image="ollama/ollama:latest",
+                volumes=["$HOME/.ollama:/root/.ollama"],
+                devices=["nvidia.com/gpu=all"],
+                deploy={
+                    "resources": {
+                        "reservations": {
+                            "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
+                        }
+                    }
+                },
+                runtime="nvidia",
+                ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"],
+                healthcheck={
+                    "test": ["CMD", "curl", "-f", "http://ollama:11434"],
+                    "interval": "10s",
+                    "timeout": "5s",
+                    "retries": 5,
+                },
+            ),
+            DockerComposeServiceConfig(
+                service_name="ollama-init",
+                image="ollama/ollama",
+                depends_on={"ollama": {"condition": "service_healthy"}},
+                environment={
+                    "OLLAMA_HOST": "ollama",
+                    "OLLAMA_MODELS": "${OLLAMA_MODELS}",
+                },
+                volumes=["ollama_data:/root/.ollama"],
+                entrypoint=(
+                    'sh -c \'max_attempts=30;attempt=0;echo "Waiting for Ollama server...";'
+                    "until curl -s http://ollama:11434 > /dev/null; do"
+                    "attempt=$((attempt + 1));"
+                    "if [ $attempt -ge $max_attempts ]; then"
+                    'echo "Timeout waiting for Ollama server";'
+                    "exit 1;"
+                    "fi;"
+                    'echo "Attempt $attempt: Server not ready yet...";'
+                    "sleep 5;"
+                    "done'"
+                ),
+            ),
+        ]
--- a/llama_stack/providers/remote/inference/ollama/docker_compose.yaml
+++ b/llama_stack/providers/remote/inference/ollama/docker_compose.yaml
@ -0,0 +1,55 @@
+services:
+  ${SERVICE_NAME:-ollama}:  # templated service key; default value is "ollama"
+    image: ollama/ollama:latest
+    ports:
+      - ${OLLAMA_PORT:-11434}:${OLLAMA_PORT:-11434}  # same port number on both sides of the mapping
+    volumes:
+      - $HOME/.ollama:/root/.ollama  # host's $HOME/.ollama mounted at /root/.ollama
+    devices:
+      - nvidia.com/gpu=all
+    runtime: nvidia
+    healthcheck:  # NOTE(review): URL hard-codes "ollama:11434" although name and port above are templated
+      test: ["CMD", "curl", "-f", "http://ollama:11434"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+
+  ${SERVICE_NAME:-ollama}-init:  # helper container: waits for the server, then pulls each model
+    image: ollama/ollama
+    depends_on:  # mapping (long) form: required to attach a condition to a dependency
+      ${SERVICE_NAME:-ollama}:
+        condition: service_healthy
+    environment:
+      - OLLAMA_HOST=ollama
+      - OLLAMA_MODELS=${OLLAMA_MODELS}
+    volumes:
+      - $HOME/.ollama:/root/.ollama
+    entrypoint: >
+      sh -c '
+        max_attempts=30;
+        attempt=0;
+
+        echo "Waiting for Ollama server...";
+        until curl -s http://ollama:11434 > /dev/null; do
+          attempt=$((attempt + 1));
+          if [ $attempt -ge $max_attempts ]; then
+            echo "Timeout waiting for Ollama server";
+            exit 1;
+          fi;
+          echo "Attempt $attempt: Server not ready yet...";
+          sleep 5;
+        done;
+
+        echo "Server ready. Pulling models...";
+
+        models="${OLLAMA_MODELS}";
+        for model in $models; do
+          echo "Pulling $model...";
+          if ! ollama pull "$model"; then
+            echo "Failed to pull $model";
+            exit 1;
+          fi;
+        done;
+
+        echo "All models pulled successfully"
+      '
--- a/llama_stack/providers/remote/inference/tgi/docker_compose.yaml
+++ b/llama_stack/providers/remote/inference/tgi/docker_compose.yaml
@ -0,0 +1,35 @@
+services:
+  ${SERVICE_NAME:-tgi}:  # templated service key; default value is "tgi"
+    image: ghcr.io/huggingface/text-generation-inference:2.3.1
+    # Default (bridge) networking: host mode cannot be combined with the `ports` mapping below.
+    volumes:
+      - $HOME/.cache/huggingface:/data  # mounted at /data, which every HF_* cache variable below points to
+    ports:
+      - ${TGI_PORT:-8000}:${TGI_PORT:-8000}  # same TGI_PORT value as the --port flag in `command`
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
+      - HF_HOME=/data
+      - HF_DATASETS_CACHE=/data
+      - HF_MODULES_CACHE=/data
+      - HF_HUB_CACHE=/data
+    command: >
+      --dtype bfloat16
+      --usage-stats off
+      --sharded false
+      --model-id ${TGI_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      --port ${TGI_PORT:-8000}
+      --cuda-memory-fraction ${TGI_CUDA_MEMORY_FRACTION:-0.8}
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            capabilities: [gpu]
+    runtime: nvidia
+    healthcheck:  # probes /health via the service name, so the name must resolve on the compose network
+      test: ["CMD", "curl", "-f", "http://${SERVICE_NAME:-tgi}:${TGI_PORT:-8000}/health"]
+      interval: 5s
+      timeout: 5s
+      retries: 30
--- a/llama_stack/providers/remote/inference/vllm/config.py
+++ b/llama_stack/providers/remote/inference/vllm/config.py
@ -9,11 +9,6 @@ from typing import Optional
 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field

-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
-
-
-DEFAULT_VLLM_PORT = 8000
-

@json_schema_type
 class VLLMInferenceAdapterConfig(BaseModel):
@ -33,48 +28,10 @@ class VLLMInferenceAdapterConfig(BaseModel):
    @classmethod
    def sample_run_config(
        cls,
-        url: str = "${env.VLLM_URL:http://host.docker.internal:5100/v1}",
+        url: str = "${env.VLLM_URL}",
    ):
        return {
            "url": url,
            "max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
            "api_token": "${env.VLLM_API_TOKEN:fake}",
        }
-
-    @classmethod
-    def sample_docker_compose_config(
-        cls,
-        port: int = DEFAULT_VLLM_PORT,
-        cuda_visible_devices: str = "0",
-        model: str = "meta-llama/Llama-3.2-3B-Instruct",
-    ) -> Optional[DockerComposeServiceConfig]:
-        return DockerComposeServiceConfig(
-            image="vllm/vllm-openai:latest",
-            volumes=["$HOME/.cache/huggingface:/root/.cache/huggingface"],
-            devices=["nvidia.com/gpu=all"],
-            deploy={
-                "resources": {
-                    "reservations": {
-                        "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
-                    }
-                }
-            },
-            runtime="nvidia",
-            ports=[f"{port}:{port}"],
-            environment={
-                "CUDA_VISIBLE_DEVICES": cuda_visible_devices,
-                "HUGGING_FACE_HUB_TOKEN": "$HF_TOKEN",
-            },
-            command=(
-                " ".join(
-                    [
-                        "--gpu-memory-utilization 0.75",
-                        f"--model {model}",
-                        "--enforce-eager",
-                        "--max-model-len 8192",
-                        "--max-num-seqs 16",
-                        f"--port {port}",
-                    ]
-                )
-            ),
-        )
--- a/llama_stack/providers/remote/inference/vllm/docker_compose.yaml
+++ b/llama_stack/providers/remote/inference/vllm/docker_compose.yaml
@ -0,0 +1,26 @@
+services:
+  ${SERVICE_NAME:-vllm}:  # templated service key; default value is "vllm"
+    image: vllm/vllm-openai:latest
+    ports:
+      - ${VLLM_PORT:-5100}:${VLLM_PORT:-5100}  # same VLLM_PORT value as the --port flag in `command`
+    volumes:
+      - $HOME/.cache/huggingface:/root/.cache/huggingface  # host Hugging Face cache directory mounted into the container
+    devices:
+      - nvidia.com/gpu=all
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: [gpu]
+    runtime: nvidia
+    environment:
+      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
+      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}  # no default given: value comes from HF_TOKEN as-is
+    command: >
+      --gpu-memory-utilization 0.75
+      --model ${VLLM_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      --enforce-eager
+      --max-model-len 8192
+      --max-num-seqs 16
+      --port ${VLLM_PORT:-5100}
--- a/llama_stack/providers/utils/docker/init.py
+++ b/llama_stack/providers/utils/docker/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/llama_stack/providers/utils/docker/service_config.py
+++ b/llama_stack/providers/utils/docker/service_config.py
@ -1,29 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Dict, List, Optional
-
-from pydantic import BaseModel
-
-
-class LiteralString(str):
-    pass  # Marker class for strings we want to format with >
-
-
-class DockerComposeServiceConfig(BaseModel):
-    """Configuration for a single service in docker-compose."""
-
-    image: str
-    volumes: Optional[List[str]] = None
-    network_mode: str = "bridge"
-    ports: Optional[List[str]] = None
-    devices: Optional[List[str]] = None
-    environment: Optional[Dict[str, str]] = None
-    command: Optional[str] = None
-    depends_on: Optional[List[str]] = None
-    deploy: Optional[Dict[str, Any]] = None
-    runtime: Optional[str] = None
-    entrypoint: Optional[str] = None
--- a/llama_stack/providers/utils/kvstore/config.py
+++ b/llama_stack/providers/utils/kvstore/config.py
@ -54,11 +54,11 @@ class SqliteKVStoreConfig(CommonConfig):
    )

    @classmethod
-    def sample_run_config(cls, db_name: str = "kvstore.db"):
+    def sample_run_config(cls, dir: str = "runtime", db_name: str = "kvstore.db"):
        return {
            "type": "sqlite",
            "namespace": None,
-            "db_path": "${env.SQLITE_STORE_DIR:~/.llama/runtime/" + db_name + "}",
+            "db_path": "${env.SQLITE_STORE_DIR:~/.llama/" + f"{dir}/{db_name}" + "}",
        }