Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-12-17 09:12:37 +00:00
Adding docker-compose.yaml, starting to simplify
This commit is contained in:
parent e4509cb568
commit f38e76ee98
14 changed files with 516 additions and 386 deletions
@@ -9,25 +9,30 @@
 # Similarly change "host.docker.internal" to "localhost" in the run.yaml file
 #
 services:
-  vllm-0:
+  vllm-inference:
     image: vllm/vllm-openai:latest
     volumes:
       - $HOME/.cache/huggingface:/root/.cache/huggingface
-    # network_mode: "host"
+    network_mode: ${NETWORK_MODE:-bridged}
     ports:
-      - "5100:5100"
+      - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=0
+      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
       - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
     command: >
       --gpu-memory-utilization 0.75
-      --model meta-llama/Llama-3.1-8B-Instruct
+      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
       --enforce-eager
       --max-model-len 8192
       --max-num-seqs 16
-      --port 5100
+      --port ${VLLM_INFERENCE_PORT:-5100}
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
     deploy:
       resources:
         reservations:
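The healthcheck added above can also be exercised by hand against a running container. The probe below is illustrative and assumes the default port, i.e. VLLM_INFERENCE_PORT unset; compose marks the service healthy once it succeeds, which the llamastack service's service_healthy conditions (further down in this diff) rely on.

$ curl -f http://localhost:5100/v1/health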
@@ -35,25 +40,34 @@ services:
           - driver: nvidia
             capabilities: [gpu]
     runtime: nvidia
-  vllm-1:
+
+  # A little trick:
+  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
+  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
+  vllm-${VLLM_SAFETY_MODEL:+safety}:
     image: vllm/vllm-openai:latest
     volumes:
       - $HOME/.cache/huggingface:/root/.cache/huggingface
-    # network_mode: "host"
+    network_mode: ${NETWORK_MODE:-bridged}
     ports:
-      - "5101:5101"
+      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=1
+      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
       - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
     command: >
       --gpu-memory-utilization 0.75
-      --model meta-llama/Llama-Guard-3-1B
+      --model ${VLLM_SAFETY_MODEL}
       --enforce-eager
       --max-model-len 8192
       --max-num-seqs 16
-      --port 5101
+      --port ${VLLM_SAFETY_PORT:-5101}
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
     deploy:
       resources:
         reservations:
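The ${VLLM_SAFETY_MODEL:+safety} service name relies on standard shell-style interpolation: the word after ":+" is substituted only when the variable is set and non-empty. A quick illustration of the trick described in the comment above (values are examples only):

$ export VLLM_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
$ echo "vllm-${VLLM_SAFETY_MODEL:+safety}:"
vllm-safety:
$ unset VLLM_SAFETY_MODEL
$ echo "vllm-${VLLM_SAFETY_MODEL:+safety}:"
vllm-:

With the variable unset the entry degenerates to "vllm-", ending in a hyphen, which the comment notes docker compose ignores.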
@@ -63,23 +77,25 @@ services:
     runtime: nvidia
   llamastack:
     depends_on:
-      - vllm-0
-      - vllm-1
-    # image: llamastack/distribution-remote-vllm
+      - vllm-inference:
+          condition: service_healthy
+      - vllm-${VLLM_SAFETY_MODEL:+safety}:
+          condition: service_healthy
+    # image: llamastack/distribution-remote-vllm
     image: llamastack/distribution-remote-vllm:test-0.0.52rc3
     volumes:
       - ~/.llama:/root/.llama
-      - ~/local/llama-stack/distributions/remote-vllm/run.yaml:/root/llamastack-run-remote-vllm.yaml
-    # network_mode: "host"
+      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
+    network_mode: ${NETWORK_MODE:-bridged}
     environment:
-      - LLAMA_INFERENCE_VLLM_URL=${LLAMA_INFERENCE_VLLM_URL:-http://host.docker.internal:5100/v1}
-      - LLAMA_INFERENCE_MODEL=${LLAMA_INFERENCE_MODEL:-Llama3.1-8B-Instruct}
+      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
+      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
+      - MAX_TOKENS=${MAX_TOKENS:-4096}
+      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
-      - LLAMA_SAFETY_VLLM_URL=${LLAMA_SAFETY_VLLM_URL:-http://host.docker.internal:5101/v1}
-      - LLAMA_SAFETY_MODEL=${LLAMA_SAFETY_MODEL:-Llama-Guard-3-1B}
+      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
     ports:
-      - "5001:5001"
+      - "${LLAMASTACK_PORT:-5001}:${LLAMASTACK_PORT:-5001}"
     # Hack: wait for vLLM server to start before starting docker
     entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
     deploy:
@@ -89,6 +105,6 @@ services:
         max_attempts: 5
         window: 60s
 volumes:
-  vllm-0:
-  vllm-1:
+  vllm-inference:
+  vllm-safety:
   llamastack:
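Taken together, the compose changes make the whole stack configurable through environment variables rather than edits to the file. A hypothetical invocation, assuming the compose file and the run*.yaml files sit in the current directory and a HuggingFace token is available (values shown are examples, not defaults beyond those in the file):

$ export HF_TOKEN=...                                      # token passed to the vLLM containers
$ export VLLM_INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
$ export VLLM_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B     # leave unset to skip the safety service
$ docker compose up -d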
68  distributions/remote-vllm/run-with-safety.yaml  Normal file
@@ -0,0 +1,68 @@
+version: '2'
+built_at: '2024-11-11T20:09:45.988375'
+image_name: remote-vllm
+docker_image: remote-vllm
+conda_env: null
+apis:
+- inference
+- memory
+- safety
+- agents
+- telemetry
+providers:
+  inference:
+  # serves main inference model
+  - provider_id: vllm-inference
+    provider_type: remote::vllm
+    config:
+      # NOTE: replace with "localhost" if you are running in "host" network mode
+      url: ${env.VLLM_URL}
+      max_tokens: ${env.MAX_TOKENS:4096}
+      api_token: fake
+  # serves safety llama_guard model
+  - provider_id: vllm-safety
+    provider_type: remote::vllm
+    config:
+      # NOTE: replace with "localhost" if you are running in "host" network mode
+      url: ${env.SAFETY_VLLM_URL}
+      max_tokens: ${env.MAX_TOKENS:4096}
+      api_token: fake
+  memory:
+  - provider_id: faiss-0
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        namespace: null
+        type: sqlite
+        db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config: {}
+  memory:
+  - provider_id: meta0
+    provider_type: inline::faiss
+    config: {}
+  agents:
+  - provider_id: meta0
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        namespace: null
+        type: sqlite
+        db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db"
+  telemetry:
+  - provider_id: meta0
+    provider_type: inline::meta-reference
+    config: {}
+metadata_store:
+  namespace: null
+  type: sqlite
+  db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
+models:
+- model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm-inference
+- model_id: ${env.SAFETY_MODEL}
+  provider_id: vllm-safety
+shields:
+- shield_id: ${env.SAFETY_MODEL}
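The new run-with-safety.yaml resolves its ${env.VAR} placeholders from the server's environment; where a placeholder carries a ":default" suffix, as in ${env.MAX_TOKENS:4096}, the value after the colon is the fallback. A sketch of the variables this config reads and a matching launch command; the file path and all values here are illustrative, and in the compose setup above the llamastack service injects these settings and uses the mounted /root/llamastack-run-remote-vllm.yaml path instead:

$ export VLLM_URL=http://vllm-inference:5100/v1
$ export SAFETY_VLLM_URL=http://vllm-safety:5101/v1
$ export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
$ export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
$ export MAX_TOKENS=4096                                          # optional, defaults to 4096
$ export SQLITE_STORE_DIR=$HOME/.llama/distributions/remote-vllm  # optional
$ python -m llama_stack.distribution.server.server \
    --yaml_config run-with-safety.yaml --port 5001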
@@ -6,39 +6,25 @@ conda_env: null
 apis:
 - inference
 - memory
 - safety
 - agents
 - telemetry
 providers:
   inference:
   # serves main inference model
-  - provider_id: vllm-0
+  - provider_id: vllm-inference
     provider_type: remote::vllm
     config:
       # NOTE: replace with "localhost" if you are running in "host" network mode
-      url: ${env.VLLM_URL:http://host.docker.internal:5100/v1}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
-  # serves safety llama_guard model
-  - provider_id: vllm-1
-    provider_type: remote::vllm
-    config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
-      url: ${env.SAFETY_VLLM_URL:http://host.docker.internal:5101/v1}
+      url: ${env.VLLM_URL}
       max_tokens: ${env.MAX_TOKENS:4096}
       api_token: fake
   memory:
-  - provider_id: faiss-0
+  - provider_id: faiss
     provider_type: inline::faiss
     config:
       kvstore:
         namespace: null
         type: sqlite
         db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
     config: {}
   memory:
   - provider_id: meta0
     provider_type: inline::faiss
@@ -60,9 +46,5 @@ metadata_store:
   type: sqlite
   db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
 models:
-- model_id: ${env.INFERENCE_MODEL:Llama3.1-8B-Instruct}
-  provider_id: vllm-0
-- model_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
-  provider_id: vllm-1
-shields:
-  - shield_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
+- model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm-inference