Auto-generate distro yamls + docs (#468)
# What does this PR do?

Automatically generates, for each distribution:
- build.yaml
- run.yaml
- run-with-safety.yaml
- parts of the markdown docs

## Test Plan

At this point, this only updates the YAMLs and the docs. Some testing (especially with ollama and vllm) has been done, but more thorough testing is still needed.
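For orientation, the generated `build.yaml` / `run.yaml` pairs are what the `llama stack` CLI consumes. A rough usage sketch for the remote-vllm distribution touched below; the command flags and paths are assumptions about the CLI, not something this diff itself adds:

```bash
# Sketch only: flags and paths are assumptions, not part of this PR.
llama stack build --template remote-vllm --image-type docker    # consumes the generated build.yaml
llama stack run distributions/remote-vllm/run.yaml --port 5001  # starts a server from the generated run.yaml
```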
parent 0784284ab5
commit 2a31163178
88 changed files with 3008 additions and 852 deletions
```diff
@@ -1,33 +1,28 @@
-# NOTES:
-#
-# This Docker Compose (and the associated run.yaml) assumes you will be
-# running in the default "bridged" network mode.
-#
-# If you need "host" network mode, please uncomment
-# - network_mode: "host"
-#
-# Similarly change "host.docker.internal" to "localhost" in the run.yaml file
-#
 services:
-  vllm-0:
+  vllm-inference:
     image: vllm/vllm-openai:latest
     volumes:
       - $HOME/.cache/huggingface:/root/.cache/huggingface
-    # network_mode: "host"
+    network_mode: ${NETWORK_MODE:-bridged}
     ports:
-      - "5100:5100"
+      - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=0
+      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
     command: >
       --gpu-memory-utilization 0.75
-      --model meta-llama/Llama-3.1-8B-Instruct
+      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
       --enforce-eager
       --max-model-len 8192
       --max-num-seqs 16
-      --port 5100
+      --port ${VLLM_INFERENCE_PORT:-5100}
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
     deploy:
       resources:
         reservations:
```
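The new healthcheck is what the llamastack service keys off of further down. A quick manual sanity check of the inference container can hit the same endpoint the healthcheck uses; a sketch assuming the default port mapping from this compose file:

```bash
# Mirrors the compose healthcheck for the vllm-inference service (default port 5100).
curl -f "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health" && echo "vLLM inference server is up"
```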
```diff
@@ -35,25 +30,34 @@ services:
       - driver: nvidia
         capabilities: [gpu]
     runtime: nvidia
-  vllm-1:
+
+  # A little trick:
+  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
+  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
+  vllm-${VLLM_SAFETY_MODEL:+safety}:
     image: vllm/vllm-openai:latest
     volumes:
       - $HOME/.cache/huggingface:/root/.cache/huggingface
-    # network_mode: "host"
+    network_mode: ${NETWORK_MODE:-bridged}
     ports:
-      - "5101:5101"
+      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=1
+      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
       - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
     command: >
       --gpu-memory-utilization 0.75
-      --model meta-llama/Llama-Guard-3-1B
+      --model ${VLLM_SAFETY_MODEL}
       --enforce-eager
       --max-model-len 8192
       --max-num-seqs 16
-      --port 5101
+      --port ${VLLM_SAFETY_PORT:-5101}
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
     deploy:
       resources:
         reservations:
```
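The conditional service name relies on `${VAR:+word}` parameter expansion (expands to `word` only when the variable is set and non-empty). A minimal shell illustration of the behavior the comment describes; compose performs its own substitution, but the expansion rule is the same:

```bash
# When the variable is set, the expansion yields "safety"; when unset it yields nothing,
# so the service name ends in a hyphen and the entry is effectively skipped.
VLLM_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
echo "vllm-${VLLM_SAFETY_MODEL:+safety}:"   # -> vllm-safety:
unset VLLM_SAFETY_MODEL
echo "vllm-${VLLM_SAFETY_MODEL:+safety}:"   # -> vllm-:
```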
```diff
@@ -63,23 +67,25 @@ services:
     runtime: nvidia
   llamastack:
     depends_on:
-      - vllm-0
-      - vllm-1
-    # image: llamastack/distribution-remote-vllm
+      - vllm-inference:
+          condition: service_healthy
+      - vllm-${VLLM_SAFETY_MODEL:+safety}:
+          condition: service_healthy
+    # image: llamastack/distribution-remote-vllm
     image: llamastack/distribution-remote-vllm:test-0.0.52rc3
     volumes:
       - ~/.llama:/root/.llama
-      - ~/local/llama-stack/distributions/remote-vllm/run.yaml:/root/llamastack-run-remote-vllm.yaml
-    # network_mode: "host"
+      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
+    network_mode: ${NETWORK_MODE:-bridged}
     environment:
-      - LLAMA_INFERENCE_VLLM_URL=${LLAMA_INFERENCE_VLLM_URL:-http://host.docker.internal:5100/v1}
-      - LLAMA_INFERENCE_MODEL=${LLAMA_INFERENCE_MODEL:-Llama3.1-8B-Instruct}
+      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
+      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
       - MAX_TOKENS=${MAX_TOKENS:-4096}
       - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
-      - LLAMA_SAFETY_VLLM_URL=${LLAMA_SAFETY_VLLM_URL:-http://host.docker.internal:5101/v1}
-      - LLAMA_SAFETY_MODEL=${LLAMA_SAFETY_MODEL:-Llama-Guard-3-1B}
+      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
     ports:
-      - "5001:5001"
+      - "${LLAMASTACK_PORT:-5001}:${LLAMASTACK_PORT:-5001}"
     # Hack: wait for vLLM server to start before starting docker
     entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
     deploy:
```
```diff
@@ -89,6 +95,6 @@ services:
         max_attempts: 5
         window: 60s
 volumes:
-  vllm-0:
-  vllm-1:
+  vllm-inference:
+  vllm-safety:
   llamastack:
```
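Putting the pieces together, every knob introduced above is a plain environment variable with a default. A sketch of a typical invocation, assuming the compose file is launched from its own directory so the relative `./run*.yaml` mount resolves; variable names are the ones defined in the diff, values are illustrative:

```bash
# Optional overrides; each variable falls back to the default baked into the compose file.
export VLLM_INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export VLLM_INFERENCE_PORT=5100
export VLLM_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B   # leave unset to skip the safety service
export VLLM_SAFETY_PORT=5101
export LLAMASTACK_PORT=5001
export HF_TOKEN=<your-hf-token>                        # needed to pull gated Llama weights
docker compose up
```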
distributions/remote-vllm/run-with-safety.yaml (symbolic link, 1 addition)

```diff
@@ -0,0 +1 @@
+../../llama_stack/templates/remote-vllm/run-with-safety.yaml
```
```diff
@@ -1,68 +0,0 @@
-version: '2'
-built_at: '2024-11-11T20:09:45.988375'
-image_name: remote-vllm
-docker_image: remote-vllm
-conda_env: null
-apis:
-- inference
-- memory
-- safety
-- agents
-- telemetry
-providers:
-  inference:
-  # serves main inference model
-  - provider_id: vllm-0
-    provider_type: remote::vllm
-    config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
-      url: ${env.LLAMA_INFERENCE_VLLM_URL:http://host.docker.internal:5100/v1}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
-  # serves safety llama_guard model
-  - provider_id: vllm-1
-    provider_type: remote::vllm
-    config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
-      url: ${env.LLAMA_SAFETY_VLLM_URL:http://host.docker.internal:5101/v1}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
-  memory:
-  - provider_id: faiss-0
-    provider_type: inline::faiss
-    config:
-      kvstore:
-        namespace: null
-        type: sqlite
-        db_path: "${env.SQLITE_STORE_DIR:/home/ashwin/.llama/distributions/remote-vllm}/faiss_store.db"
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config: {}
-  memory:
-  - provider_id: meta0
-    provider_type: inline::faiss
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: "${env.SQLITE_STORE_DIR:/home/ashwin/.llama/distributions/remote-vllm}/agents_store.db"
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
-metadata_store:
-  namespace: null
-  type: sqlite
-  db_path: "${env.SQLITE_STORE_DIR:/home/ashwin/.llama/distributions/remote-vllm}/registry.db"
-models:
-- model_id: ${env.LLAMA_INFERENCE_MODEL:Llama3.1-8B-Instruct}
-  provider_id: vllm-0
-- model_id: ${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}
-  provider_id: vllm-1
-shields:
-- shield_id: ${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}
```
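The removed file used llama-stack's `${env.VAR:default}` substitution (for example `${env.MAX_TOKENS:4096}`), so the values above were resolved from the server process environment at startup. A sketch of what overriding one of them looks like, reusing the server entrypoint shown in the compose file; the config path and values here are illustrative:

```bash
# Illustrative only: override MAX_TOKENS before starting the stack server so that
# ${env.MAX_TOKENS:4096} in run.yaml resolves to 2048 instead of the default.
MAX_TOKENS=2048 SQLITE_STORE_DIR=$HOME/.llama/distributions/remote-vllm \
  python -m llama_stack.distribution.server.server \
    --yaml_config ./run.yaml --port 5001
```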
distributions/remote-vllm/run.yaml (symbolic link, 1 addition)

```diff
@@ -0,0 +1 @@
+../../llama_stack/templates/remote-vllm/run.yaml
```
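Both distribution-level YAMLs are now thin symlinks into the auto-generated templates. A quick way to confirm where they point, assuming a checkout of the repository root:

```bash
# Each distribution-level YAML should resolve to its generated template.
readlink distributions/remote-vllm/run.yaml               # ../../llama_stack/templates/remote-vllm/run.yaml
readlink distributions/remote-vllm/run-with-safety.yaml   # ../../llama_stack/templates/remote-vllm/run-with-safety.yaml
```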